In [66]:
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [67]:
# Load the training data and the test features
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_features.csv')

# Keep the test names (needed for the submission files) and the training target
ids = df_test['name']
train_Y = df_train['poi']

# Remove the identifier and the target from the training frame, then stack
# train on top of test so both go through the same preprocessing.
df_train = df_train.drop(columns=['name', 'poi'])
df = pd.concat((df_train, df_test), axis=0)
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value,name
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,...,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0,
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,...,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0,
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,...,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0,
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,...,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0,
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,...,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0,


In [68]:
# Check which columns contain missing values
def na_check(df_data):
    """Display the columns of ``df_data`` with the largest share of missing values.

    The percentage of null entries is computed per column, columns without
    any missing value are left out, and the (at most) ten columns with the
    highest percentage are shown in a one-column table named
    'Missing Ratio', sorted in descending order.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    missing_pct = df_data.isnull().sum() / len(df_data) * 100
    # Columns with no missing value are not interesting here
    complete_cols = missing_pct.index[missing_pct == 0]
    missing_pct = missing_pct.drop(complete_cols).sort_values(ascending=False)
    display(pd.DataFrame({'Missing Ratio': missing_pct}).head(10))
# Show the missing-value ratios of the combined train + test frame
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
name,77.39726
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_this_person_to_poi,41.09589
from_messages,41.09589


In [69]:
# These features are missing in too many rows, so they are not used
drop_features = ['loan_advances','director_fees','restricted_stock_deferred']
df = df.drop(columns=drop_features)
na_check(df)

Unnamed: 0,Missing Ratio
name,77.39726
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
shared_receipt_with_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589
to_messages,41.09589
from_this_person_to_poi,41.09589


In [70]:
# Fill missing values with 0 and apply a signed log transform to reduce skew.
# (Despite its name, `mean_features` is not filled with the column mean.)
mean_features = ['deferral_payments','deferred_income','long_term_incentive','bonus','from_messages','from_poi_to_this_person','from_this_person_to_poi','shared_receipt_with_poi','to_messages','other','salary','expenses','exercised_stock_options','restricted_stock','total_payments','total_stock_value']
for col in mean_features:
    filled = df[col].fillna(0)
    # sign(x) * log1p(|x|) instead of "log(x) if x > 0 else 0":
    #  * negative amounts (e.g. deferred_income, which is negative in the
    #    rows shown above) keep their magnitude instead of collapsing to 0;
    #  * a value of 1 (log(1) == 0) stays distinguishable from missing / 0.
    df[col] = np.sign(filled) * np.log1p(filled.abs())
na_check(df)

Unnamed: 0,Missing Ratio
name,77.39726
email_address,23.972603


In [71]:
# Discard the name and e-mail columns
df = df.drop(columns=["name", "email_address"])

In [72]:
# Min-max scale every feature
df = MinMaxScaler().fit_transform(df)

# Split the transformed data back into train_X and test_X
# (the training rows come first in the concatenated frame)
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

# Three models: logistic regression / gradient boosting / random forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Fixed seed so that re-running the notebook reproduces the same submissions
SEED = 42

lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
# NOTE(review): per the scikit-learn docs, `tol` is only used for early
# stopping when `n_iter_no_change` is set, so tol=100 should have no effect
# here; kept unchanged.
gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250, max_features=10,
                                  max_depth=6, learning_rate=0.03, random_state=SEED)
# `rf` is used by the random forest and blending cells below but was never
# defined in the notebook, so a fresh "Restart & Run All" failed with a
# NameError. The hyperparameters below are placeholders.
# TODO: restore the values found by the random search.
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)

In [73]:
# Logistic regression submission file
# Fit on the training split, then keep the second column of predict_proba
# (the second entry of lr.classes_, presumably poi == True).
lr_pred = lr.fit(train_X, train_Y).predict_proba(test_X)[:, 1]
sub = pd.DataFrame({'name': ids}).assign(poi=lr_pred)
sub.to_csv('enron_lr.csv', index=False)

In [74]:
# Gradient boosting submission file
# Fit on the training split, then keep the second column of predict_proba
# (the second entry of gdbt.classes_, presumably poi == True).
gdbt_pred = gdbt.fit(train_X, train_Y).predict_proba(test_X)[:, 1]
sub = pd.DataFrame({'name': ids}).assign(poi=gdbt_pred)
sub.to_csv('enron_gdbt.csv', index=False)

In [76]:
# Random forest submission file
# Fit on the training split, then keep the second column of predict_proba
# (the second entry of rf.classes_, presumably poi == True).
rf_pred = rf.fit(train_X, train_Y).predict_proba(test_X)[:, 1]
sub = pd.DataFrame({'name': ids}).assign(poi=rf_pred)
sub.to_csv('enron_rf.csv', index=False)

In [77]:
# Blending: weighted average of the three models' predicted probabilities
w_lr, w_gdbt, w_rf = 0.3, 0.3, 0.4
blending_pred = w_lr * lr_pred + w_gdbt * gdbt_pred + w_rf * rf_pred
sub = pd.DataFrame({'name': ids}).assign(poi=blending_pred)
sub.to_csv('enron_blending.csv', index=False)