In [53]:
import pandas as pd

In [54]:
train = pd.read_csv('sf04classification/train.csv')

In [55]:
test = pd.read_csv('sf04classification/test.csv')

In [56]:
y = train['target']

In [57]:
train = train.drop('target', axis=1)

In [58]:
def getDurationClass(d, df):
    durationStd = df.duration.std()
    durationMin = df.duration.min()
    durationMax = df.duration.max()
    return {
        (100 <= d <= 700) : True,
        (d < 100) | (700 < d): False

    }[True]

def isBad(df):
    return ((df['pdays']==999) | (df['pdays']==0)) & (df['previous']>0)
    
def preproc_data(df_input):
    df_output = df_input.copy()
    
    df_output = df_output.drop('_id', axis=1)
    
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jul', 'jun', 'aug', 'sep', 'oct', 'nov','dec']
    df_output['kvartal'] = df_output['month'].apply(lambda m : months.index(m)/3+1)
    
    job = list(set(df_output['job']))
    df_output['job'] = df_output.job.apply(lambda r: job.index(r))

    month = list(set(df_output['month']))
    df_output['month'] = df_output.month.apply(lambda r: month.index(r))   
    
    day_of_week = list(set(df_output['day_of_week']))
    df_output['day_of_week'] = df_output.day_of_week.apply(lambda r: day_of_week.index(r)) 
    df_output['work_day'] = df_output.day_of_week.apply(lambda r: r < 6) 

    df_output['pdays'] = df_output['pdays'].apply(lambda d: 0 if d == 999 else d)
    df_output['duration_class'] = df_output['duration'].apply(lambda d: getDurationClass(d, df_output))

    df_output['duration_interrupted'] = df_output['duration'].apply(lambda d: 0 if d < 30 else d)

    df_output = pd.get_dummies(df_output, columns=[
         'marital',
         'education',
         'default',
         'housing',
         'loan',
         'contact',
         'poutcome'
    ])
    
    return df_output

In [None]:
train_work = preproc_data(train)
test_work = preproc_data(test)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_work, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

classifiers = {
    'tree': DecisionTreeClassifier(max_depth=5), 
    'logistic': LogisticRegression(),
    'forest': RandomForestClassifier(),
    'kn': KNeighborsClassifier(n_neighbors=10)
}

In [39]:
import numpy as np
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

lr = LogisticRegression()
#lr = LogisticRegression(n_jobs=-1, class_weight='balanced', solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, train_work.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature rf (0.434713)
2. feature dtc (0.374431)
3. feature duration_interrupted (0.037456)
4. feature duration (0.030397)
5. feature euribor3m (0.025047)
6. feature nr.employed (0.020137)
7. feature poutcome_success (0.014954)
8. feature pdays (0.014203)
9. feature cons.conf.idx (0.008690)
10. feature cons.price.idx (0.006716)
11. feature emp.var.rate (0.006309)
12. feature month (0.003245)
13. feature age (0.003124)
14. feature kvartal (0.003022)
15. feature duration_class (0.002294)
16. feature previous (0.002100)
17. feature poutcome_nonexistent (0.001533)
18. feature job (0.001481)
19. feature poutcome_failure (0.001256)
20. feature contact_cellular (0.001239)
21. feature campaign (0.001036)
22. feature day_of_week (0.000947)
23. feature contact_telephone (0.000706)
24. feature marital_married (0.000434)
25. feature housing_yes (0.000411)
26. feature education_university.degree (0.000401)
27. feature housing_no (0.000360)
28. feature default_no (0.000346)
29. fe

In [40]:
roc_auc_score(y_test, dtc.predict_proba(X_test)[:,1])

0.7567451684049196

In [41]:
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.9229003775559792

In [42]:
roc_auc_score(y_test, knn.predict_proba(X_test)[:,1])

0.8169720758102501

In [43]:
from sklearn.metrics import log_loss
log_loss(y_test, dtc.predict_proba(X_test)[:,1])

3.0324963551161574

In [44]:
log_loss(y_test, lr.predict_proba(X_test)[:,1])

0.4332052279325089

In [45]:
log_loss(y_test, knn.predict_proba(X_test)[:,1])

1.501744123877543

In [46]:
train_work_tmp = train_work.copy()
train_work_tmp['dtc'] = dtc.predict_proba(train_work)[:,1]


train_work_tmp['rf'] = rf.predict_proba(train_work)[:,1]
train_work = train_work_tmp

test_work_tmp = test_work.copy()
test_work_tmp['dtc'] = dtc.predict_proba(test_work)[:,1]


test_work_tmp['rf'] = rf.predict_proba(test_work)[:,1]
test_work = test_work_tmp

In [52]:
X_train, X_test, y_train, y_test = train_test_split(train_work, y, test_size=0.33, random_state=42)
lr.fit(X_train, y_train)
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.9282150199992524

In [50]:
log_loss(y_test, lr.predict_proba(X_test)[:,1])

0.39177594646622066

In [51]:
result_predict = pd.DataFrame(data={'_id': test['_id'], 'target': lr.predict_proba(test_work)[:,1]})

In [38]:
result_predict.to_csv('submit.csv', index=False)