In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
import numpy as np

In [4]:
train = pd.read_csv('sf04classification/train.csv')

In [5]:
test = pd.read_csv('sf04classification/test.csv')

In [6]:
y = train['target']

In [7]:
train = train.drop('target', axis=1)

In [15]:
def get_duration(d, df):

    return {
        (100 <= d <= 700) : True,
        (d < 100) | (700 < d): False
    }[True]


    
def preproc_data(df_input):
    df_output = df_input.copy()
    
    df_output = df_output.drop('_id', axis=1)
    
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jul', 'jun', 'aug', 'sep', 'oct', 'nov','dec']
    df_output['kvartal'] = df_output['month'].apply(lambda m : months.index(m)/3+1)
    
    job = list(set(df_output['job']))
    df_output['job'] = df_output.job.apply(lambda r: job.index(r))

    month = list(set(df_output['month']))
    df_output['month'] = df_output.month.apply(lambda r: month.index(r))   
    
    day_of_week = list(set(df_output['day_of_week']))
    df_output['day_of_week'] = df_output.day_of_week.apply(lambda r: day_of_week.index(r)) 
    df_output['work_day'] = df_output.day_of_week.apply(lambda r: r < 6) 


    df_output['pdays'] = df_output['pdays'].apply(lambda d: 0 if d == 999 else d)
    df_output['duration_class'] = df_output['duration'].apply(lambda d: get_duration(d, df_output))



    df_output['duration_interrupted'] = df_output['duration'].apply(lambda d: 0 if d < 30 else d)

    df_output = pd.get_dummies(df_output, columns=[

         'marital',
         'education',
         'default',
         'housing',
         'loan',
         'contact',
         'poutcome'
    ])
    
    return df_output

In [16]:
# train_work = preproc_data(train)
# test_work = preproc_data(test)

train_work = preproc_data(train)
test_work = preproc_data(test)

In [32]:
train_work.head(10)

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
0,54,87,1,999,0,1.4,93.444,-36.1,4.963,5228.1,...,0,0,0,0,0,0,0,0,1,0
1,36,291,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,0,0
2,53,182,1,999,0,1.4,93.444,-36.1,4.965,5228.1,...,0,0,0,0,0,0,1,0,0,0
3,34,180,2,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,0,0,0,1
4,29,6,1,999,0,-1.7,94.027,-38.3,0.89,4991.6,...,0,0,0,0,0,1,0,0,0,0
5,56,64,2,999,0,1.4,94.465,-41.8,4.961,5228.1,...,0,0,0,0,0,0,0,1,0,0
6,33,312,2,999,0,-1.8,93.876,-40.0,0.685,5008.7,...,0,1,0,0,0,0,1,0,0,0
7,36,20,1,999,1,-0.1,93.2,-42.0,4.12,5195.8,...,0,0,1,0,0,0,0,0,0,1
8,70,585,1,6,3,-1.1,94.601,-49.5,0.987,4963.6,...,0,0,0,1,0,0,0,1,0,0
9,32,196,2,999,1,-1.8,92.893,-46.2,1.327,5099.1,...,0,1,0,0,0,0,0,1,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_work, y, test_size=0.33, random_state=42)

In [20]:

dtc = DecisionTreeClassifier(max_depth=10)
dtc.fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

lr = LogisticRegression()
#lr = LogisticRegression(n_jobs=-1, class_weight='balanced', solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
roc_auc_score(y_test, dtc.predict_proba(X_test)[:,1])

0.8608413143433891

In [22]:
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.9236176591529289

In [23]:
roc_auc_score(y_test, knn.predict_proba(X_test)[:,1])

0.8141768905835296

In [24]:
from sklearn.metrics import log_loss
log_loss(y_test, dtc.predict_proba(X_test)[:,1])

1.0849611429070682

In [25]:
log_loss(y_test, lr.predict_proba(X_test)[:,1])

0.21986313063311258

In [26]:
log_loss(y_test, knn.predict_proba(X_test)[:,1])

1.504172146448717

In [27]:
train_work_tmp = train_work.copy()
train_work_tmp['dtc'] = dtc.predict_proba(train_work)[:,1]


train_work_tmp['rf'] = rf.predict_proba(train_work)[:,1]
train_work = train_work_tmp

test_work_tmp = test_work.copy()
test_work_tmp['dtc'] = dtc.predict_proba(test_work)[:,1]


test_work_tmp['rf'] = rf.predict_proba(test_work)[:,1]
test_work = test_work_tmp

In [28]:
X_train, X_test, y_train, y_test = train_test_split(train_work, y, test_size=0.33, random_state=42)
lr.fit(X_train, y_train)
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.9395829688609774

In [29]:
result_predict = pd.DataFrame(data={'_id': test['_id'], 'target': lr.predict_proba(test_work)[:,1]})

In [30]:
result_predict_test = pd.DataFrame(data={'_id': test['_id'], 'target': lr.predict(test_work)})

In [31]:
result_predict_test[result_predict_test['target'] == 1]['target'].agg('count')

1495

In [32]:
result_predict.to_csv('submit_lr.csv', index=False)