In [1]:
import pandas as pd
import numpy as np

from IPython.display import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

df_train = pd.read_csv('sf04classification/train.csv', encoding='utf8', index_col='_id')
df_test = pd.read_csv('sf04classification/test.csv', encoding='utf8', index_col='_id')


def preproc_data(df_input):
    df_output = df_input.copy()
    
    df_output = df_output.drop(['month', 'day_of_week', 'default',
                                'contact', 'pdays', 'nr.employed',
                               'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m'], axis = 1)

    # category code
    df_output = pd.get_dummies(df_output, columns=['job', 'marital', 'education', 'housing', 'loan', 'poutcome'])
        
    return df_output

x = preproc_data(df_train).drop(['target'], axis=1)
y = df_train['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)


def count():
    lr.fit(x, y)
    
    df_test = pd.read_csv('sf04classification/test.csv', encoding='utf8')
    ans = lr.predict(preproc_data(df_test.drop(['_id'], axis=1)))
    result = pd.DataFrame()
    result['_id'] = df_test['_id']
    result['target'] = ans
    result.to_csv('submit.csv', index=False)
    
    !cut -f2 --delimiter="," submit.csv | grep 1 | wc -l



In [None]:
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train, y_train)
roc_auc_score(y_test, dtc.predict_proba(x_test)[:,1])

In [None]:
dtc = DecisionTreeClassifier(max_depth=100, criterion='entropy')
dtc.fit(x_train, y_train)
roc_auc_score(y_test, dtc.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=1000, algorithm="brute", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=2000, algorithm="brute", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=10000, algorithm="brute", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="brute", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="ball_tree")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="ball_tree", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="ball_tree", weights="distance", leaf_size=100)
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="kd_tree")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="kd_tree", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="kd_tree", weights="distance", leaf_size=100)
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="brute")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="brute", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="auto")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=500, algorithm="auto", weights="distance")
knn.fit(x_train, y_train)
roc_auc_score(y_test, knn.predict_proba(x_test)[:,1])

In [None]:
lr = LogisticRegression(n_jobs=-1)
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])


In [None]:
lr = LogisticRegression(n_jobs=-1, penalty="l1")
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])

In [5]:
lr = LogisticRegression(n_jobs=-1, class_weight='balanced')
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])
#3848

  " = {}.".format(self.n_jobs))


0.9200102211622497

In [6]:
lr = LogisticRegression(n_jobs=-1, class_weight='balanced', solver='lbfgs', multi_class='multinomial')
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])
#3985

0.9168367724828536

In [2]:
lr = LogisticRegression(n_jobs=-1, class_weight='balanced', solver='newton-cg', multi_class='multinomial')
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])
#3792
# удаление ивана - вторая попытка.

0.8829766440112904

In [4]:
count()

3737
