- Import packages

In [None]:
import sqlite3

import numpy as np
import pandas as pd
import sklearn_evaluation
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

- Load the data and undersample it to avoid the class-imbalance problem

In [None]:
# Load the feature table. The slicing below is positional: column 2 is taken
# as the label and columns 3 onward as the features.
df = pd.read_excel('var.xlsx')

# Balance the classes by randomly discarding rows until they are even.
# `fit_resample` is the supported name; the older `fit_sample` alias has been
# removed from imbalanced-learn.
x, y = RandomUnderSampler(random_state=0).fit_resample(df.iloc[:, 3:], df.iloc[:, 2])

# Stratified hold-out split. The seed is fixed so that Restart & Run All
# reproduces the same split (and therefore the same scores further down).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=0
)

- KNN, Logistic
    - 정규화를 하지 않는 편이 성능이 더 좋다.

In [None]:
# Columns that are rescaled; the same list feeds both transformers.
# (The following cell re-creates these same four objects.)
scaled_columns = [
    'left_branching_entropy',
    'right_branching_entropy',
    'left_accessor_variety',
    'right_accessor_variety',
    'leftside_frequency',
    'rightside_frequency',
]

# One single-step pipeline per scaler: min-max vs. standardisation.
pipe1 = Pipeline([('mm', MinMaxScaler())])
pipe2 = Pipeline([('ss', StandardScaler())])

# NOTE(review): ColumnTransformer drops unlisted columns by default
# (remainder='drop') - confirm x has no other feature columns.
ct1 = ColumnTransformer([('mm', pipe1, scaled_columns)])
ct2 = ColumnTransformer([('ss', pipe2, scaled_columns)])

In [None]:
# Columns that are rescaled in the two "scaled" pipelines.
scaled_columns = [
    'left_branching_entropy',
    'right_branching_entropy',
    'left_accessor_variety',
    'right_accessor_variety',
    'leftside_frequency',
    'rightside_frequency',
]

pipe1 = Pipeline([('mm', MinMaxScaler())])
pipe2 = Pipeline([('ss', StandardScaler())])
# NOTE(review): ColumnTransformer drops unlisted columns by default
# (remainder='drop'). If x carries other feature columns, the scaled pipelines
# see fewer features than the unscaled one - confirm this is intended.
ct1 = ColumnTransformer([('mm', pipe1, scaled_columns)])
ct2 = ColumnTransformer([('ss', pipe2, scaled_columns)])

# Three variants of the same model search: min-max scaled, standardised, raw.
final_pipe1 = Pipeline([('ct', ct1), ('clf', KNeighborsClassifier())])
final_pipe2 = Pipeline([('ct', ct2), ('clf', KNeighborsClassifier())])
ns_pipe = Pipeline([('clf', KNeighborsClassifier())])


def _knn_logreg_param_grid():
    """Return a fresh KNN + logistic-regression search space for the 'clf' step.

    The logistic-regression part is split by solver capability: 'liblinear'
    and 'saga' accept both l1 and l2 penalties, while 'sag' and 'lbfgs' only
    accept l2. Crossing l1 with the latter two produces candidates that fail
    to fit, so those combinations are not generated.
    """
    c_values = [0.1, 1, 10, 100, 1000]
    return [
        {'clf': [KNeighborsClassifier()], 'clf__n_neighbors': range(2, 8)},
        {'clf': [LogisticRegression()], 'clf__penalty': ['l1', 'l2'],
         'clf__C': c_values, 'clf__solver': ['liblinear', 'saga']},
        {'clf': [LogisticRegression()], 'clf__penalty': ['l2'],
         'clf__C': c_values, 'clf__solver': ['sag', 'lbfgs']},
    ]


grid1 = GridSearchCV(final_pipe1, _knn_logreg_param_grid())
grid2 = GridSearchCV(final_pipe2, _knn_logreg_param_grid())
grid3 = GridSearchCV(ns_pipe, _knn_logreg_param_grid())

In [None]:
# Tune on the training split only. The hold-out rows (X_test / y_test) are
# used further down to score the selected models, so they must stay out of the
# hyper-parameter search.
# Each search's cv_results_ table is exported transposed (one column per
# candidate) to its own workbook.
grid1.fit(X_train, y_train)
temp1 = pd.DataFrame(grid1.cv_results_).T
temp1.to_excel('mm_var.xlsx')

grid2.fit(X_train, y_train)
temp2 = pd.DataFrame(grid2.cv_results_).T
temp2.to_excel('ss_var.xlsx')

grid3.fit(X_train, y_train)
temp3 = pd.DataFrame(grid3.cv_results_).T
temp3.to_excel('ns_var.xlsx')

In [None]:
# Show the best pipeline found by the unscaled search (grid3).
grid3.best_estimator_  # to inspect the scaled searches instead: grid1.best_estimator_, grid2.best_estimator_

In [None]:
# Unscaled logistic regression with fixed hyper-parameters (presumably copied
# from the grid-search output above), fitted on the training split.
logreg_clf = LogisticRegression(solver='liblinear', C=100)
pipe = Pipeline(steps=[('clf', logreg_clf)])
pipe.fit(X_train, y_train)

In [None]:
# Predict the hold-out rows, then display (F1, recall, accuracy) as one tuple.
y_pred = pipe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Recursive feature elimination around the same logistic-regression settings.
# n_features_to_select is left at its default, so RFE keeps half the features.
rfe = RFE(estimator=LogisticRegression(solver='liblinear', C=100))
rfe.fit(X_train, y_train)

In [None]:
# Same hold-out metrics as before, now for the RFE-reduced model.
y_pred = rfe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Confusion matrix for the RFE model (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

- DecisionTree
    - RFE를 쓰지 않는 편이 성능이 더 좋다.

In [None]:
# Search space for a single decision tree, addressed through the 'clf' step.
# NOTE(review): random_state is searched like a hyper-parameter here - confirm
# that selecting on the seed is intended.
tree_search_space = {
    'clf': [DecisionTreeClassifier()],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': range(4, 20),
    'clf__min_samples_split': range(2, 6),
    'clf__min_samples_leaf': range(2, 4),
    'clf__random_state': [0, 13],
}

ns_pipe = Pipeline([('clf', DecisionTreeClassifier())])
grid = GridSearchCV(ns_pipe, [tree_search_space])

In [None]:
# Search on the training split only: X_test / y_test are used below to score
# the chosen model, so they must not take part in hyper-parameter selection.
grid.fit(X_train, y_train)

In [None]:
# Best pipeline found by the decision-tree grid search.
grid.best_estimator_

In [None]:
# Decision tree with fixed hyper-parameters (presumably copied from the
# grid-search output above), fitted on the training split.
tree_clf = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=2,
    random_state=13,
)
pipe = Pipeline(steps=[('clf', tree_clf)])
pipe.fit(X_train, y_train)

In [None]:
# Predict the hold-out rows, then display (F1, recall, accuracy) as one tuple.
y_pred = pipe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Recursive feature elimination around a decision tree; with the default
# n_features_to_select, RFE keeps half of the features.
# NOTE(review): these settings (entropy, depth 5, seed 0) differ from the plain
# decision-tree pipeline fitted earlier in this notebook (default criterion,
# depth 6, seed 13), so the RFE vs. no-RFE comparison also varies the model -
# confirm this is intended.
rfe_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=2,
    random_state=0,
)
rfe = RFE(estimator=rfe_tree)
rfe.fit(X_train, y_train)

In [None]:
# Same hold-out metrics, now for the RFE-reduced decision tree.
y_pred = rfe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Confusion matrix for the RFE model (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Learning curve for the decision tree under shuffled 10-fold CV on the full
# under-sampled data. KFold, learning_curve and sklearn_evaluation come from
# the notebook's import cell rather than being re-imported here.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
train_size, train_score, test_score = learning_curve(
    DecisionTreeClassifier(criterion='entropy', max_depth=5,
                           min_samples_leaf=2, random_state=0),
    x, y, cv=kfold,
)
sklearn_evaluation.plot.learning_curve(train_score, test_score, train_size)

- RandomForest

In [None]:
# Search space for the random forest, addressed through the 'clf' step.
# NOTE(review): random_state is searched like a hyper-parameter here - confirm
# that selecting on the seed is intended.
forest_search_space = {
    'clf': [RandomForestClassifier()],
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [3, 5, 7, 9],
    'clf__min_samples_leaf': [2, 3, 5, 7, 9],
    'clf__min_samples_split': [3, 5, 7, 9],
    'clf__random_state': [0, 13],
}

ns_pipe = Pipeline([('clf', RandomForestClassifier())])
grid = GridSearchCV(ns_pipe, [forest_search_space])

In [None]:
# Search on the training split only: X_test / y_test are used below to score
# the chosen model, so they must not take part in hyper-parameter selection.
grid.fit(X_train, y_train)

In [None]:
# Best pipeline found by the random-forest grid search.
grid.best_estimator_

In [None]:
# Random forest with fixed hyper-parameters (presumably copied from the
# grid-search output above), fitted on the training split.
forest_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=9,
    min_samples_leaf=2,
    min_samples_split=9,
    random_state=0,
)
pipe = Pipeline(steps=[('clf', forest_clf)])
pipe.fit(X_train, y_train)

In [None]:
# Predict the hold-out rows, then display (F1, recall, accuracy) as one tuple.
y_pred = pipe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Recursive feature elimination around the same forest settings as the plain
# pipeline; with the default n_features_to_select, RFE keeps half the features.
rfe_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=9,
    min_samples_leaf=2,
    min_samples_split=9,
    random_state=0,
)
rfe = RFE(estimator=rfe_forest)
rfe.fit(X_train, y_train)

In [None]:
# Same hold-out metrics, now for the RFE-reduced random forest.
y_pred = rfe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Confusion matrix for the RFE model (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Learning curve for the random forest under shuffled 10-fold CV on the full
# under-sampled data. KFold, learning_curve and sklearn_evaluation come from
# the notebook's import cell rather than being re-imported here.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
train_size, train_score, test_score = learning_curve(
    RandomForestClassifier(max_depth=9, min_samples_leaf=2,
                           min_samples_split=9, n_estimators=300,
                           random_state=0),
    x, y, cv=kfold,
)
sklearn_evaluation.plot.learning_curve(train_score, test_score, train_size)

- XGBoost

In [None]:
ns_pipe = Pipeline([('clf', XGBClassifier())])

# Settings searched for both boosters.
# NOTE(review): random_state is searched like a hyper-parameter here - confirm
# that selecting on the seed is intended.
xgb_common_space = {
    'clf': [XGBClassifier()],
    'clf__n_estimators': [100, 500, 1000],
    'clf__learning_rate': [0.05, 0.1, 0.15, 0.2],
    'clf__random_state': [0, 13],
}

# max_depth and subsample are tree-booster parameters, so they are searched
# only for 'gbtree'. Crossing them with 'gblinear' would add twelve times as
# many linear candidates with no parameter the linear booster actually reads.
grid = GridSearchCV(ns_pipe, [
    {**xgb_common_space,
     'clf__booster': ['gbtree'],
     'clf__max_depth': [3, 5, 7, 9],
     'clf__subsample': [0.6, 0.8, 1.0]},
    {**xgb_common_space,
     'clf__booster': ['gblinear']},
])

In [None]:
# Search on the training split only: X_test / y_test are used below to score
# the chosen model, so they must not take part in hyper-parameter selection.
grid.fit(X_train, y_train)

In [None]:
# Best pipeline found by the XGBoost grid search.
grid.best_estimator_

In [None]:
# XGBoost settings spelled out in full (presumably pasted from the repr of the
# grid-search winner above).
xgb_settings = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=5,
    min_child_weight=1,
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.6,
    tree_method='exact',
    validate_parameters=1,
)

# Single-step pipeline around the classifier, fitted on the training split.
pipe = Pipeline(
    memory=None,
    steps=[('clf', XGBClassifier(**xgb_settings))],
    verbose=False,
)
pipe.fit(X_train, y_train)

In [None]:
# Predict the hold-out rows, then display (F1, recall, accuracy) as one tuple.
y_pred = pipe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Recursive feature elimination around the same fully spelled-out XGBoost
# settings; with the default n_features_to_select, RFE keeps half the features.
rfe_xgb_settings = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=5,
    min_child_weight=1,
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.6,
    tree_method='exact',
    validate_parameters=1,
)
rfe = RFE(estimator=XGBClassifier(**rfe_xgb_settings))
rfe.fit(X_train, y_train)

In [None]:
# Same hold-out metrics, now for the RFE-reduced XGBoost model.
y_pred = rfe.predict(X_test)
(
    f1_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

In [None]:
# Confusion matrix for the RFE model (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Learning curve for the XGBoost model under shuffled 10-fold CV on the full
# under-sampled data. KFold, learning_curve and sklearn_evaluation come from
# the notebook's import cell rather than being re-imported here.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
train_size, train_score, test_score = learning_curve(
    XGBClassifier(base_score=0.5, booster='gbtree',
                  colsample_bylevel=1, colsample_bynode=1,
                  colsample_bytree=1, gamma=0,
                  importance_type='gain',
                  interaction_constraints='', learning_rate=0.05,
                  max_delta_step=0, max_depth=5,
                  min_child_weight=1, n_estimators=100,
                  n_jobs=0, num_parallel_tree=1, random_state=0,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                  subsample=0.6, tree_method='exact',
                  validate_parameters=1),
    x, y, cv=kfold,
)
sklearn_evaluation.plot.learning_curve(train_score, test_score, train_size)