In [1]:
import os
from os.path import join
import copy
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import sklearn

import matplotlib.pyplot as plt

adult_path = join('data', 'adult_data.csv')
column_path = join('data', 'adult_names.txt')

adult_columns = []
for ac in open(column_path):
    adult_columns = ac.split() 

In [2]:
data = pd.read_csv(adult_path, names=adult_columns)
label = data['income']
del data['income']
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [3]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
age               48842 non-null int64
workclass         48842 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    48842 non-null object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [5]:
data = pd.get_dummies(data)
label = label.map(lambda x : 0 if x =='>50K' else 1)

### Cross Validation 

In [6]:
from sklearn.model_selection import train_test_split
x, x_test, y, y_test = train_test_split(data, label, test_size=0.2, stratify=label, shuffle=True, random_state=1905)

In [7]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, stratify=y, shuffle=True, random_state=1905)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(random_state=1905)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1905, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
y_pred_val = lr.predict(x_valid)
print("Validation Accuracy Score : {:.2f}".format(accuracy_score(y_valid,y_pred_val)))

Validation Accuracy Score : 0.80


In [10]:
y_pred = lr.predict(x_test)
print("Validation Accuracy Score : {:.2f}".format(accuracy_score(y_test,y_pred)))

Validation Accuracy Score : 0.79


#### k-fold with stratify 

In [11]:
from sklearn.datasets import load_iris
iris = load_iris()

kf_data = iris.data
kf_label = iris.target
kf_columns = iris.feature_names

In [12]:
kf_data = pd.DataFrame(kf_data, columns=kf_columns)

In [13]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1905)

In [14]:
for i, (trn_idx, val_idx) in enumerate(skf.split(kf_data, kf_label)):
    trn_data, trn_label = kf_data.values[trn_idx, :], kf_label[trn_idx]
    val_data, val_label = kf_data.values[val_idx, :], kf_label[val_idx]
    
    print("{} Fold, trn label\n{}".format(i, trn_label))
    print("{} Fold, val label\n{}".format(i, val_label))

0 Fold, trn label
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2]
0 Fold, val label
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]
1 Fold, trn label
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2]
1 Fold, val label
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]
2 Fold, trn label
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2]
2 Fold, val label
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
val_scores = []

for i, (trn_idx, val_idx) in enumerate(skf.split(kf_data, kf_label)):
    trn_data, trn_label = kf_data.values[trn_idx, :], kf_label[trn_idx]
    val_data, val_label = kf_data.values[val_idx, :], kf_label[val_idx]
    
    clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1905)
    
    clf.fit(trn_data, trn_label)
    
    trn_acc = clf.score(trn_data, trn_label)
    val_acc = clf.score(val_data, val_label)
    print("{} Fold, train Accuracy : {:.2f}, validation Accuracry : {:.2f}".format(i, trn_acc, val_acc))
    
    val_scores.append(val_acc)
    
print("Cross Validation Score : {:.2f}".format(np.mean(val_scores)))

0 Fold, train Accuracy : 1.00, validation Accuracry : 0.93
1 Fold, train Accuracy : 1.00, validation Accuracry : 1.00
2 Fold, train Accuracy : 1.00, validation Accuracry : 1.00
3 Fold, train Accuracy : 1.00, validation Accuracry : 0.90
4 Fold, train Accuracy : 1.00, validation Accuracry : 0.97
Cross Validation Score : 0.96


#### Cross Validation Score 

In [17]:
from sklearn.model_selection import cross_val_score

In [21]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1905)
print("Random Forest k-fold CV score : {}".format(cross_val_score(rf, kf_data, kf_label, cv=skf)))

Random Forest k-fold CV score : [0.93333333 1.         1.         0.9        0.96666667]


### Parameter Tuning 

#### Grid Search

In [22]:
from sklearn.model_selection import GridSearchCV

In [25]:
params = {"n_estimators" : [50, 100, 150, 200], "max_depth" : [5, 10, 15, 20], "min_samples_split" :[2, 5, 10]}

clf = GridSearchCV(RandomForestClassifier(), params, cv=skf)

In [26]:
clf

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1905, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
         

In [28]:
clf.fit(kf_data, kf_label)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1905, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
         

In [29]:
print("GridSearch CV best score : {:.2f}, best_params : {}".format(clf.best_score_, clf.best_params_))

GridSearch CV best score : 0.97, best_params : {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}


#### Scikit-Optimaize

### Ensemble

#### Voting Ensemble

In [31]:
from sklearn.neural_network import MLPClassifier

In [32]:
from sklearn.ensemble import VotingClassifier
clfs = [("LR", LogisticRegression()),("RF", RandomForestClassifier(max_depth=5)),("MLP", MLPClassifier())]

vote_clf=VotingClassifier(clfs)

In [33]:
vote_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('RF',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

In [34]:
print('Cross Validation Acc : {:.2f}'.format(vote_clf.score(x_valid, y_valid)))

Cross Validation Acc : 0.82


In [35]:
y_pred = vote_clf.predict(x_test)

In [36]:
print("Voting Ensemble Acc : {:.2f}".format(vote_clf.score(x_test, y_test)))

Voting Ensemble Acc : 0.82


#### Bagging, Average Blending 

In [37]:
clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1905)
clf.fit(x_train, y_train)
print("Random Forest Acc : {:.2f}".format(clf.score(x_test, y_test)))

Random Forest Acc : 0.84


In [39]:
val_scores = []
y_pred = np.zeros_like(y_test, dtype=np.float)

for i, (trn_idx, val_idx) in enumerate(skf.split(x, y)):
    trn_data, trn_label = x.values[trn_idx, :], y.values[trn_idx]
    val_data, val_label = x.values[val_idx, :], y.values[val_idx]
    
    clf = RandomForestClassifier(n_estimators=50, max_depth=5,random_state=1905)
    
    clf.fit(trn_data, trn_label)
    trn_acc = clf.score(trn_data, trn_label)
    val_acc = clf.score(val_data, val_label)
    print("{} Fold, train accuracy : {:.2f}, validation accuracy : {:.2f}".format(i, trn_acc, val_acc))
    
    val_scores.append(val_acc)
    y_pred += (clf.predict_proba(x_test)[:,1]/skf.n_splits)
    
print("Cross Validation Score : {:.2f}".format(np.mean(val_scores)))

0 Fold, train accuracy : 0.84, validation accuracy : 0.84
1 Fold, train accuracy : 0.84, validation accuracy : 0.84
2 Fold, train accuracy : 0.84, validation accuracy : 0.84
3 Fold, train accuracy : 0.84, validation accuracy : 0.84
4 Fold, train accuracy : 0.84, validation accuracy : 0.84
Cross Validation Score : 0.84


In [40]:
y_pred = [0 if y < 0.5 else 1 for y in y_pred]
print("Average Blending Acc : {:.2f}".format(accuracy_score(y_test, y_pred)))

Average Blending Acc : 0.84
