In [81]:
import numpy as np
import pandas as pd
import xgboost as xgb
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, accuracy_score


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/kakr-4th-competition/train.csv
/kaggle/input/kakr-4th-competition/test.csv
/kaggle/input/kakr-4th-competition/sample_submission.csv


In [82]:
train_df = pd.read_csv('../input/kakr-4th-competition/train.csv')
test_df = pd.read_csv('../input/kakr-4th-competition/test.csv')

In [83]:
train_df.drop(['id'], axis=1, inplace=True)
test_df.drop(['id'], axis=1, inplace=True)

In [84]:
y = train_df['income'] != '<=50K'
X = train_df.drop(['income'], axis=1)

In [85]:
# 라벨 인코더 생성
LE_encoder = OrdinalEncoder(list(X.columns))

# train, test 데이터에 인코딩 적용
X = LE_encoder.fit_transform(X, y)
test_df = LE_encoder.transform(test_df)

In [86]:
X['income'] = y
X.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,40,1,168538,1,9,1,1,1,1,1,0,0,60,1,True
1,17,1,101626,2,5,2,2,2,1,1,0,0,20,1,False
2,18,1,353358,3,10,2,3,2,1,1,0,0,16,1,False
3,21,1,151158,3,10,2,4,2,1,2,0,0,25,1,False
4,24,1,122234,3,10,2,5,3,2,2,0,0,20,2,False


In [87]:
test_df['native_country'] = test_df['native_country'].astype(np.int64)

2. Random Forest Classifier
3. Extra Trees Classifier
4. AdaBoost Classifier
5. Gradient Boosting Machine
6. DecisionTree

AdaBoost

In [88]:
class Model_Creation(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self, x, y):
        return self.clf.fit(x, y)
    
    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)

In [89]:
train_size = train_df.shape[0]
test_size = test_df.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, random_state=SEED)

def get_scores(clf, x_train_get, y_train_get, x_test_get):
    pred_train = np.zeros((train_size,))
    pred_test = np.zeros((test_size,))
    pred_test_kfold = np.empty((NFOLDS, test_size))
        
    for i, (train_index, val_index) in enumerate(kf.split(x_train_get)):
        x_train = x_train_get[train_index]
        y_train = y_train_get[train_index]
        x_val = x_train_get[val_index]
        
        clf.train(x_train, y_train)
        
        pred_train[val_index] = clf.predict(x_val)
        pred_test_kfold[i, :] = clf.predict(x_test_get)
        
    pred_test[:] = pred_test_kfold.mean(axis=0)
    
    pred_train = pred_train.astype(int)
    pred_test = pred_test.astype(int)
    
    clf_acc_score = accuracy_score(y_train_get, pred_train)
    clf_f1_score = f1_score(y_train_get, pred_train)
    
    return pred_train.reshape(-1, 1), pred_test.reshape(-1, 1), clf_acc_score, clf_f1_score



In [90]:
svm_params = {
    'kernel' : 'rbf'
}

dt_params = {
    'criterion' : 'gini',
    'max_depth' : 5,
    'min_samples_split' : 2
}

rf_params = {
    'n_estimators' : 100,
    'min_samples_split' : 2
}

In [91]:
#svc_model = Model_Creation(clf = SVC, seed = SEED, params = svm_params)
dt_model = Model_Creation(clf=DecisionTreeClassifier, seed=SEED, params=dt_params)

In [92]:
y_train = X['income'].values
X_train = X.drop(['income'], axis=1).values
X_test = test_df.values

In [93]:
#svc_train_result, svc_test_result, svc_acc_score, svc_f1_score = get_scores(clf=svc_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)
dt_train_result, dt_test_result, dt_acc_score, dt_f1_score = get_scores(clf=dt_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

In [94]:
unique, counts = np.unique(dt_train_result, return_counts=True)

In [95]:
unique

array([0, 1])

In [96]:
counts

array([21759,  4290])

In [97]:
unique, counts = np.unique(dt_test_result, return_counts=True)
print(unique)
print(counts)

[0 1]
[5535  977]


In [98]:
print('Accuracy score of dt :', round(dt_acc_score, 4) * 100, '%')
print('F1-Score of dt :', round(dt_f1_score, 4) * 100)

Accuracy score of dt : 85.2 %
F1-Score of dt : 63.61


In [99]:
svc_model = Model_Creation(clf = SVC, seed = SEED, params = svm_params)
svc_train_result, svc_test_result, svc_acc_score, svc_f1_score = get_scores(clf=svc_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

In [100]:
unique, counts = np.unique(svc_train_result, return_counts=True)
print(unique, counts)

[0 1] [25113   936]


In [101]:
print('Accuracy score of svc :', round(svc_acc_score, 4) * 100, '%')
print('F1-Score of svc :', round(svc_f1_score, 4) * 100)

Accuracy score of svc : 79.29 %
F1-Score of svc : 25.490000000000002


In [102]:
unique, counts = np.unique(svc_test_result, return_counts=True)
print(unique, counts)

[0 1] [6294  218]


In [104]:
rf_params = {
    'n_estimators' : 100,
    'min_samples_split' : 2
}
rf_model = Model_Creation(clf = RandomForestClassifier, seed = SEED, params = rf_params)
rf_train_result, rf_test_result, rf_acc_score, rf_f1_score = get_scores(clf=rf_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

In [105]:
unique, counts = np.unique(rf_train_result, return_counts=True)
print(unique, counts)
print('Accuracy score of svc :', round(rf_acc_score, 4) * 100, '%')
print('F1-Score of svc :', round(rf_f1_score, 4) * 100)
unique, counts = np.unique(rf_test_result, return_counts=True)
print(unique, counts)

[0 1] [20780  5269]
Accuracy score of svc : 85.46000000000001 %
F1-Score of svc : 67.27
[0 1] [5551  961]


In [110]:
et_params = {
    'n_estimators': 100,
    #'max_features': 0.5,
    #'max_depth': 5,
    'min_samples_leaf': 2,
}

et_model = Model_Creation(clf = ExtraTreesClassifier, seed = SEED, params = et_params)
et_train_result, et_test_result, et_acc_score, et_f1_score = get_scores(clf=et_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

unique, counts = np.unique(et_train_result, return_counts=True)
print(unique, counts)
print('Accuracy score of svc :', round(et_acc_score, 4) * 100, '%')
print('F1-Score of svc :', round(et_f1_score, 4) * 100)
unique, counts = np.unique(et_test_result, return_counts=True)
print(unique, counts)

[0 1] [21161  4888]
Accuracy score of svc : 85.61 %
F1-Score of svc : 66.51
[0 1] [5512 1000]


In [111]:
ada_params = {
    'n_estimators': 100,
    'learning_rate' : 0.75
}

ada_model = Model_Creation(clf = AdaBoostClassifier, seed = SEED, params = ada_params)
ada_train_result, ada_test_result, ada_acc_score, ada_f1_score = get_scores(clf=ada_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

unique, counts = np.unique(ada_train_result, return_counts=True)
print(unique, counts)
print('Accuracy score of svc :', round(ada_acc_score, 4) * 100, '%')
print('F1-Score of svc :', round(ada_f1_score, 4) * 100)
unique, counts = np.unique(ada_test_result, return_counts=True)
print(unique, counts)

[0 1] [21168  4881]
Accuracy score of svc : 86.00999999999999 %
F1-Score of svc : 67.42
[0 1] [5459 1053]


In [113]:
gb_params = {
    'n_estimators': 100,
     #'max_features': 0.2,
    #'max_depth': 5,
    'min_samples_leaf': 2,
    #'verbose': 0
}

gb_model = Model_Creation(clf = GradientBoostingClassifier, seed = SEED, params = gb_params)
gb_train_result, gb_test_result, gb_acc_score, gb_f1_score = get_scores(clf=gb_model, x_train_get=X_train, y_train_get=y_train, x_test_get=X_test)

unique, counts = np.unique(gb_train_result, return_counts=True)
print(unique, counts)
print('Accuracy score of svc :', round(gb_acc_score, 4) * 100, '%')
print('F1-Score of svc :', round(gb_f1_score, 4) * 100)
unique, counts = np.unique(gb_test_result, return_counts=True)
print(unique, counts)

[0 1] [21163  4886]
Accuracy score of svc : 86.46000000000001 %
F1-Score of svc : 68.47999999999999
[0 1] [5445 1067]
