In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train = pd.read_csv('./data/train.csv')

train.shape

(26049, 16)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26049 entries, 0 to 26048
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26049 non-null  int64 
 1   age             26049 non-null  int64 
 2   workclass       26049 non-null  object
 3   fnlwgt          26049 non-null  int64 
 4   education       26049 non-null  object
 5   education_num   26049 non-null  int64 
 6   marital_status  26049 non-null  object
 7   occupation      26049 non-null  object
 8   relationship    26049 non-null  object
 9   race            26049 non-null  object
 10  sex             26049 non-null  object
 11  capital_gain    26049 non-null  int64 
 12  capital_loss    26049 non-null  int64 
 13  hours_per_week  26049 non-null  int64 
 14  native_country  26049 non-null  object
 15  income          26049 non-null  object
dtypes: int64(7), object(9)
memory usage: 3.2+ MB


In [4]:
train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K


In [6]:
train.loc[train['income'] == '>50K', 'target'] = 1
train.loc[train['income'] == '<=50K', 'target'] = 0
train['target'] = train['target'].astype('int')
le = LabelEncoder()
train['sex_labeled'] = le.fit_transform(train['sex'])
train['workclass_labeled'] = le.fit_transform(train['workclass'])

In [8]:
sel = ['age', 'workclass_labeled', 'fnlwgt', 'education_num', 'sex_labeled', 'capital_gain', 'capital_loss', 'hours_per_week']
X = train[sel]
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=train['target'],
    random_state=0
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((19536, 8), (19536,), (6513, 8), (6513,))

In [11]:
model = RandomForestClassifier(n_jobs=-1, random_state=0)
model.fit(X_train, y_train)
score = cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc')
print(f'mean cross validation score(AUC): {score.mean()}')

mean cross validation score(AUC): 0.8508620200260172


In [12]:
model = GradientBoostingClassifier(random_state=0)
model.fit(X_train, y_train)
score = cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc')
print(f'mean cross validation score(AUC): {score.mean()}')

mean cross validation score(AUC): 0.8820857112408429


In [13]:
model = LogisticRegression(n_jobs=-1)
model.fit(X_train, y_train)
score = cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc')
print(f'mean cross validation score(AUC): {score.mean()}')

mean cross validation score(AUC): 0.5806404982473807


In [15]:
model = KNeighborsClassifier(n_jobs=-1)
model.fit(X_train, y_train)
score = cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc')
print(f'mean cross validation score(AUC): {score.mean()}')

mean cross validation score(AUC): 0.6654839676868098


In [16]:
n_estimators = [i * 100 for i in range(1, 6)]
lr_list = [10, 1, 0.1, 0.01, 0.001]

estimators = []
lr_lst = []
train_score = []
test_score = []
auc_score = []

for n in n_estimators:
    for lr in lr_list:
        model = GradientBoostingClassifier(
            learning_rate=lr,
            n_estimators=n,
            random_state=0
        )
        model.fit(X_train, y_train)

        estimators.append(n)
        lr_lst.append(lr)
        train_score.append(model.score(X_train, y_train))
        test_score.append(model.score(X_test, y_test))
        auc_score.append(cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc').mean())

data = {
    'estimators': estimators,
    'learning_rate': lr_lst,
    'train_score': train_score,
    'test_score': test_score,
    'auc_score': auc_score
}

df = pd.DataFrame(data)
df

Unnamed: 0,estimators,learning_rate,train_score,test_score,auc_score
0,100,10.0,0.239865,0.237218,0.50565
1,100,1.0,0.876894,0.843083,0.873443
2,100,0.1,0.852375,0.844926,0.882086
3,100,0.01,0.81163,0.811761,0.85526
4,100,0.001,0.757934,0.758022,0.797006
5,200,10.0,0.239865,0.237218,0.50565
6,200,1.0,0.898034,0.83556,0.865629
7,200,0.1,0.858825,0.848764,0.887169
8,200,0.01,0.836405,0.834638,0.866419
9,200,0.001,0.757934,0.758022,0.803146


In [18]:
max_auc = df.loc[df['auc_score'] == df['auc_score'].max()]
max_auc

Unnamed: 0,estimators,learning_rate,train_score,test_score,auc_score
17,400,0.1,0.865223,0.851221,0.889057


In [19]:
model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=400,
    random_state=0
)
model.fit(X_train, y_train)

cross_val_score(model, X, y, n_jobs=-1, scoring='roc_auc').mean()

0.8890568331570121