In [71]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [72]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sub = pd.read_csv('./data/sample_submission.csv')

train.shape, test.shape, sub.shape

((26049, 16), (6512, 15), (6512, 2))

In [73]:
y = train['income']
test['income'] = 'blank'

In [74]:
all_dat = pd.concat([train, test], axis=0)
all_dat.shape

(32561, 16)

In [75]:
all_dat['income'].value_counts()

<=50K    19744
blank     6512
>50K      6305
Name: income, dtype: int64

In [76]:
all_dat.loc[all_dat['income'] == '>50K', 'target'] = 1
all_dat.loc[all_dat['income'] == '<=50K', 'target'] = 0
all_dat.loc[all_dat['income'] == 'blank', 'target'] = 999
all_dat['target'] = all_dat['target'].astype('int')
all_dat.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,target
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K,1
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K,0
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K,0
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K,0
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K,0


In [77]:
all_dat['target'].value_counts()

0      19744
999     6512
1       6305
Name: target, dtype: int64

In [78]:
le = LabelEncoder()
all_dat['workclass_lbl'] = le.fit_transform(all_dat['workclass'])
all_dat.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,target,workclass_lbl
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K,1,4
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K,0,4
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K,0,4
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K,0,4
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K,0,4


In [79]:
all_dat['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [80]:
all_dat.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,target,workclass_lbl
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K,1,4
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K,0,4
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K,0,4
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K,0,4
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K,0,4


In [81]:
all_dat.columns

Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income', 'target', 'workclass_lbl'],
      dtype='object')

In [82]:
all_dat['education_lbl'] = le.fit_transform(all_dat['education'])
all_dat['marital_status_lbl'] = le.fit_transform(all_dat['marital_status'])
all_dat['occupation_lbl'] = le.fit_transform(all_dat['occupation'])
all_dat['relationship_lbl'] = le.fit_transform(all_dat['relationship'])
all_dat['race_lbl'] = le.fit_transform(all_dat['race'])
all_dat['native_country_lbl'] = le.fit_transform(all_dat['native_country'])
all_dat.head(3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,...,native_country,income,target,workclass_lbl,education_lbl,marital_status_lbl,occupation_lbl,relationship_lbl,race_lbl,native_country_lbl
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,...,United-States,>50K,1,4,11,2,12,0,4,39
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,...,United-States,<=50K,0,4,6,4,7,3,4,39
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,...,United-States,<=50K,0,4,15,4,8,3,4,39


In [83]:
sex_ = {"Male": 1, "Female": 2}
all_dat['sex'] = all_dat['sex'].map(sex_)
all_dat.head(3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,...,native_country,income,target,workclass_lbl,education_lbl,marital_status_lbl,occupation_lbl,relationship_lbl,race_lbl,native_country_lbl
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,...,United-States,>50K,1,4,11,2,12,0,4,39
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,...,United-States,<=50K,0,4,6,4,7,3,4,39
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,...,United-States,<=50K,0,4,15,4,8,3,4,39


In [84]:
## 필요없는 데이터 열을 확인
## 'workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country'
sel = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
all_dat_n = all_dat.drop(sel, axis=1)
all_dat_n

Unnamed: 0,id,age,fnlwgt,education_num,sex,capital_gain,capital_loss,hours_per_week,income,target,workclass_lbl,education_lbl,marital_status_lbl,occupation_lbl,relationship_lbl,race_lbl,native_country_lbl
0,0,40,168538,9,1,0,0,60,>50K,1,4,11,2,12,0,4,39
1,1,17,101626,5,1,0,0,20,<=50K,0,4,6,4,7,3,4,39
2,2,18,353358,10,1,0,0,16,<=50K,0,4,15,4,8,3,4,39
3,3,21,151158,10,2,0,0,25,<=50K,0,4,15,4,10,3,4,39
4,4,24,122234,10,2,0,0,20,<=50K,0,4,15,4,1,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,6507,35,61343,13,1,0,0,40,blank,999,4,9,2,12,0,4,39
6508,6508,41,32185,13,1,0,0,40,blank,999,5,9,2,13,0,4,39
6509,6509,39,409189,3,1,0,0,40,blank,999,4,4,2,8,0,4,26
6510,6510,35,180342,9,1,0,0,40,blank,999,4,11,2,3,0,4,39


In [85]:
train_n = all_dat_n.loc[(all_dat_n['target'] == 0) | (all_dat_n['target'] == 1), :]
test_n = all_dat_n.loc[all_dat_n['target'] == 999, :]

In [86]:
train_n.drop(['income'], axis=1, inplace=True)
test_n.drop(['income', 'target'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [89]:
sel = ['age', 'education_num', 'sex', 'hours_per_week', 'race_lbl']

X_tr_all = train_n[sel]
y_tr_all = train_n['target']
X_test_all = test_n[sel]

X_train, X_test, y_train, y_test = train_test_split(
    X_tr_all, y_tr_all,
    stratify=train_n['target'],
    random_state=0
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((19536, 5), (19536,), (6513, 5), (6513,))

In [90]:
model = LogisticRegression(n_jobs=-1)
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(model.score(X_train, y_train), model.score(X_test, y_test))
print(classification_report(y_test, pred, target_names=['<=50K', '>50K']))

0.8014946764946765 0.7985567326884692
              precision    recall  f1-score   support

       <=50K       0.82      0.93      0.88      4937
        >50K       0.64      0.38      0.47      1576

    accuracy                           0.80      6513
   macro avg       0.73      0.65      0.67      6513
weighted avg       0.78      0.80      0.78      6513



In [14]:
sel = ['id', 'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'workclass']

new_train = all_dat.loc[all_dat['target'] != 999]
new_test = all_dat.loc[all_dat['target'] == 999]

X = new_train[sel]
y = new_train['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=new_train['target'],
    random_state=42
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((19536, 8), (19536,), (6513, 8), (6513,))

In [21]:
model = GradientBoostingClassifier(random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred, target_names=['<=50K', '>50K']))
print()
print(f1_score(y_test, pred))

              precision    recall  f1-score   support

       <=50K       0.85      0.96      0.90      4937
        >50K       0.81      0.47      0.59      1576

    accuracy                           0.84      6513
   macro avg       0.83      0.72      0.75      6513
weighted avg       0.84      0.84      0.83      6513


0.5917874396135266
