In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy as dc
from pandas_profiling import ProfileReport

### Preprocessing and Analysis

In [57]:
data = pd.read_csv("sample_data.csv")

In [58]:
data = data.drop_duplicates()

In [59]:
data = data.drop("fnlwgt",axis=1)

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48813 entries, 0 to 48841
Data columns (total 14 columns):
age               48813 non-null int64
workclass         46014 non-null object
education         48813 non-null object
education-num     48813 non-null int64
marital-status    48813 non-null object
occupation        46004 non-null object
relationship      48813 non-null object
race              48813 non-null object
sex               48813 non-null object
capital-gain      48813 non-null int64
capital-loss      48813 non-null int64
hours-per-week    48813 non-null int64
native-country    47957 non-null object
income            48813 non-null object
dtypes: int64(5), object(9)
memory usage: 5.6+ MB


In [6]:
data.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48813.0,48813.0,48813.0,48813.0,48813.0
mean,38.647348,10.078688,1079.708705,87.554299,40.425051
std,13.709005,2.570257,7454.185982,403.118605,12.390954
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [7]:
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
data.income.unique()

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

In [61]:
data.income = data.income.apply(lambda x: ">50K" if x==">50K." else x)
data.income = data.income.apply(lambda x: "<=50K" if x=="<=50K." else x)

In [10]:
data["marital-status"].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [11]:
data["workclass"].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

#### There is null data in workclass column. Since there is no null data in education column, here I assume that I can impute workclass with the mode value of thw workclass column associated with the person's education level.

In [12]:
data[data.workclass.isnull()]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
27,54,,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,40,,<=50K
69,25,,Some-college,10,Never-married,,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,<=50K
106,17,,10th,6,Never-married,,Own-child,White,Female,34095,0,32,United-States,<=50K
128,35,,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<=50K
149,43,,Some-college,10,Divorced,,Not-in-family,White,Female,0,0,40,United-States,<=50K
154,52,,HS-grad,9,Divorced,,Not-in-family,White,Male,0,0,45,United-States,>50K
160,68,,1st-4th,2,Divorced,,Not-in-family,White,Female,0,0,20,United-States,<=50K
187,53,,Bachelors,13,Divorced,,Not-in-family,White,Female,0,0,50,United-States,<=50K


In [13]:
data[data.education.isnull()]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [62]:
data.workclass = data.groupby('education').workclass.transform(lambda x: x.fillna(x.mode()[0]))
data.occupation = data.groupby('education').occupation.transform(lambda x: x.fillna(x.mode()[0]))

#### Doing the same thing to native-country column based on race.

In [15]:
data[data.race.isnull()]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [16]:
data[data["native-country"].isnull()]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14,40,Private,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,,>50K
38,31,Private,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,38,,>50K
51,18,Private,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,,<=50K
61,32,Private,7th-8th,4,Married-spouse-absent,Craft-repair,Not-in-family,White,Male,0,0,40,,<=50K
93,30,Private,HS-grad,9,Married-civ-spouse,Sales,Wife,Asian-Pac-Islander,Female,0,1573,35,,<=50K
245,56,Private,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,35,,<=50K
249,45,Private,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,,<=50K
297,39,Private,Masters,14,Married-civ-spouse,Prof-specialty,Wife,Asian-Pac-Islander,Female,3464,0,40,,<=50K
393,34,State-gov,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,,>50K
453,42,Private,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,,>50K


In [63]:
data["native-country"] = data.groupby('race')["native-country"].transform(lambda x: x.fillna(x.mode()[0]))

In [64]:
data.income = data.income.apply(lambda x: 0 if x=="<=50K" else 1)

In [65]:
cat_col_list = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country"]
num_col_list = [col for col in data.drop("income",axis=1).columns if col not in cat_col_list]

In [66]:
for i in cat_col_list:
    data[i] = data[i].astype('category')

#### Classes are imbalanced.

In [26]:
data.groupby("income").count()

Unnamed: 0_level_0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,37128,37128,37128,37128,37128,37128,37128,37128,37128,37128,37128,37128,37128
1,11685,11685,11685,11685,11685,11685,11685,11685,11685,11685,11685,11685,11685


In [67]:
from sklearn.utils import class_weight
cw = list(class_weight.compute_class_weight('balanced',
                                             np.unique(data['income']),
                                             data['income']))

In [68]:
cw

[0.6573610213316096, 2.0887034659820283]

### Splitting data

In [69]:
from sklearn.model_selection import train_test_split
X = data.drop("income",axis=1)
y = data["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [70]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39050, 13), (9763, 13), (39050,), (9763,))

In [71]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)

In [72]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((31240, 13), (7810, 13), (31240,), (7810,))

### Training

In [73]:
from catboost import Pool, FeaturesData, CatBoostClassifier

In [91]:
fit_params = {
               "iterations" : 200, 
               "loss_function" : 'Logloss',
               "depth" : 5,
               "learning_rate" : 0.5,
                "class_weights": cw
}

In [92]:
train_pool = Pool(
                    data=FeaturesData(
                        num_feature_data = np.array(X_train[num_col_list].values, dtype=np.float32),
                        cat_feature_data = np.array(X_train[cat_col_list].values, dtype=object), 
                        num_feature_names = num_col_list, 
                        cat_feature_names = cat_col_list
                    ),
                    label =  y_train
                )

eval_pool = Pool(
                    data=FeaturesData(
                        num_feature_data = np.array(X_val[num_col_list].values, dtype=np.float32),
                        cat_feature_data = np.array(X_val[cat_col_list].values, dtype=object), 
                        num_feature_names = num_col_list, 
                        cat_feature_names = cat_col_list
                    ),
                    label =  y_val
                )

In [93]:
model = CatBoostClassifier(**fit_params, verbose=True) 

In [94]:
model.fit(train_pool, eval_set=eval_pool, verbose_eval=True)

0:	learn: 0.4564488	test: 0.4509093	best: 0.4509093 (0)	total: 130ms	remaining: 25.8s
1:	learn: 0.4096710	test: 0.4030703	best: 0.4030703 (1)	total: 258ms	remaining: 25.5s
2:	learn: 0.3933031	test: 0.3854037	best: 0.3854037 (2)	total: 349ms	remaining: 22.9s
3:	learn: 0.3817885	test: 0.3738028	best: 0.3738028 (3)	total: 477ms	remaining: 23.4s
4:	learn: 0.3771937	test: 0.3696262	best: 0.3696262 (4)	total: 590ms	remaining: 23s
5:	learn: 0.3735396	test: 0.3668621	best: 0.3668621 (5)	total: 667ms	remaining: 21.6s
6:	learn: 0.3653222	test: 0.3589881	best: 0.3589881 (6)	total: 766ms	remaining: 21.1s
7:	learn: 0.3587420	test: 0.3524770	best: 0.3524770 (7)	total: 893ms	remaining: 21.4s
8:	learn: 0.3557617	test: 0.3505441	best: 0.3505441 (8)	total: 996ms	remaining: 21.1s
9:	learn: 0.3544154	test: 0.3492916	best: 0.3492916 (9)	total: 1.08s	remaining: 20.5s
10:	learn: 0.3531732	test: 0.3487588	best: 0.3487588 (10)	total: 1.18s	remaining: 20.2s
11:	learn: 0.3510532	test: 0.3476811	best: 0.3476811 (

94:	learn: 0.3116606	test: 0.3382174	best: 0.3365210 (68)	total: 9.66s	remaining: 10.7s
95:	learn: 0.3114232	test: 0.3381271	best: 0.3365210 (68)	total: 9.78s	remaining: 10.6s
96:	learn: 0.3112876	test: 0.3381721	best: 0.3365210 (68)	total: 9.89s	remaining: 10.5s
97:	learn: 0.3109624	test: 0.3381612	best: 0.3365210 (68)	total: 10s	remaining: 10.4s
98:	learn: 0.3107008	test: 0.3382590	best: 0.3365210 (68)	total: 10.1s	remaining: 10.3s
99:	learn: 0.3103874	test: 0.3381244	best: 0.3365210 (68)	total: 10.2s	remaining: 10.2s
100:	learn: 0.3099429	test: 0.3380634	best: 0.3365210 (68)	total: 10.4s	remaining: 10.2s
101:	learn: 0.3096851	test: 0.3381678	best: 0.3365210 (68)	total: 10.5s	remaining: 10s
102:	learn: 0.3096076	test: 0.3381457	best: 0.3365210 (68)	total: 10.5s	remaining: 9.91s
103:	learn: 0.3092587	test: 0.3381648	best: 0.3365210 (68)	total: 10.6s	remaining: 9.77s
104:	learn: 0.3091086	test: 0.3382782	best: 0.3365210 (68)	total: 10.7s	remaining: 9.67s
105:	learn: 0.3087496	test: 0.3

186:	learn: 0.2912471	test: 0.3418797	best: 0.3360877 (108)	total: 18.9s	remaining: 1.32s
187:	learn: 0.2910323	test: 0.3419584	best: 0.3360877 (108)	total: 19.1s	remaining: 1.22s
188:	learn: 0.2907702	test: 0.3420658	best: 0.3360877 (108)	total: 19.2s	remaining: 1.11s
189:	learn: 0.2905855	test: 0.3422851	best: 0.3360877 (108)	total: 19.3s	remaining: 1.01s
190:	learn: 0.2904476	test: 0.3423360	best: 0.3360877 (108)	total: 19.4s	remaining: 912ms
191:	learn: 0.2903894	test: 0.3423591	best: 0.3360877 (108)	total: 19.5s	remaining: 812ms
192:	learn: 0.2903358	test: 0.3424043	best: 0.3360877 (108)	total: 19.6s	remaining: 709ms
193:	learn: 0.2900926	test: 0.3421005	best: 0.3360877 (108)	total: 19.6s	remaining: 607ms
194:	learn: 0.2899832	test: 0.3422132	best: 0.3360877 (108)	total: 19.7s	remaining: 506ms
195:	learn: 0.2897599	test: 0.3420596	best: 0.3360877 (108)	total: 19.8s	remaining: 404ms
196:	learn: 0.2895235	test: 0.3421630	best: 0.3360877 (108)	total: 19.9s	remaining: 303ms
197:	learn

<catboost.core.CatBoostClassifier at 0x20301b5c160>

In [95]:
index_feature_importance = X_train.columns
df_feature_importance = pd.DataFrame(index=index_feature_importance, data={"out_importance": model.feature_importances_}).sort_values('out_importance', ascending=False)
df_feature_importance

Unnamed: 0,out_importance
education,22.031373
capital-gain,20.172137
age,12.342756
marital-status,8.806234
education-num,8.123282
sex,7.562003
workclass,7.417342
race,5.662713
occupation,2.229643
relationship,1.881925


In [96]:
test_pool = Pool(
                    data=FeaturesData(
                        num_feature_data = np.array(X_test[num_col_list].values, dtype=np.float32),
                        cat_feature_data = np.array(X_test[cat_col_list].values, dtype=object), 
                        num_feature_names = num_col_list, 
                        cat_feature_names = cat_col_list
                    )
                )
y_preds = model.predict(test_pool)

In [97]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_preds)

array([[6059, 1318],
       [ 312, 2074]], dtype=int64)

#### Our model is better at predicting the positive samples (in our data, the class with income >50K)

In [98]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precision = precision_score(y_test, y_preds)
recall = recall_score(y_test, y_preds)

In [99]:
precision

0.6114386792452831

In [100]:
recall

0.8692372170997485

In [101]:
from sklearn.ensemble import RandomForestClassifier

#### To be able to use a RandomForest Model, we need to transform the categorical features.

In [102]:
data_onehot = data.copy()
for i in cat_col_list:
    data_onehot = pd.get_dummies(data_onehot, columns=[i], prefix = [i])

data_onehot.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,13,2174,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


#### To prevent curse of dimensionality, we should apply dimensionality reduction.

In [103]:
from sklearn.decomposition import PCA
pca = PCA(2) 
pca.fit(data_onehot.drop("income",axis=1)) 
pca_data = pd.DataFrame(pca.transform(data_onehot.drop("income",axis=1)))

In [104]:
pca_data.head()

Unnamed: 0,0,1
0,1094.439192,-85.685723
1,-1079.559702,-89.41889
2,-1079.557917,-89.398302
3,-1079.55586,-89.368831
4,-1079.559155,-89.416244


In [105]:
X_train, X_test, y_train, y_test = train_test_split(pca_data, y, test_size = 0.2)

In [106]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=2, random_state=0, class_weight=cw)



RandomForestClassifier(bootstrap=True,
                       class_weight=[0.6573610213316096, 2.0887034659820283],
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators='warn', n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [107]:
y_preds = clf.predict(X_test)

In [108]:
confusion_matrix(y_test, y_preds)

array([[7382,   25],
       [1911,  445]], dtype=int64)

In [109]:
precision = precision_score(y_test, y_preds)
recall = recall_score(y_test, y_preds)

#### CatBoost Model gave better results than RandomForest Model in precision, but RF Model gave much better results in precision.

In [110]:
precision

0.9468085106382979

In [111]:
recall

0.18887945670628184

#### Here, RF Model is better at "not labeling a person's income as >50K when it's <=50K". CatBoost model is better at "labeling a person's income as >50K when it is actually >50K". Therefore, the model selection depends on what side we want to be more accurate on. But if we want to to choose a model which has balance between precision and recall values, we should choose CatBoost Model.