### Section 1. Read Data into Pandas Data Frames

In [1]:
import pandas as pd

# Locations of the input CSV files -- change these to your local directory.
trainFile = "adult_train.csv"
testFile = "adult_test.csv"

# Read the training file first, then the testing file, with identical options:
# comma-separated values, column names taken from the first row.
trainData, testData = (
    pd.read_csv(csv_path, sep=",", header=0) for csv_path in (trainFile, testFile)
)

# Leave the first rows of the training frame as the cell's displayed output.
trainData.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Get X and Y from Training and Testing Data

In [2]:
import category_encoders as ce

# Positional split, applied identically to both frames: the first 14 columns
# are the features and the column at position 14 is the target.
X1, X2 = trainData.iloc[:, :14], testData.iloc[:, :14]
Y1, Y2 = trainData.iloc[:, 14], testData.iloc[:, 14]

# Stack the training rows on top of the testing rows so that one encoder is
# fitted on both; the two encoded matrices are then row slices of one array.
frames = [X1, X2]
X = pd.concat(frames)

le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore")
X_encoded = le.fit_transform(X)

# Undo the stacking: the first len(X1) rows belong to the training set and
# every remaining row belongs to the testing set.
n_train = len(X1)
X_encoded_train = X_encoded[:n_train, :]
X_encoded_test = X_encoded[n_train:, :]

### Section 2. Train the Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random forest hyper-parameters.
nTrees = 100        # passed as n_estimators
max_depth = 5       # passed as max_depth
min_node_size = 5   # passed as min_samples_leaf
verbose = 0         # passed as verbose

# Build the forest with a fixed seed and fit it on the encoded training rows
# (fit returns the estimator itself, so the chained call keeps `clf` fitted).
clf = RandomForestClassifier(
    n_estimators=nTrees,
    max_depth=max_depth,
    min_samples_leaf=min_node_size,
    random_state=0,
    verbose=verbose,
).fit(X_encoded_train, Y1)

# One importance value per encoded feature column.
print(clf.feature_importances_)

[3.88591889e-02 5.20675248e-05 8.96685261e-04 9.28523783e-04
 7.99485613e-04 1.24533629e-04 3.28126376e-04 5.86441202e-03
 0.00000000e+00 0.00000000e+00 4.60659940e-04 2.18276983e-02
 1.02723034e-02 2.01304682e-03 1.89384862e-02 2.71629574e-05
 8.40704341e-04 5.80420424e-05 2.61565160e-05 8.26822443e-04
 3.11280319e-03 7.67655367e-03 3.60425027e-04 1.45989349e-04
 1.59874295e-04 2.00819300e-05 5.05964185e-06 1.03784527e-01
 7.64147707e-02 1.72755570e-01 7.22704036e-03 3.76093842e-05
 7.99188415e-04 6.94767440e-05 2.56165347e-04 1.52857956e-03
 2.38713738e-02 5.93685408e-04 2.34645371e-02 4.14910059e-03
 2.72443666e-04 6.34313559e-04 1.66054278e-04 1.73845967e-03
 4.68559864e-04 4.26550102e-04 6.67499855e-04 8.30726479e-05
 0.00000000e+00 5.15318408e-06 2.92772717e-02 1.30207265e-01
 1.75048848e-02 2.25852882e-02 7.72468338e-03 5.68153439e-04
 1.21576982e-03 4.61293429e-04 1.40603786e-04 5.28828044e-05
 1.74367292e-05 2.39455838e-02 1.81441507e-02 1.51833802e-01
 3.05301514e-02 2.999402

### Predict the Testing Data and Evaluate (Random Forest)

In [4]:
import numpy as np

# Predict a label for every encoded testing row with the random forest.
Y_test_hat = clf.predict(X_encoded_test)

# Compare predictions with the true labels position by position and count hits.
n_correct = np.sum(np.asarray(Y2) == Y_test_hat)

# Share of correct predictions, as a percentage rounded to two decimals.
Accuracy = round(float(n_correct) / len(Y_test_hat) * 100, 2)
print("Accuracy on Testing Data = %.2f%%"%Accuracy)

Accuracy on Testing Data = 83.91%


### Section 3. Train the Gradient Boosted Decision Trees Model

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting hyper-parameters.
nTrees = 100           # passed as n_estimators
max_depth = 5          # passed as max_depth
min_node_size = 5      # passed as min_samples_leaf
verbose = 0            # passed as verbose
learning_rate = 0.05   # passed as learning_rate

# `loss` is intentionally left at its default. The old spelling 'deviance' was
# deprecated in scikit-learn 1.1 and removed in 1.3 (renamed to 'log_loss'),
# so passing it explicitly breaks on current releases. The default selects the
# same loss under either name, so the cell runs on old and new versions alike.
#
# random_state is fixed (as for the random forest) so that a re-run reproduces
# the same model; values may differ slightly from output recorded without a seed.
gbm_clf = GradientBoostingClassifier(
    n_estimators=nTrees,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_samples_leaf=min_node_size,
    random_state=0,
    verbose=verbose,
)
gbm_clf.fit(X_encoded_train, Y1)

# One importance value per encoded feature column.
print(gbm_clf.feature_importances_)

[5.42201582e-02 6.88439696e-05 5.35416177e-03 3.60576568e-04
 2.07578129e-03 1.17409449e-03 1.82800014e-05 5.69301701e-04
 0.00000000e+00 0.00000000e+00 5.41859106e-03 6.74155563e-04
 1.00345407e-04 0.00000000e+00 3.54133036e-04 0.00000000e+00
 1.12505730e-04 4.38189778e-06 6.10943699e-05 0.00000000e+00
 3.63715038e-04 2.70139994e-04 0.00000000e+00 1.02637365e-04
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.97920653e-01
 8.22246609e-04 3.80491791e-01 2.77608240e-05 3.66837707e-06
 2.01797677e-05 6.91117567e-04 1.49745083e-05 4.34720243e-04
 1.62872011e-02 3.52132487e-04 8.62100835e-03 4.83439327e-03
 1.72995050e-03 3.08963762e-05 2.21652185e-04 4.76074551e-03
 8.38525992e-05 2.87937086e-03 3.35663879e-05 7.76673765e-04
 0.00000000e+00 0.00000000e+00 5.58128228e-04 7.69094889e-04
 4.48240507e-03 3.21519325e-04 2.86241179e-04 1.84133879e-04
 5.22238971e-04 2.80660872e-04 4.30762281e-05 7.40982153e-05
 0.00000000e+00 1.89649387e-03 1.57782654e-03 1.99080797e-01
 6.26461621e-02 3.359601

### Predict the Testing Data and Evaluate (Gradient Boosted Decision Trees)

In [6]:
import numpy as np

# Predict a label for every encoded testing row with the boosted model.
Y_test_hat = gbm_clf.predict(X_encoded_test)

# Compare predictions with the true labels position by position and count hits.
n_correct = np.sum(np.asarray(Y2) == Y_test_hat)

# Share of correct predictions, as a percentage rounded to two decimals.
Accuracy = round(float(n_correct) / len(Y_test_hat) * 100, 2)
print("Accuracy on Testing Data = %.2f%%"%Accuracy)

Accuracy on Testing Data = 87.26%
