In [None]:
import pandas
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score


tr_features = pandas.read_csv('train_features.csv')
tr_labels = pandas.read_csv('train_labels.csv')

val_features = pandas.read_csv('validation_features.csv')
val_labels = pandas.read_csv('validation_labels.csv')

te_features = pandas.read_csv('test_features.csv')
te_labels = pandas.read_csv('test_labels.csv')[:3256]

census = pandas.read_csv("final_cleaned_census.csv")

In [None]:
features = census.drop([">50k"], axis = 1)
labels = census[">50k"]

feat_train, feat_test, label_train, label_test = train_test_split(features, labels, test_size = 0.4, random_state = 42)
feat_test, feat_val, label_val, label_test = train_test_split(feat_test, label_test, test_size = 0.5, random_state = 42)

In [None]:
for dataset in (feat_train, feat_test, feat_val):
    print(len(dataset)/len(features))

In [133]:
feat_train.to_csv("train_features.csv", index = False)
feat_test.to_csv("test_features.csv", index = False)
feat_val.to_csv("validation_features.csv", index = False)

label_train.to_csv("train_labels.csv", index = False)
label_test.to_csv("test_labels.csv", index = False)
label_val.to_csv("validation_labels.csv", index = False)

In [None]:
census.drop("Unnamed: 0", axis = 1, inplace = True)

In [124]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [123]:
rf = RandomForestClassifier()

test_params = {
    'n_estimators': [5, 10, 100, 200],
    'max_depth': [2, 5, 10, 20, None]
}


cv = GridSearchCV(rf, test_params, cv = 5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 5, 'n_estimators': 200}

0.785 (+/-0.035) for {'max_depth': 2, 'n_estimators': 5}
0.772 (+/-0.019) for {'max_depth': 2, 'n_estimators': 10}
0.763 (+/-0.001) for {'max_depth': 2, 'n_estimators': 100}
0.766 (+/-0.008) for {'max_depth': 2, 'n_estimators': 200}
0.81 (+/-0.008) for {'max_depth': 5, 'n_estimators': 5}
0.812 (+/-0.008) for {'max_depth': 5, 'n_estimators': 10}
0.814 (+/-0.01) for {'max_depth': 5, 'n_estimators': 100}
0.815 (+/-0.009) for {'max_depth': 5, 'n_estimators': 200}
0.807 (+/-0.008) for {'max_depth': 10, 'n_estimators': 5}
0.809 (+/-0.012) for {'max_depth': 10, 'n_estimators': 10}
0.812 (+/-0.009) for {'max_depth': 10, 'n_estimators': 100}
0.81 (+/-0.011) for {'max_depth': 10, 'n_estimators': 200}
0.777 (+/-0.016) for {'max_depth': 20, 'n_estimators': 5}
0.783 (+/-0.008) for {'max_depth': 20, 'n_estimators': 10}
0.794 (+/-0.012) for {'max_depth': 20, 'n_estimators': 100}
0.794 (+/-0.01) for {'max_depth': 20, 'n_estimators': 200}
0.774 (+/-0.

In [154]:
# 0.813 (+/-0.012) for {'max_depth': 10, 'n_estimators': 100}
# 0.812 (+/-0.009) for {'max_depth': 5, 'n_estimators': 200}
# 0.812 (+/-0.012) for {'max_depth': 5, 'n_estimators': 100}

rf1 = RandomForestClassifier(max_depth = 10, n_estimators = 100)
rf1.fit(tr_features, tr_labels.values.ravel())

rf2 = RandomForestClassifier(max_depth = 5, n_estimators = 200)
rf2.fit(tr_features, tr_labels.values.ravel())

rf3 = RandomForestClassifier(max_depth = 5, n_estimators = 100)
rf3.fit(tr_features, tr_labels.values.ravel())

Unnamed: 0,Age,Education_num,Race,Sex,Hours_per_week,Class_ind,Occupation_ind
0,33,15,0,0,50,1,1
1,25,13,0,1,20,1,1
2,36,9,0,0,40,1,1
3,54,3,0,0,35,1,1
4,59,9,0,0,40,1,1
...,...,...,...,...,...,...,...
3252,26,6,0,0,38,1,1
3253,48,14,0,0,55,1,1
3254,47,13,0,1,50,1,1
3255,41,10,0,0,50,1,1


In [157]:
for md in [rf1, rf2, rf3]:
    prediction = md.predict(val_features)[:3256]
    accuracy = round(accuracy_score(val_labels, prediction), 3)
    precision = round(precision_score(val_labels, prediction), 3)
    recall = round(recall_score(val_labels, prediction), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(md.max_depth, md.n_estimators,accuracy,precision,  recall))

MAX DEPTH: 10 / # OF EST: 100 -- A: 0.691 / P: 0.225 / R: 0.123
MAX DEPTH: 5 / # OF EST: 200 -- A: 0.708 / P: 0.238 / R: 0.105
MAX DEPTH: 5 / # OF EST: 100 -- A: 0.708 / P: 0.238 / R: 0.105


In [160]:
y_pred = rf3.predict(te_features)
accuracy = round(accuracy_score(te_labels, y_pred), 3)
precision = round(precision_score(te_labels, y_pred), 3)
recall = round(recall_score(te_labels, y_pred), 3)
print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(rf2.max_depth,
                                                                     rf2.n_estimators,
                                                                     accuracy,
                                                                     precision,
                                                                     recall))

MAX DEPTH: 5 / # OF EST: 200 -- A: 0.717 / P: 0.265 / R: 0.124


In [163]:
te_features

Unnamed: 0,Age,Education_num,Race,Sex,Hours_per_week,Class_ind,Occupation_ind
0,46,9,0,0,2,1,1
1,44,8,0,0,55,1,1
2,18,7,0,0,20,1,1
3,18,10,3,1,16,0,0
4,47,13,0,0,65,1,1
...,...,...,...,...,...,...,...
3251,27,9,0,1,35,1,1
3252,21,10,0,0,40,1,1
3253,31,9,1,0,30,1,1
3254,52,9,0,0,43,1,1


In [None]:
input_categories = census.columns
person_data = {}

for category in input_categories:
    current_data = int(input("{}: ".format(category)))
    person_data[category] = current_data

person_data