In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

In [3]:
df = pd.read_csv("census.csv", skipinitialspace = True, skiprows=[0])
df.columns = ["age",
"workclass",
"fnlwgt",
"education",
"education-num",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"capital-gain",
"capital-loss",
"hours-per-week",
"native-country",
"income"]
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
df = pd.get_dummies(df)
df.head(3)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_<=50K,income_>50K
0,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
y = df["income_>50K"]

In [10]:
y.value_counts()

0    24718
1     7841
Name: income_>50K, dtype: int64

In [12]:
X = df.loc[:, :"native-country_Yugoslavia"]
X.head(3)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
count,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,...,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0,32559.0
mean,38.581283,189785.1,10.0805,1077.648269,87.309193,40.438312,0.05639,0.029485,0.064283,0.000215,...,0.001136,0.003501,0.000369,0.002457,0.001566,0.000553,0.000584,0.895851,0.002058,0.000491
std,13.640705,105549.7,2.572698,7385.514002,402.972014,12.346871,0.230677,0.169164,0.245261,0.014661,...,0.033692,0.059069,0.019195,0.049509,0.039547,0.023506,0.02415,0.305459,0.045317,0.022163
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,117833.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,37.0,178370.0,10.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,48.0,237058.0,12.0,0.0,0.0,45.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model = RandomForestClassifier(n_estimators = 100, n_jobs = 20).fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.99995415788
0.856212191717


In [22]:
model.feature_importances_

array([  1.47783976e-01,   1.55931812e-01,   5.98496368e-02,
         9.04136323e-02,   3.27647734e-02,   8.31762506e-02,
         2.05873928e-03,   5.43725263e-03,   6.25298900e-03,
         7.37919791e-07,   1.15220004e-02,   6.82748025e-03,
         8.92799156e-03,   4.69786383e-03,   7.24908392e-05,
         1.41126412e-03,   1.83377935e-03,   9.73452539e-04,
         3.08557301e-04,   5.98298276e-04,   1.68956142e-03,
         1.22887066e-03,   2.74819572e-03,   2.98729819e-03,
         1.25201982e-02,   4.02687986e-03,   8.36915683e-03,
         9.31513115e-03,   4.11211691e-05,   6.10639717e-03,
         5.75703724e-03,   6.18883190e-03,   1.88523288e-04,
         6.53007257e-02,   8.10887444e-04,   2.45583745e-02,
         1.47582664e-03,   1.75687603e-03,   2.12499298e-03,
         5.84885877e-03,   1.42026052e-05,   7.19725152e-03,
         1.88091826e-02,   4.87232655e-03,   3.43418350e-03,
         4.27917699e-03,   7.46296655e-03,   9.34263047e-05,
         1.53879931e-02,