In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
%matplotlib inline




In [2]:
df = pd.read_csv('adult_data.csv')
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,6,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,2,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,1,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,1,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,1,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df['education'] = df['education'].map( {' ?': 0, ' Bachelors': 1, ' Some-college': 2, ' 11th': 3, ' HS-grad': 4, ' Prof-school': 5, ' Assoc-acdm': 6, ' Assoc-voc': 7, ' 9th': 8, ' 7th-8th': 9, ' 12th': 10, ' Masters': 11, ' 1st-4th': 12, ' 10th': 13, ' Doctorate': 14, ' 5th-6th': 15, ' Preschool': 16  } )
df['martial-status'] = df['martial-status'].map( {' ?': 0, ' Married-civ-spouse': 1, ' Divorced': 2, ' Never-married': 3, ' Separated': 4, ' Widowed': 5, ' Married-spouse-absent': 6, ' Married-AF-spouse': 7 } )
df['relationship'] = df['relationship'].map( {' ?': 0, ' Wife': 1, ' Own-child': 2, ' Husband': 3, ' Not-in-family': 4, ' Other-relative': 5, ' Unmarried': 6 } )
df['race'] = df['race'].map( {' ?': 0, ' White': 1, ' Asian-Pac-Islander': 2, ' Amer-Indian-Eskimo': 3, ' Other': 4, ' Black': 5 } )
df['IsMale'] = df['sex'].map( {' Female': 0, ' Male': 1} )
df['occupation'] = df['occupation'].map( {' ?': 0, ' Craft-repair': 1, ' Other-service': 2, ' Sales': 3, ' Exec-managerial': 4, ' Prof-specialty': 5, ' Handlers-cleaners': 6, ' Machine-op-inspct': 7, ' Adm-clerical': 8, ' Farming-fishing': 9,' Transport-moving': 10, ' Priv-house-serv': 11, ' Protective-serv': 12, ' Armed-Forces': 13, ' Tech-support': 14} ) 
df['IsSalary_Greaterthan50'] = df['salary'].map( {' <=50K': 50000, ' >50K': 50001} )

In [7]:
y = df.IsSalary_Greaterthan50
X = df.drop(['fnlwgt', 'education-num', 'race', 'native-country', 'sex', 'salary'], axis = 1)
X.head(3)

Unnamed: 0,age,workclass,education,martial-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,IsMale,IsSalary_Greaterthan50
0,39,6,1,3,8,4,2174,0,40,1,50000
1,50,2,1,1,4,3,0,0,13,1,50000
2,38,1,4,2,6,4,0,0,40,1,50000


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

In [10]:
model.score(X_test, y_test)

1.0

In [None]:
cv = KFold(n_splits=10)

In [None]:
for estimators in range(10, 1000, 100):
    model = RandomForestClassifier(n_estimators=estimators)
    scores = []
    for train_i, test_i in cv.split(X):
        Xr, yr, Xt, yt = X.loc[train_i], y.loc[train_i], X.loc[test_i], y.loc[test_i]
        model.fit(Xr, yr)
        scores.append(model.score(Xt, yt))
    print('estimators:', estimators, 'scores:', sum(scores)/len(scores)) 

estimators: 10 scores: 1.0
estimators: 110 scores: 1.0
estimators: 210 scores: 1.0
estimators: 310 scores: 1.0
estimators: 410 scores: 1.0
