In [1]:
# Income prediction using files from pm-se-onboarding-nick (git user nmball42)
# Uploaded to new public repo income-prediction on nmball42
# Same procedure can be used for any other new project if can be in public repo
# Run in Oracle ADSCS Py2.7 Standard environment on demo-next.datascience.com

# Data from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.{data,test}.txt
# In ~/Oracle/data/income/ on laptop
# Renamed there as income_{train,test}.csv
# Data uploaded to project repo files so can access from this notebook

# Nov 08th 2018

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

In [3]:
data_dir = ''
col_names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','yearly-income']

In [4]:
data_train = pd.read_csv(data_dir + 'income_train.csv', names = col_names)

In [5]:
data_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,yearly-income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
data_test = pd.read_csv(data_dir + 'income_test.csv', names = col_names, skiprows = 1)

In [7]:
### Remove period in last col of test ###

In [8]:
data_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,yearly-income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K.
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.


In [9]:
### Make some plots ###

In [10]:
# Separate features and targets to pass to ML model
train = data_train.drop('yearly-income',axis=1)
labels = data_train['yearly-income']

In [11]:
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States


In [12]:
labels

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
5         <=50K
6         <=50K
7          >50K
8          >50K
9          >50K
10         >50K
11         >50K
12        <=50K
13        <=50K
14         >50K
15        <=50K
16        <=50K
17        <=50K
18        <=50K
19         >50K
20         >50K
21        <=50K
22        <=50K
23        <=50K
24        <=50K
25         >50K
26        <=50K
27         >50K
28        <=50K
29        <=50K
          ...  
32531     <=50K
32532      >50K
32533      >50K
32534     <=50K
32535     <=50K
32536      >50K
32537     <=50K
32538      >50K
32539      >50K
32540     <=50K
32541     <=50K
32542     <=50K
32543     <=50K
32544     <=50K
32545      >50K
32546     <=50K
32547     <=50K
32548     <=50K
32549     <=50K
32550     <=50K
32551     <=50K
32552     <=50K
32553     <=50K
32554      >50K
32555     <=50K
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: yearly-income, Len

In [13]:
# One-hot encode training data: input has to be numerical array
train_encoded = pd.get_dummies(train)

In [14]:
train_encoded

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,37,284582,14,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,49,160187,5,0,0,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,52,209642,9,0,0,45,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,31,45781,14,14084,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,42,159449,13,5178,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Generic ML model from scikit examples
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [16]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
print(clf.feature_importances_)

[ 0.14205973  0.76664038  0.0282433   0.06305659]


In [18]:
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [19]:
# Run ML on income dataset

In [20]:
# Based on Build_model.ipynb in demo-next.datascience.com Optimizing Investment Strategy project
# Ref: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
clf = RandomForestClassifier()

# Specify hyperparameter space to optimize
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [21]:
# Run randomized grid search
# Ref: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search)
random_search.fit(train_encoded,labels) # This line builds the model

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'bootstrap': [True, False], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f93da7c0ed0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f93db646110>, 'criterion': ['gini', 'entropy'], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f93da7c0c90>, 'max_depth': [3, None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=

In [22]:
# See model information
random_search.cv_results_ # Etc.

{'mean_fit_time': array([ 0.10392865,  0.39507596,  0.12789551,  0.0882163 ,  0.15210128,
         0.33685033,  0.24483864,  0.23776571,  0.14019608,  0.3299617 ]),
 'mean_score_time': array([ 0.02656221,  0.03553367,  0.06596073,  0.02702459,  0.03125906,
         0.03741511,  0.03392537,  0.03361225,  0.02477272,  0.03737807]),
 'mean_test_score': array([ 0.78676945,  0.85967876,  0.84426154,  0.76115598,  0.8014803 ,
         0.85749823,  0.8523694 ,  0.85427352,  0.79503087,  0.85918737]),
 'mean_train_score': array([ 0.78627794,  0.87091919,  0.85109481,  0.76163204,  0.8043977 ,
         0.883296  ,  0.8602777 ,  0.86075368,  0.79530724,  0.88410986]),
 'param_bootstrap': masked_array(data = [True False True False False False False True False False],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_criterion': masked_array(data = ['gini' 'entropy' 'gini' 'gini' 'entropy' 'entropy' 'entropy' 'entropy'
  'entropy' 

In [None]:
### Score model on testing set ###

In [None]:
### Importances, PDP plots are properties of RF not CV estimator ###