In [12]:
import numpy as np
import pandas as pd
import urllib.request
from sklearn.preprocessing import Imputer

# UCI Adult Dataset - https://archive.ics.uci.edu/ml/datasets/Adult
data_train = pd.read_csv("adult.train.csv", delimiter=",", header=None)
data_test = pd.read_csv("adult.test.csv", delimiter=",", header=None)
print(data_train.shape, data_test.shape)

(32561, 15) (16281, 15)


In [11]:
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
data_train.columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
                      "marital_status", "occupation", "relationship", "race", "sex", 
                      "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
data_test.columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
                     "marital_status", "occupation", "relationship", "race", "sex", 
                     "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]

In [16]:
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [18]:
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [190]:
class Dataset:
    'Common base class for all datasets'
    import numpy as np
    import pandas as pd, os
    from sklearn.feature_extraction import DictVectorizer

    def __init__(self, data, target_names=[], feature_names=[], yPos="last", yTransform=None):
        nCol = data.shape[1]
        self.data = data.ix[:, range(nCol-1)]
        self.target = data.ix[:, [nCol-1]]
        self.target = self.target.iloc[:].applymap(yTransform)
        self.target_names = target_names
        self.feature_names = feature_names
   
    def shape(self):
        print("X: ", self.data.shape)
        print("y: ", self.target.shape)
        print("Target Classes: ", len(self.target_names))
        print("No of Features: ", len(self.feature_names), "\n")
    
    def sample(self):
        print("X:\n", self.data.head())
        print("y:\n", self.target.head())
        
    #http://stackoverflow.com/questions/15021521/how-to-encode-a-categorical-variable-in-sklearn
    def one_hot_dataframe(self, cols, replace=False):
        vec = DictVectorizer()
        mkdict = lambda row: dict((col, row[col]) for col in cols)
        vecData = pd.DataFrame(vec.fit_transform(self.data[cols].apply(mkdict, axis=1)).toarray())
        vecData.columns = vec.get_feature_names()
        vecData.index = self.data.index
        if replace is True:
            self.data = self.data.drop(cols, axis=1)
            self.data = self.data.join(vecData)
        return (1)
    
    def feature_engineer(self):
        self.one_hot_dataframe( 
                              ['workclass', 'education', 'marital_status',
                              'occupation', 'relationship', 'race', 'sex',
                              'native_country'], replace=True)
        return(1)


In [199]:
train = Dataset(data_train, yTransform = lambda x: 1 if (x == ' >50K') else 0)
test = Dataset(data_test, yTransform = lambda x: 1 if (x == ' >50K') else 0)
train.shape()
test.shape()
train.sample()
test.sample()

X:  (32561, 14)
y:  (32561, 1)
Target Classes:  0
No of Features:  0 

X:  (16281, 14)
y:  (16281, 1)
Target Classes:  0
No of Features:  0 

X:
    age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Femal

In [217]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, cross_val_score
import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

# https://github.com/paulgb/sklearn-pandas
train.feature_engineer()
test.feature_engineer()
#set(train.data.columns) - set(test.data.columns)
test.data[list(set(train.data.columns) - set(test.data.columns))[0]] = 0
print(train.data.shape, test.data.shape)

KeyError: "['workclass' 'education' 'marital_status' 'occupation' 'relationship'\n 'race' 'sex' 'native_country'] not in index"

In [218]:
test.data.shape

(16281, 108)

In [225]:
from sklearn.metrics import classification_report

model = RandomForestClassifier()
model.fit(train.data, train.target)
y_pred = model.predict(test.data)

classification_report(test.target.as_matrix(), y_pred)

# display the relative importance of each attribute
#print(model.feature_importances_)

  'recall', 'true', average, warn_for)


'             precision    recall  f1-score   support\n\n          0       1.00      0.92      0.96     16281\n          1       0.00      0.00      0.00         0\n\navg / total       1.00      0.92      0.96     16281\n'