In [2]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# NOTE(review): absolute, machine-specific path; presumably added so that the
# project-local modules imported below resolve. Confirm, and prefer a path
# relative to the notebook so the cell runs on other machines.
sys.path.append('/Users/danielruiz/Dropbox/Grad School/Fall 2018/CS273A/Project/')
import mltools as ml
import data_loader
import warnings
# Suppresses every warning category for the rest of the session, which also
# hides library deprecation / convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Read the train/validation frames and the separate test frame with the
# provided loader (is_df=True requests DataFrames; valid_rate=0.1 presumably
# reserves 10% of the training file for validation).
train_data, valid_data = data_loader.load_train_data(
    'Data/adult.data.txt',
    valid_rate=0.1,
    is_df=True,
)
test_data = data_loader.load_test_data(
    'Data/adult.test.txt',
    is_df=True,
)

In [4]:
# Column labels, assigned positionally to all three frames.
# The final label, 'income', is the prediction target rather than a feature.
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
for frame in (train_data, valid_data, test_data):
    frame.columns = features

# Discard the columns that are not used downstream (fnlwgt, education)
train_data, valid_data, test_data = (
    frame.drop(columns=['fnlwgt', 'education'])
    for frame in (train_data, valid_data, test_data)
)

In [11]:
# Numeric columns and their preprocessing pipeline.
# The pipeline currently holds a single standardisation step.
num_features = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
num_transformer = Pipeline([('scaler', StandardScaler())])

In [46]:
# Categorical columns and their preprocessing pipeline:
#   1. treat the ' ?' placeholder as missing and fill it with the column's
#      most frequent value,
#   2. one-hot encode, ignoring categories not seen during fitting.
cat_features = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=' ?', strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [47]:
# Route each column group through its own pipeline. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [48]:
# Chain the preprocessor and a logistic-regression classifier into a single
# estimator so that fitting and scoring apply the same transformations.
full_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

In [49]:
# Convert the target column to binary labels: 1 for '>50K', 0 for '<=50K'.
# Only the 'income' column is rewritten; a whole-frame replace would scan (and
# could alter) every other column as well.
# NOTE(review): the UCI adult.test file writes its labels with a trailing
# period (' >50K.' / ' <=50K.'), which an exact-match replace on ' >50K' never
# converts. Both spellings are mapped here; this is a no-op if the loader
# already strips the period.
def _encode_income(frame):
    """Return a copy of `frame` with the 'income' label strings replaced by 0/1.

    Values that match none of the known label spellings (including labels that
    are already 0/1) are left untouched, so re-running the cell is harmless.
    """
    label_to_int = {' >50K': 1, ' <=50K': 0, ' >50K.': 1, ' <=50K.': 0}
    return frame.assign(income=frame['income'].replace(label_to_int))


train_data = _encode_income(train_data)
valid_data = _encode_income(valid_data)
test_data = _encode_income(test_data)

In [50]:
# Separate the target column (Y) from the predictor columns (X)
Y = train_data['income']
X = train_data.drop(columns=['income'])

In [51]:
# Hold out 20% of the training frame for evaluation. Note that X_test / Y_test
# are carved out of train_data; they are not the separate test_data frame.
# A fixed random_state makes the split, and therefore the reported score,
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [52]:
# Fit the preprocessing steps and the classifier on the training portion
full_pipe.fit(X=X_train, y=Y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [54]:
# Report the pipeline's score on the held-out 20% split
print(f'Model Score: {full_pipe.score(X_test, Y_test)}')

Model Score: 0.8548524646085621


In [89]:
# Per-column NaN counts, limited to the columns that contain at least one NaN.
# NOTE(review): train_data_nan is not defined in any cell visible here; confirm
# the cell that builds it still exists, otherwise this fails on a fresh kernel.
null_columns = train_data_nan.columns[train_data_nan.isna().any()]
train_data_nan.loc[:, null_columns].isna().sum()

workclass         1662
occupation        1669
native-country     498
dtype: int64

In [90]:
# Keep only the rows that have no NaN in any column
tr_clean = train_data_nan.dropna(axis=0, how='any')

In [91]:
# Summary statistics for the numeric columns of the NaN-free frame
tr_clean.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,27175.0,27175.0,27175.0,27175.0,27175.0,27175.0
mean,38.430874,189422.4,10.111279,1111.609457,89.200626,40.931923
std,13.117079,105888.4,2.554387,7503.11821,405.93624,11.942297
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117363.0,9.0,0.0,0.0,40.0
50%,37.0,177955.0,10.0,0.0,0.0,40.0
75%,47.0,236993.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0
