# Summary
We now use the best models derived in Chapter 2 and assess them against the test data.





In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler

%matplotlib inline

In [2]:
# preprocessing function - developped in Chapter 1
the_columns  = [('age', 'continuous'), 
                ('class_of_worker', 'nominal'), 
                ('detailed_industry_code', 'nominal'), 
                ('detailed_occupation_code', 'nominal'), 
                ('education', 'nominal'), 
                ('wage_per_hour', 'continuous'), 
                ('enrolled_in_edu_last_week', 'nominal'),
                ('marital_status', 'nominal'),
                ('major_industry_code', 'nominal'),
                ('major_occupation_code', 'nominal'),
                ('race', 'nominal'),
                ('hispanic_origin', 'nominal'),
                ('sex', 'binary'), # binary column with values Male/Female
                ('member_of_labor_union', 'nominal'), 
                ('reason_for_unemployment', 'nominal'),
                ('full_or_part_time_employment_stat', 'nominal'),
                ('capital_gains', 'continuous'),
                ('capital_losses', 'continuous'),
                ('dividends', 'continuous'),
                ('tax_filer', 'nominal'),
                ('region_of_previous_residence', 'nominal'),
                ('state_of_previous_residence', 'nominal'),
                ('detailed_household_family_stat', 'nominal'),
                ('detailed_household_summary', 'nominal'),
                ('instance_weight', 'IGNORE'), # as per instructions, to be dropped
                ('migration_code_change_in_msa', 'nominal'),
                ('migration_code_change_in_reg', 'nominal'),
                ('migration_code_move_within_reg', 'nominal'),
                ('live_in_this_house_1_yr_ago', 'nominal'),
                ('migration_prev_res_in_sunbelt', 'nominal'),
                ('num_persons_worked_for_employer', 'continuous'),
                ('family_members_under_18', 'nominal'),
                ('cob_father', 'nominal'),
                ('cob_mother', 'nominal'),
                ('cob_self', 'nominal'),
                ('citizenship', 'nominal'),
                ('own_business_or_self_employed', 'nominal'),
                ('fill_in_questionnaire_for_veterans_admin', 'nominal'),
                ('veterans_benefits', 'nominal'),
                ('weeks_worked_in_year', 'nominal'),
                ('year', 'nominal'), 
                ('savings','target')] # binary TARGET variable


In [3]:
def preprocessData(file_name):
    # the_columns stores tuples of (column_name and tag for continuous/nominal/binary/target)
    
    raw_data = pd.read_csv(file_name, names=[c[0] for c in the_columns], index_col=False)
    original_shape = raw_data.shape
    
    raw_data.drop('instance_weight', axis=1, inplace=True)
    #the_columns.remove(('instance_weight', 'IGNORE'))
    
    # find the duplicate rows, keep the first one
    duplicate_rows = raw_data.duplicated(keep='first')
    
    print 'number of duplicates = {:d}'.format(duplicate_rows.sum())
    raw_data = raw_data.drop_duplicates(keep='first')
    new_shape =  raw_data.shape
    print 'number of duplicates removed = {:d}'.format(original_shape[0] - new_shape[0])
    print 'original shape = {:d}, {:d}'.format(original_shape[0], original_shape[1])
    print 'new shape = {:d}, {:d}'.format(raw_data.shape[0], raw_data.shape[1])
    
    # convert nominal columns (object dtype) to integer type
    data = pd.DataFrame(raw_data.select_dtypes(include=['object']))
    object_columns = data.columns
    
    for column in object_columns:
        unique_values = data[column].unique()
        dictionary = {key:idx for idx,key in enumerate(unique_values)}
        data[column] = data[column].apply(lambda x : dictionary[x])
    
    # add nominal columns that were already in integer format 
    nominal_integer_columns = [c[0] for c in the_columns 
                               if c[1] == 'nominal' and c[0] not in data.columns]
    data[nominal_integer_columns] = raw_data[nominal_integer_columns]
    
    # convert 'sex', and 'savings' columns to binary; add year column
    data['savings'] = raw_data['savings'].map(lambda x: 
                                              1 if str(x).strip() == '50000+.' else 0)
    data['sex'] = raw_data['sex'].map(lambda x: 
                                      1 if str(x).strip() == 'Male' else 0)
    data['year'] = raw_data['year']
    
    # add continuous columns
    continuous_columns = [c[0] for c in the_columns if c[1] == 'continuous']
    data[continuous_columns] = raw_data[continuous_columns]
    
    # verify that we aren't missing any columns
    assert set(data.columns) == (set(raw_data.columns))
    
    text = 'The final processed data has {:,d} rows and {:d} columns.\n'
    print text.format(data.shape[0], data.shape[1])
    return data


In [4]:
train_data = preprocessData('us_census_full/census_income_learn.csv')

number of duplicates = 46627
number of duplicates removed = 46627
original shape = 199523, 42
new shape = 152896, 41
The final processed data has 152,896 rows and 41 columns.



In [5]:
test_data = preprocessData('us_census_full/census_income_test.csv')

number of duplicates = 20898
number of duplicates removed = 20898
original shape = 99762, 42
new shape = 78864, 41
The final processed data has 78,864 rows and 41 columns.



## Class Imbalance
We found in Chapter 1 that the training data exhibits some degree of class imbalance where 91.2% of persons had savings less than 50K. The test set has approximately the same degree of class imbalance, calculated below at 91.5%. 

In [6]:
n = test_data[test_data['savings'] == 1]['savings'].count()
m = test_data[test_data['savings'] == 0]['savings'].count()
imbalance = (1.0 - float(n)/m)*100
print '\nRecords with savings > 50K: {:8,d}'.format(n)
print 'Records with savings < 50K: {:8,d}'.format(m)
print 'The class imbalance is {:.4f}% or approximately {:d} to 1.\n'.format(imbalance, m/n)


Records with savings > 50K:    6,186
Records with savings < 50K:   72,678
The class imbalance is 91.4885% or approximately 11 to 1.



In [7]:
def print_confusion_matrix(y_true, y_pred):
    header = '\t          prediction 0    prediction 1'
    row0 =   '\tclass 0 {:11,d} {:14,d}'
    row1 =   '\tclass 1 {:11,d} {:14,d}'
    cm = confusion_matrix(y_true, y_pred)
    print header
    print row0.format(cm[0,0], cm[0,1])
    print row1.format(cm[1,0], cm[1,1])

In [8]:
print_confusion_matrix(test_data.loc[:,'savings'], test_data.loc[:,'savings'])

	          prediction 0    prediction 1
	class 0      72,678              0
	class 1           0          6,186


## Training and Test Data

In [9]:
X_train = train_data.drop('savings', axis=1)
y_train = train_data.loc[:,'savings']

In [10]:
X_test = test_data.drop('savings', axis=1)
y_test = test_data.loc[:,'savings']

## Best Random Forest Scored by Accuracy

best_features_for_accuracy = {'max_features': 'sqrt', 
                              'min_samples_split': 2, 
                              'n_estimators': 200, 
                              'max_depth': None, 
                              'class_weight': None}

In [11]:
rf_acc = RandomForestClassifier(max_features='sqrt',
                                min_samples_split=2, 
                                n_estimators=200, 
                                max_depth=None, 
                                class_weight=None, 
                                n_jobs=2)

rf_acc.fit(X_train, y_train)

y_pred = rf_acc.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      72,449            229
	class 1       4,907          1,279

Classification report:
             precision    recall  f1-score   support

          0    0.93657   0.99685   0.96577     72678
          1    0.84814   0.20676   0.33247      6186

avg / total    0.92963   0.93488   0.91609     78864



## Best Random Forest Scored By Precision

best_features_for_precision = {'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200, 'max_depth': 6, 'class_weight': None}

In [12]:
rf_pre = RandomForestClassifier(max_features='sqrt',
                                min_samples_split=2, 
                                n_estimators=200, 
                                max_depth=6, 
                                class_weight=None, 
                                n_jobs=2)

rf_pre.fit(X_train, y_train)

y_pred = rf_pre.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      72,631             47
	class 1       5,611            575

Classification report:
             precision    recall  f1-score   support

          0    0.92829   0.99935   0.96251     72678
          1    0.92444   0.09295   0.16892      6186

avg / total    0.92798   0.92826   0.90026     78864



## Best Random Forest Scored By Recall

best_features_for_recall = {'max_features': 40, 'min_samples_split': 2, 'n_estimators': 200, 'max_depth': 6, 'class_weight': {1: 5}}

In [13]:
rf_rec = RandomForestClassifier(max_features=40,
                                min_samples_split=2, 
                                n_estimators=200, 
                                max_depth=6, 
                                class_weight={1:5}, 
                                n_jobs=2)

rf_rec.fit(X_train, y_train)

y_pred = rf_rec.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      66,540          6,138
	class 1       1,899          4,287

Classification report:
             precision    recall  f1-score   support

          0    0.97225   0.91555   0.94305     72678
          1    0.41122   0.69302   0.51616      6186

avg / total    0.92825   0.89809   0.90956     78864



## Random Forest and Under-Sampling

The best parameters are:
#{'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200, 'max_depth': 6, 'class_weight': {1: 5}}



In [14]:
under_sample = RandomUnderSampler()
imb_x, imb_y = under_sample.fit_sample(X_train, y_train)

rf_us = RandomForestClassifier(max_features='sqrt',
                               min_samples_split=2, 
                               n_estimators=200, 
                               max_depth=6, 
                               class_weight={1:5}, 
                               n_jobs=2)

rf_us.fit(imb_x, imb_y)

y_pred = rf_us.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      31,265         41,413
	class 1         110          6,076

Classification report:
             precision    recall  f1-score   support

          0    0.99649   0.43019   0.60094     72678
          1    0.12795   0.98222   0.22640      6186

avg / total    0.92837   0.47349   0.57156     78864



## Random Forest and Tomek-Links

best parameters = 
{'max_features': 40, 'min_samples_split': 2, 'n_estimators': 200, 'max_depth': 6, 'class_weight': {1: 5}}

In [15]:
tlk = TomekLinks()

tlk_x, tlk_y = tlk.fit_sample(X_train, y_train)

rf_tlk = RandomForestClassifier(n_estimators = 200, max_depth=6, 
                                max_features=40, min_samples_split=2, 
                                class_weight={1:5}, n_jobs=2)

rf_tlk.fit(tlk_x, tlk_y)

y_pred = rf_tlk.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      66,331          6,347
	class 1       1,855          4,331

Classification report:
             precision    recall  f1-score   support

          0    0.97280   0.91267   0.94177     72678
          1    0.40560   0.70013   0.51364      6186

avg / total    0.92830   0.89600   0.90819     78864



## Random Forest and Over-Sampling

best parameters = 
'max_features': 40, 'min_samples_split': 2, 'n_estimators': 200, 'max_depth': None, 'class_weight': None

In [16]:
ros = RandomOverSampler()
ros_x, ros_y = ros.fit_sample(X_train, y_train)

rf_os = RandomForestClassifier(n_estimators = 200, max_depth=None, 
                               max_features=40, min_samples_split=2,
                               class_weight=None, n_jobs=2)

rf_os.fit(ros_x, ros_y)

y_pred = rf_os.predict(X_test)

print 'Confusion matrix:'
print_confusion_matrix(y_test, y_pred)
print '\nClassification report:'
print classification_report(y_test, y_pred, digits=5)


Confusion matrix:
	          prediction 0    prediction 1
	class 0      71,912            766
	class 1       4,622          1,564

Classification report:
             precision    recall  f1-score   support

          0    0.93961   0.98946   0.96389     72678
          1    0.67124   0.25283   0.36731      6186

avg / total    0.91856   0.93168   0.91710     78864

