# Import everything we need

In [1]:
import pandas as pd
import numpy as np
import pickle

# Widen the pandas text display so wide frames are not wrapped as early
pd.options.display.width = 1000

In [2]:
# Reload the trained model from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
tlo_classifier_file = "models/tlo_lr_classifier_02.18.16.dat"

# Use a context manager so the file handle is closed once the model is loaded
with open(tlo_classifier_file, "rb") as classifier_fh:
    logClassifier = pickle.load(classifier_fh)
print(logClassifier)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=111, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


# Clean the new data

In [3]:
# Load the new records; the first CSV column is used as the index.
# pd.DataFrame.from_csv has been removed from pandas, so read the file with
# pd.read_csv instead. The tupleize_cols / infer_datetime_format keywords are
# removed or deprecated in current pandas and are dropped here, and the index
# is not date-parsed because it holds claim numbers rather than dates.
tlo_data_file = 'data/tlo_check_07_28_15_check_scores_anonymized.csv'
raw_data = pd.read_csv(tlo_data_file,
                       header=0,
                       sep=',',
                       index_col=0)
raw_data.head()

Unnamed: 0_level_0,full_name_check_value,last_name_check_value,ssn_score,dob_score,n1_score,n2_score,n3_score,n4_score,n5_score,n6_score,...,n10_score,n11_score,n12_score,n13_score,n14_score,verified,ssn_match,dob_match,name_match,failure_explanation
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17076,1,1,300,186,,,,,,,...,,,,,,0,1,0,1,DOB
16744,0,0,0,0,7.0,7.0,7.0,7.0,7.0,7.0,...,6.0,6.0,6.0,6.0,6.0,0,0,0,0,SSN DOB NAME
42421,1,1,300,225,,,,,,,...,,,,,,0,1,0,1,DOB
42641,1,1,266,263,,,,,,,...,,,,,,0,0,0,1,SSN DOB
43253,0,0,0,0,27.0,27.0,27.0,27.0,27.0,27.0,...,22.0,22.0,22.0,22.0,22.0,0,0,0,0,SSN DOB NAME


In [4]:
# Normalise the failure explanation text to lowercase
raw_data = raw_data.assign(
    failure_explanation=raw_data['failure_explanation'].str.lower()
)

In [5]:
# Failure Explanations: 'dob', 'name', 'ssn dob name', 'ssn', 'ssn name', 'ssn dob','dob name', nan
# Each known (lowercased) explanation label paired with its integer code.
_FAILURE_EXPLANATION_CODES = (
    ('dob', 0),
    ('name', 1),
    ('ssn dob name', 2),
    ('ssn', 3),
    ('ssn name', 4),
    ('ssn dob', 5),
    ('dob name', 6),
)


def update_failure_explanations(type):
    """Translate a failure-explanation label into its integer code.

    Parameters
    ----------
    type : object
        The value to encode. It is compared with ``==`` against each known
        label in turn. (The name shadows the ``type`` builtin; it is kept
        unchanged so existing callers keep working.)

    Returns
    -------
    int or None
        The code of the first matching label, or ``None`` when nothing
        matches (for example a missing/NaN value or an unexpected label).

    NOTE(review): a later cell fills missing values with 0, which makes
    unmatched inputs indistinguishable from 'dob' (also 0) -- confirm this
    matches how the model was trained.
    """
    return next(
        (code for label, code in _FAILURE_EXPLANATION_CODES if type == label),
        None,
    )

In [6]:
# Swap each failure-explanation label for the code returned by
# update_failure_explanations, then preview the result
explanation_codes = raw_data['failure_explanation'].apply(update_failure_explanations)
raw_data['failure_explanation'] = explanation_codes
raw_data.head()

Unnamed: 0_level_0,full_name_check_value,last_name_check_value,ssn_score,dob_score,n1_score,n2_score,n3_score,n4_score,n5_score,n6_score,...,n10_score,n11_score,n12_score,n13_score,n14_score,verified,ssn_match,dob_match,name_match,failure_explanation
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17076,1,1,300,186,,,,,,,...,,,,,,0,1,0,1,0
16744,0,0,0,0,7.0,7.0,7.0,7.0,7.0,7.0,...,6.0,6.0,6.0,6.0,6.0,0,0,0,0,2
42421,1,1,300,225,,,,,,,...,,,,,,0,1,0,1,0
42641,1,1,266,263,,,,,,,...,,,,,,0,0,0,1,5
43253,0,0,0,0,27.0,27.0,27.0,27.0,27.0,27.0,...,22.0,22.0,22.0,22.0,22.0,0,0,0,0,2


In [7]:
# Replace every missing value in the frame with 0.
# NOTE(review): this also turns a missing failure_explanation into 0, the same
# code that 'dob' maps to -- confirm the model was trained with this encoding.
raw_data = raw_data.fillna(0)
raw_data.head()

Unnamed: 0_level_0,full_name_check_value,last_name_check_value,ssn_score,dob_score,n1_score,n2_score,n3_score,n4_score,n5_score,n6_score,...,n10_score,n11_score,n12_score,n13_score,n14_score,verified,ssn_match,dob_match,name_match,failure_explanation
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17076,1,1,300,186,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
16744,0,0,0,0,7,7,7,7,7,7,...,6,6,6,6,6,0,0,0,0,2
42421,1,1,300,225,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
42641,1,1,266,263,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5
43253,0,0,0,0,27,27,27,27,27,27,...,22,22,22,22,22,0,0,0,0,2


In [8]:
# Reorder the columns for splitting: every feature first, the 'verified'
# target last.
# NOTE(review): presumably this order has to match the feature order the
# classifier was trained on -- verify against the training notebook.
score_cols = ['n{}_score'.format(i) for i in range(1, 15)]  # n1_score .. n14_score
cols = (['full_name_check_value', 'ssn_score', 'dob_score']
        + score_cols
        + ['ssn_match',
           'dob_match',
           'name_match',
           'failure_explanation',
           'last_name_check_value',
           'verified'])
raw_data = raw_data[cols]
raw_data.head()

Unnamed: 0_level_0,full_name_check_value,ssn_score,dob_score,n1_score,n2_score,n3_score,n4_score,n5_score,n6_score,n7_score,...,n11_score,n12_score,n13_score,n14_score,ssn_match,dob_match,name_match,failure_explanation,last_name_check_value,verified
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17076,1,300,186,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
16744,0,0,0,7,7,7,7,7,7,7,...,6,6,6,6,0,0,0,2,0,0
42421,1,300,225,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
42641,1,266,263,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,1,0
43253,0,0,0,27,27,27,27,27,27,27,...,22,22,22,22,0,0,0,2,0,0


In [9]:
# Split the dataset between features and targets.
# The features are selected by dropping the target column by name instead of
# slicing a hard-coded column count, so the split stays correct if feature
# columns are added or removed.
target_col = 'verified'
tlo_data = raw_data.drop(target_col, axis=1).values
tlo_targets = raw_data[target_col].values

# Use the model to create the predictions

In [10]:
# Make a prediction for every row in a single call. predict() expects a 2-D
# (n_samples, n_features) array; passing individual 1-D rows is rejected by
# current scikit-learn releases and is slower than one vectorised call.
predictions = logClassifier.predict(tlo_data)
predictions

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[1]


