## Imports 

In [1]:
#imports
import pandas as pd
import numpy as np
import random
import re
import recordlinkage
import time
import matplotlib.pyplot as plt

# ML imports 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer

# prevent depreciation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

## Read in, assign a unique identifier via the index, set to dates

In [2]:
# read in our PreMatrix csv from step A
# for violations 
preMatrix = pd.read_csv('../output/repMatrixforpredict_violations.csv').drop(columns=['Unnamed: 0'])

# for investigations 
#preMatrix = pd.read_csv('../output/repMatrixforpredict_investigations.csv').drop(columns=['Unnamed: 0'])

preMatrix.shape

preMatrix = preMatrix.reset_index().copy()
preMatrix = preMatrix.rename(columns={"index": 'unique_id'})
preMatrix.is_violator.value_counts()

## convert the dates to datetime objects
for col in ['CASE_RECEIVED_DATE', 'DECISION_DATE', 
            'REQUESTED_START_DATE_OF_NEED', 'REQUESTED_END_DATE_OF_NEED',
            'JOB_START_DATE', 'JOB_END_DATE']: 
    preMatrix[col] = pd.to_datetime(preMatrix[col])
    
preMatrix.columns
preMatrix.info()

(7643, 67)

0.0    6907
1.0     736
Name: is_violator, dtype: int64

Index(['unique_id', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_NAME',
       'AGENT_ATTORNEY_STATE', 'AGENT_POC_EMPLOYER_REP_BY_AGENT',
       'BASIC_NUMBER_OF_HOURS', 'BASIC_RATE_OF_PAY', 'BASIC_UNIT_OF_PAY',
       'CASE_NO', 'CASE_RECEIVED_DATE', 'CASE_STATUS', 'DECISION_DATE',
       'EDUCATION_LEVEL', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2',
       'EMPLOYER_CITY', 'EMPLOYER_COUNTRY', 'EMPLOYER_NAME', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'EMPLOYER_POSTAL_CODE', 'EMPLOYER_PROVINCE',
       'EMPLOYER_STATE', 'EMP_EXPERIENCE_REQD', 'EMP_EXP_NUM_MONTHS',
       'FULL_TIME', 'HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM',
       'JOB_END_DATE', 'JOB_IDNUMBER', 'JOB_START_DATE', 'JOB_TITLE',
       'LAWFIRM_NAME', 'MAJOR', 'NAICS_CODE', 'NAME_REQD_TRAINING',
       'NATURE_OF_TEMPORARY_NEED', 'NBR_WORKERS_CERTIFIED',
       'NBR_WORKERS_REQUESTED', 'NUM_MONTHS_TRAINING', 'ORGANIZATION_FLAG',
       'OTHER_EDU', 'OTHER_WORKSITE_LOCATION', 'OVERTIME_RATE_FROM',
       'OVERTI

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7643 entries, 0 to 7642
Data columns (total 68 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   unique_id                        7643 non-null   int64         
 1   AGENT_ATTORNEY_CITY              6037 non-null   object        
 2   AGENT_ATTORNEY_NAME              6037 non-null   object        
 3   AGENT_ATTORNEY_STATE             6036 non-null   object        
 4   AGENT_POC_EMPLOYER_REP_BY_AGENT  7637 non-null   object        
 5   BASIC_NUMBER_OF_HOURS            7557 non-null   float64       
 6   BASIC_RATE_OF_PAY                7643 non-null   float64       
 7   BASIC_UNIT_OF_PAY                7643 non-null   object        
 8   CASE_NO                          7643 non-null   object        
 9   CASE_RECEIVED_DATE               7643 non-null   datetime64[ns]
 10  CASE_STATUS                      7643 non-null   object     

In [3]:
# Second Diploma Major has no non null values so drop it 
preMatrix = preMatrix.drop(columns=['SECOND_DIPLOMA_MAJOR'])

In [4]:
# Assign the is_violator status to the y (value we are trying to predict)
y = list(preMatrix.is_violator)

# remove the is_violator status from the preMatrix ... because that would be too easy!
preMatrix = preMatrix.drop(columns=['is_violator'])

In [5]:
## dtypes auto-separate
## list of non-features

numeric_options = ["int64", "float64", "datetime64[ns]"]
num_cols = [one for one in preMatrix.columns if preMatrix.dtypes[one] in numeric_options]
cat_cols = [one for one in preMatrix.columns if preMatrix.dtypes[one] not in numeric_options]

print('Numeric Columns:')
print(num_cols)
print('\nCategorical Columns:')
print(cat_cols)


Numeric Columns:
['unique_id', 'BASIC_NUMBER_OF_HOURS', 'BASIC_RATE_OF_PAY', 'CASE_RECEIVED_DATE', 'DECISION_DATE', 'EMPLOYER_PHONE_EXT', 'EMP_EXP_NUM_MONTHS', 'JOB_END_DATE', 'JOB_START_DATE', 'NAICS_CODE', 'NBR_WORKERS_CERTIFIED', 'NBR_WORKERS_REQUESTED', 'NUM_MONTHS_TRAINING', 'OVERTIME_RATE_FROM', 'OVERTIME_RATE_TO', 'REQUESTED_END_DATE_OF_NEED', 'REQUESTED_START_DATE_OF_NEED', 'SUPERVISE_HOW_MANY', 'Unnamed: 0.1']

Categorical Columns:
['AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_STATE', 'AGENT_POC_EMPLOYER_REP_BY_AGENT', 'BASIC_UNIT_OF_PAY', 'CASE_NO', 'CASE_STATUS', 'EDUCATION_LEVEL', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2', 'EMPLOYER_CITY', 'EMPLOYER_COUNTRY', 'EMPLOYER_NAME', 'EMPLOYER_PHONE', 'EMPLOYER_POSTAL_CODE', 'EMPLOYER_PROVINCE', 'EMPLOYER_STATE', 'EMP_EXPERIENCE_REQD', 'FULL_TIME', 'HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM', 'JOB_IDNUMBER', 'JOB_TITLE', 'LAWFIRM_NAME', 'MAJOR', 'NAME_REQD_TRAINING', 'NATURE_OF_TEMPORARY_NEED', 'ORGANIZATIO

In [6]:
# OLD USELESS CODE SAVED FOR POSTERITY...JUST IN CASE

# encoded_text_feature_pre = text_feature_pre.copy()
# for one in encoded_text_feature_pre.columns:
#     enc = LabelEncoder()
#     enc.fit(encoded_text_feature_pre[one].astype(str))
#     encoded_text_feature_pre[one] = enc.transform(encoded_text_feature_pre[one].astype(str))


In [7]:
# get the categorical features in one dataframe
cat_feature_pre = preMatrix.loc[:, cat_cols].copy()
print("Shape of non-imputed: ")
print(cat_feature_pre.shape)
# and the numerical features into another dataframe
num_feature_pre = preMatrix.loc[:, num_cols].copy()
print(num_feature_pre.shape)

# SimpleImputer on the categorical features and apply a "missing_value" to NANs 
imputer_cat = SimpleImputer(strategy='constant', fill_value='missing_value')
imputed_cat_feature_pre = pd.DataFrame(imputer_cat.fit_transform(cat_feature_pre))
imputed_cat_feature_pre.columns = cat_feature_pre.columns

# SimpleImputer on the numerical features and apply mode to NANs 
imputer_num = SimpleImputer(strategy='most_frequent', verbose=5)
imputed_num_feature_pre = pd.DataFrame(imputer_num.fit_transform(num_feature_pre))
imputed_num_feature_pre.columns = num_feature_pre.columns

print("Shape of imputed: ")
print(imputed_cat_feature_pre.shape)
print(imputed_num_feature_pre.shape)

# recombine the imputed cat and imputed num 

# we need to drop some columns which are going to be unique identifiers and could 
# be an issue within our model
unique_cols_to_drop = ['unique_id', 'CASE_NO', 'EMPLOYER_NAME', 'TRADE_NAME_DBA']
for l in [cat_cols, num_cols]: 
    for col in l: 
        if col in unique_cols_to_drop: 
            l.remove(col)

Shape of non-imputed: 
(7643, 47)
(7643, 19)
Shape of imputed: 
(7643, 47)
(7643, 19)


In [8]:
# prepare input data with OneHotEncoder
def prepare_inputs(X_train, X_test):
    oe = OneHotEncoder(handle_unknown='ignore')
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc

imputed_combined = pd.merge(imputed_cat_feature_pre.reset_index(),
                            imputed_num_feature_pre.reset_index(), how='left', 
                            on='index')
print('%s rows lost in merge' %(imputed_num_feature_pre.shape[0]-imputed_combined.shape[0]))
print(imputed_combined.shape)
imputed_combined = imputed_combined.drop(columns = 'index')

# do a train test split 
# split into train and test sets (80/20)

# X_train, X_test, y_train, y_test = train_test_split(imputed_cat_feature_pre, y, test_size=0.20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(imputed_combined, y, test_size=0.20, random_state=3)

# apply the oneHotEcoder within prepare_inputs
X_train, X_test = prepare_inputs(X_train, X_test)

0 rows lost in merge
(7643, 67)


In [9]:
imputed_combined.head()

Unnamed: 0,AGENT_ATTORNEY_CITY,AGENT_ATTORNEY_NAME,AGENT_ATTORNEY_STATE,AGENT_POC_EMPLOYER_REP_BY_AGENT,BASIC_UNIT_OF_PAY,CASE_NO,CASE_STATUS,EDUCATION_LEVEL,EMPLOYER_ADDRESS1,EMPLOYER_ADDRESS2,...,NAICS_CODE,NBR_WORKERS_CERTIFIED,NBR_WORKERS_REQUESTED,NUM_MONTHS_TRAINING,OVERTIME_RATE_FROM,OVERTIME_RATE_TO,REQUESTED_END_DATE_OF_NEED,REQUESTED_START_DATE_OF_NEED,SUPERVISE_HOW_MANY,Unnamed: 0.1
0,Casper,Lisa Meyer,WY,Y,Month,H-300-17258-492669,DETERMINATION ISSUED - CERTIFICATION,,"5818 Sievers Road, Dixon CA 95620 (Physical Ad...","PO Box 807, Dixon CA 95620 (Mailing Address)",...,112410.0,5.0,5.0,0.0,10.38,0.0,2018-11-29,2017-12-01,25.0,1.0
1,missing_value,missing_value,missing_value,N,Month,H-300-17257-446860,DETERMINATION ISSUED - CERTIFICATION,,161 FIFTH AVENUE SOUTH,SUITE 100,...,112410.0,1.797688,1.797688,0.0,10.38,0.0,2019-01-06,2018-01-08,9.0,6514.859345
2,missing_value,missing_value,missing_value,N,Month,H-300-17257-446860,DETERMINATION ISSUED - CERTIFICATION,,35244 OIL CITY ROAD,missing_value,...,1119.0,1.666667,1.666667,0.0,10.38,0.0,2018-12-06,2017-12-08,25.0,5294.666667
3,CASPER,Lisa Meyer,WY,Y,Month,H-300-17262-816860,DETERMINATION ISSUED - CERTIFICATION,,"1600 Chevallier Dr, Wolf Creek MT 59648 (Physi...","PO Box 1683, Helena MT 59624 (Mailing Address)",...,112410.0,4.0,4.0,0.0,10.38,0.0,2018-12-30,2018-01-01,25.0,13.0
4,missing_value,missing_value,missing_value,N,Month,H-300-17256-339375,DETERMINATION ISSUED - CERTIFICATION,,425 RIVERVIEW AVE,missing_value,...,1119.0,2.666667,2.666667,0.0,10.38,0.0,2018-12-06,2017-12-08,25.0,4908.666667


In [10]:
clf = RandomForestClassifier(max_depth = None, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Confusion matrix \n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted')))
def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='binary')
    recall = recall_score(y_test, y_predicted, average='binary')
    f1 = f1_score(y_test, y_predicted, average='binary')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Confusion matrix 

Predicted   0.0  1.0
Actual              
0.0        1389    4
1.0         131    5
accuracy = 0.912 
precision = 0.556 
recall = 0.037 
f1 = 0.069


In [11]:
'''
start_time = time.time()
importances = clf.feature_importances_
std = np.std([
    tree.feature_importances_ for tree in clf.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")
print(importances)
forest_importances = pd.Series(importances, index=cat_cols)

fig, ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(10)
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
'''

'\nstart_time = time.time()\nimportances = clf.feature_importances_\nstd = np.std([\n    tree.feature_importances_ for tree in clf.estimators_], axis=0)\nelapsed_time = time.time() - start_time\n\nprint(f"Elapsed time to compute the importances: "\n      f"{elapsed_time:.3f} seconds")\nprint(importances)\nforest_importances = pd.Series(importances, index=cat_cols)\n\nfig, ax = plt.subplots()\nfig.set_figheight(10)\nfig.set_figwidth(10)\nforest_importances.plot.bar(yerr=std, ax=ax)\nax.set_title("Feature importances using MDI")\nax.set_ylabel("Mean decrease in impurity")\nfig.tight_layout()\n'