In [1]:
import pyreadr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
import imblearn
from sklearn.preprocessing import LabelEncoder
import fastai
from fastai import *
import fastai.tabular.all

In [None]:
# Path to the R data file, relative to the notebook's working directory.
data_file = '../5v_cleandf.rdata'
# Read the file; the result is indexed by object name on the next line.
r_format = pyreadr.read_r(data_file)
df = r_format['df']  # this is the dataset
# Preview the first rows, transposed so that the columns are listed vertically.
df.head().T

The dataset is too large to work with quickly, so we keep only the columns that we need.

In [None]:
# Columns kept for the analysis; everything else in `df` is dropped.
columns = ['esi',
           'age',
           'gender',
           'ethnicity',
           'race',
           'lang',
           'religion',
           'maritalstatus',
           'employstatus',
           'insurance_status',
           'disposition',
           'arrivalmode',
           'previousdispo']

# Select the columns first and copy afterwards: only the subset is duplicated,
# rather than copying the whole frame and then discarding most of it.
data = df[columns].copy()

The dataset contains many `null` values, so we simply remove every row that contains a `null` value in any column.

In [None]:
# Remove every row that has a null in any column, one column at a time so the
# progress message per column is preserved.
cols_in_dataframe = list(data.columns)

for col in cols_in_dataframe:
    # A boolean mask filters the rows in one vectorised step and is correct
    # for any index, unlike collecting row positions in a Python loop and
    # dropping them by label.
    data = data[data[col].notna()].reset_index(drop=True)

    print('Done condensing:', col)

We can label-encode the categorical columns to make them easier for the models we create to work with.

In [None]:
%%capture
# Column groups: `cont` is listed for reference, `cat` is what gets encoded.
cont = ['esi', 'age']
cat = [
    'gender', 'ethnicity', 'race', 'lang', 'religion', 'maritalstatus',
    'employstatus', 'insurance_status', 'disposition', 'arrivalmode',
    'previousdispo',
]

# One independent LabelEncoder per categorical column, keyed by column name
# so the integer codes can be mapped back to the original labels.
transformers = {name: LabelEncoder() for name in cat}

# Fit each encoder on its own column and replace the column with the codes.
for col in cat:
    encoder = transformers[col]
    data[col] = encoder.fit_transform(data[col])

The `esi` column is stored as strings, so we can convert it to numbers.

In [None]:
# Replace the `esi` column with its integer-typed version.
data['esi'] = data['esi'].astype(int)

Let's split the training and the testing data with the `sklearn` module.

In [None]:
# Display the current column names of `data`.
data.columns

In [None]:
# Feature columns used for the initial train/test split.
cols_to_train_on = [
    'esi',
    'age',
    'gender',
    'ethnicity',
    'race',
    'lang',
    'religion',
    'maritalstatus',
    'employstatus',
    'insurance_status',
    'disposition',
    'arrivalmode',
]

# Dependent variable, held as a one-element list.
col_dep_var = ['previousdispo']

In [None]:
from sklearn.model_selection import train_test_split

`X` is going to be the dataframe with the columns that we want to train on.  
`y` is going to be the dataframe with the column of the dependent variable.

In [None]:
# Features and dependent-variable column, both selected from `data`.
X, y = data[cols_to_train_on], data[col_dep_var]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

We don't need the individual training and testing variables for now; thus, we can consolidate them into training and testing datasets.

In [None]:
# Build the training frame from an explicit copy of the features. The
# DataFrame constructor does not copy a DataFrame input by default, so without
# copy=True the new frame can share data with X_train and the column added
# below could leak into it.
train = pd.DataFrame(X_train, copy=True)
train['previousdispo'] = y_train
train.head()

In [None]:
# Build the testing frame from an explicit copy of the features. The
# DataFrame constructor does not copy a DataFrame input by default, so without
# copy=True the new frame can share data with X_test and the column added
# below could leak into it.
test = pd.DataFrame(X_test, copy=True)
test['previousdispo'] = y_test
test.head()

Now that the testing dataset is safe from further mutations, we can oversample the minority classes in the training dataset.

In [None]:
# Column whose classes are balanced by oversampling.
balancing_col = 'race'

# Every other column of the training frame; passed to the sampler as features.
cols_without_balancing_col = [
    'esi',
    'age',
    'gender',
    'ethnicity',
    'lang',
    'religion',
    'maritalstatus',
    'employstatus',
    'insurance_status',
    'disposition',
    'arrivalmode',
    'previousdispo',
]

In [None]:
# Sampler inputs: `a` holds every column except the balancing column ...
a = train[cols_without_balancing_col]
# ... and `b` is the balancing column itself.
b = train[balancing_col]

In [None]:
from collections import Counter
# Count how many training rows fall into each value of the balancing column.
Counter(b)

In [None]:
# Map encoded `race` ids back to their original labels.
# NOTE(review): the id list is hard-coded, presumably taken from the Counter
# output of the previous cell; verify it still matches the data.
transformers['race'].inverse_transform([7, 6, 4, 2, 5, 1, 0, 3])

# RandomOverSampler

In [None]:
# from imblearn.over_sampling import RandomOverSampler
# sampler = RandomOverSampler(sampling_strategy='minority')

In [None]:
# a_resampled, b_resampled = sampler.fit_resample(a, b)
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)

In [None]:
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)

In [None]:
# a_resampled, b_resampled = sampler.fit_resample(a_resampled, b_resampled)

In [None]:
# Counter(b_resampled)

In [None]:
# a_resampled.shape, b_resampled.shape

In [None]:
# train = pd.DataFrame(a_resampled)
# train['race'] = b_resampled
# train.head()

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic rows are the same on every run; 42 matches
# the seed already used for the train/test split.
# NOTE(review): the features handed to this sampler are label-encoded
# categories, and SMOTE interpolates between rows numerically; confirm that is
# intended (imblearn also offers SMOTENC for categorical features).
sampler = SMOTE(random_state=42)

In [None]:
# Run the sampler on the features `a` and the balancing target `b`, keeping
# both resampled outputs.
a_resampled, b_resampled = sampler.fit_resample(a, b)

In [None]:
# Per-value counts of the balancing column after resampling.
Counter(b_resampled)

# Training

In [None]:
# Rebuild the training frame from the resampled features. copy=True keeps the
# new frame independent of a_resampled, because the DataFrame constructor does
# not copy a DataFrame input by default.
train = pd.DataFrame(a_resampled, copy=True)
# Re-attach the resampled balancing column.
train['race'] = b_resampled

In [None]:
# Display the rebuilt training frame.
train

In [None]:
# Reduced feature set used for the models below.
refined_cols_to_train_on = [
    'esi',
    'gender',
    'ethnicity',
    'race',
    'religion',
    'employstatus',
    'insurance_status',
]

# Dependent variable as a plain column name (a string here, not a list).
col_dep_var = 'previousdispo'

In [None]:
# Training features and target, taken from the training frame.
X_train, y_train = train[refined_cols_to_train_on], train[col_dep_var]

In [None]:
# Testing features and target, taken from the testing frame.
X_test, y_test = test[refined_cols_to_train_on], test[col_dep_var]

In [None]:
# Shapes of the four splits, shown as a quick check that features and targets
# have matching row counts.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

We can now train models.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 300 trees, using all CPU cores. The fixed random_state
# makes the fitted forest reproducible, in line with the seeds used for the
# other models in this notebook.
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features; the bare name on the last line
# displays the predictions.
preds = model.predict(X_test)
preds

In [None]:
from sklearn.metrics import f1_score

In [None]:
# average = None yields one F1 score per class rather than a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
from sklearn.metrics import confusion_matrix

# Encoded ids of every `previousdispo` class in the data, in ascending order,
# together with their original labels for the axis ticks.
plot_labels = sorted(data['previousdispo'].unique())
plot_labels_transformed = transformers['previousdispo'].inverse_transform(plot_labels)
class_names = list(plot_labels_transformed)

# Pin the matrix to the full class list so that row/column i always refers to
# class_names[i], even when a class is missing from both y_test and preds.
matrix = confusion_matrix(y_test, preds, labels=plot_labels)

# Normalise each row to fractions of its true class. Rows with no true samples
# are left at zero instead of turning into NaN through a 0/0 division.
row_totals = matrix.sum(axis=1, keepdims=True)
matrix = np.divide(
    matrix.astype('float'),
    row_totals,
    out=np.zeros(matrix.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Heatmap cell i spans [i, i + 1] on both axes, so its centre is at i + 0.5;
# the same positions are used for x and y so labels sit on their cells.
tick_marks = np.arange(len(class_names)) + 0.5
plt.xticks(tick_marks, class_names, rotation=90)
plt.yticks(tick_marks, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

Random Forest seems to give really terrible results.

In [None]:
import xgboost as xgb

In [None]:
# XGBoost classifier on all CPU cores with a fixed seed. `n_jobs` and
# `random_state` are the documented scikit-learn-API names for the legacy
# `nthread` and `seed` arguments.
classifier = xgb.XGBClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the XGBoost classifier on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Report accuracy on the test split as a percentage (message typo fixed:
# "Accuray" -> "Accuracy").
print("Model Accuracy: {:.2f}%".format(100*classifier.score(X_test, y_test)))

# LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

In [None]:
# LinearSVC wrapped in a pipeline; it is currently the pipeline's only step.
clf = make_pipeline(LinearSVC(random_state=42, tol=1e-5, verbose=True))

In [None]:
# Fit the LinearSVC pipeline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Predict with the LinearSVC pipeline fitted in this section. Using `model`
# here would re-score the random forest instead of the LinearSVC.
preds = clf.predict(X_test)
preds

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Per-class F1 scores for the predictions currently held in `preds`.
f1_score(y_test, preds, average = None)