In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
from collections import Counter

## Data Cleaning:
Data cleaning and preparation; admittedly, this is not the most efficient code -- but it will produce what we want.

In [2]:
#Read in dataframe
df = pd.read_csv("data_train.csv")

#Steps to clean data by dropping columns where the number of empty rows is >= 445,000
dfComplete = df.dropna(1, thresh= 445000)
# dfComplete.shape #Results in (494932,58)

#Steps to clean data by dropping row where the number of empty columns is > 0
dfCompleteAll = dfComplete.dropna(0, how="any")
# dfCompleteAll.shape #results in a dataframe of (476155,58)

# dfCompleteAll.isnull().sum() #no more nulls in the dataset

y = dfCompleteAll[["ASOURCE", "ATYPE", "RACE", "TOTCHG", "ZIPINC_QRTL"]]

In [3]:
"""The next step is to prepare variables for exploratory data analysis and feature
selection using a random forest. 

Random forest does not require standardization of continuous variables or normalization
of discrete variable. For categorical features, we will need to use pd.get_dummies or 
one hot encoding to create binary dummy variables. 
"""

#First create two dataframes of int and float values to make things easier to work with 
columnNames = dfCompleteAll.columns
dfFloat = pd.DataFrame()
dfInt = pd.DataFrame()
for name in columnNames:
    if dfCompleteAll[name].dtype == float:
        dfFloat = dfFloat.join(dfCompleteAll[name], how = "right")
    else:
        dfInt = dfInt.join(dfCompleteAll[name], how = "right")
        
#Convert all columns in DfFloat, except DISCWT, to integer values. Afterwards nominal features
#will be one-hot encoded to create dummy variables, again we will not normalize or stadardize
# numeric values. 

float_toInt = ['AGE', 'AMONTH', 'AWEEKEND', 'DIED', 'DISPUNIFORM', 'DXCCS1',
       'DXCCS2', 'FEMALE', 'LOS', 'PAY1', 'HOSP_BEDSIZE', 'HOSP_CONTROL',
       'HOSP_LOCTEACH']

for digit in float_toInt:
    dfFloat[digit] = dfFloat[digit].astype(int)
    
# In dfFloat we have the following columns and feature groupings. 
# For reference use feature_desc or call .unique() method on one of the columns

# CONTINUOUS:
dfFloatContinuous = dfFloat[["AGE", "DISCWT"]]


# NOMINAL:
dfFloatNominal = dfFloat[['AMONTH', 'DISPUNIFORM', 'DXCCS1',
       'DXCCS2', 'PAY1', 'HOSP_CONTROL','HOSP_LOCTEACH']]

# BINARY & ORDINAL
dfBinaryOrdinal = dfFloat[["DIED", "AWEEKEND", "FEMALE", "HOSP_BEDSIZE"]]

# DISCRETE
dfDiscrete = dfFloat[["LOS"]]

In [4]:
#Use pd.get_dummies to turn nominal variables into dummy variables by first setting all 
#values as string, a requirement of pd.get_dummies. 

dfFloatNominal = dfFloatNominal.loc[:].astype(str)
    
dfFloatNominal = pd.get_dummies(dfFloatNominal)
dfFloatNominal.shape 

(475155, 556)

In [5]:
# We can now recreate the original dfFloat dataframe as dfFloatPreprocessed which 
# will have variables ready for feature selection with RF. Next the same thing must be down
#with dfInt
list_of_dataframes = [dfFloatContinuous, dfBinaryOrdinal, dfDiscrete, dfFloatNominal]

dfFloatPreprocessed = pd.DataFrame()
for frame in list_of_dataframes:
    dfFloatPreprocessed = dfFloatPreprocessed.join(frame, how = "right")
dfFloatPreprocessed.shape

(475155, 563)

In [6]:
#Prepare dfInt for preprocessing, starting with dropping respone variables
dfInt = dfInt.drop(["ASOURCE", "ATYPE", "RACE", "TOTCHG", "ZIPINC_QRTL"], axis= 1)

#CMs are all binary, therefore, create a separate dataframe for these columns: 
dfCm = dfInt[['CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF',
       'CM_ARTH', 'CM_BLDLOSS', 'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG',
       'CM_DEPRESS', 'CM_DM', 'CM_DMCX', 'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY',
       'CM_LIVER', 'CM_LYMPH', 'CM_LYTES', 'CM_METS', 'CM_NEURO', 'CM_OBESE',
       'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH', 'CM_PULMCIRC', 'CM_RENLFAIL',
       'CM_TUMOR', 'CM_ULCER', 'CM_VALVE', 'CM_WGHTLOSS']]

#Update the dfInt dataframe to a new dataframe:
dfIntShort = dfInt.drop(['CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF',
       'CM_ARTH', 'CM_BLDLOSS', 'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG',
       'CM_DEPRESS', 'CM_DM', 'CM_DMCX', 'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY',
       'CM_LIVER', 'CM_LYMPH', 'CM_LYTES', 'CM_METS', 'CM_NEURO', 'CM_OBESE',
       'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH', 'CM_PULMCIRC', 'CM_RENLFAIL',
       'CM_TUMOR', 'CM_ULCER', 'CM_VALVE', 'CM_WGHTLOSS'],axis = 1)

#Update the dfInt dataframe to a new dataframe:
dfIntShort = dfInt.drop(['CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF',
       'CM_ARTH', 'CM_BLDLOSS', 'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG',
       'CM_DEPRESS', 'CM_DM', 'CM_DMCX', 'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY',
       'CM_LIVER', 'CM_LYMPH', 'CM_LYTES', 'CM_METS', 'CM_NEURO', 'CM_OBESE',
       'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH', 'CM_PULMCIRC', 'CM_RENLFAIL',
       'CM_TUMOR', 'CM_ULCER', 'CM_VALVE', 'CM_WGHTLOSS'],axis = 1)

#Since NDX, NPR, ORPROC, TOTAL_DISC are all either Binary or Discerete variables, create a separate dataframe
dfIntBinaryDiscrete = dfIntShort[["NDX", "NPR", "ORPROC", "TOTAL_DISC"]]

# Since DQTR, HOSPID, MDC, NIS_STRATUM, HOSP_REGION are all nominal variables, create a separate dataframe to 
#turn these into dummy variables. ALSO Drop "KEY" as this is the record id field: 
dfIntToDummies = dfIntShort.drop(["KEY", "NDX", "NPR", "ORPROC", "TOTAL_DISC"], axis= 1)

#Turn values in DQTR, HOSPID, MDC, NIS_STRATUM, HOSP_REGION to string
dfIntToDummies = dfIntToDummies.loc[:].astype(str)

#Use pd.get_dummies to turn nominal string values to binary dummy variables
dfIntToDummies = pd.get_dummies(dfIntToDummies)
# dfIntToDummies.head()

intRecombine = [dfIntToDummies, dfCm, dfIntBinaryDiscrete]

dfIntPreprocessed = pd.DataFrame()
for df in intRecombine:
    dfIntPreprocessed = dfIntPreprocessed.join(df, how = "right")
    
dfPreprocessed = dfFloatPreprocessed.join(dfIntPreprocessed, how = "right")

In [7]:
#plug these dfs back together
dfFull = pd.concat([dfPreprocessed, y], axis=1)

## Check ASOURCE Variable Distribution

In [8]:
#first, separate our variables
X = dfFull.drop(['ASOURCE'], axis=1)
y = dfFull.ASOURCE

In [9]:
#take a look at distribution in this outcome variable
y.value_counts(normalize=True)

5    0.678164
1    0.255487
2    0.035982
3    0.030121
4    0.000246
Name: ASOURCE, dtype: float64

In [10]:
#yikes; get raw counts
y.value_counts()

5    322233
1    121396
2     17097
3     14312
4       117
Name: ASOURCE, dtype: int64

We need to change these sampling distributions. Ideally, we'd take a random undersample of our majority classes. But we don't even have enough class = 4 for that; we're going to have to make up some data.

## Resample Data

### Train/Test Split
- Must do this **before** oversampling, or else we'll experience information bleed.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 12)

Great. Now X_test and y_test will be completely held out. They are our real, observed data that the model hasn't seen. Time to mess with our sampling.

In [12]:
y_train.value_counts()

5    257836
1     97200
2     13641
3     11353
4        94
Name: ASOURCE, dtype: int64

We'll just oversample 4 by creating synthetic samples with the ADASYN method.

In [13]:
ada = ADASYN(n_jobs = -1,  ratio = {1:97200, 2:13641, 3:11353, 4:10000, 5:257836})
#The values of the dict passed to Ratio correspond to the desired number of samples.

X_train_rs, y_train_rs = ada.fit_sample(X_train, y_train)

In [19]:
new_counts = pd.Series(y_train_rs) #ADASYN package produces an ndarray
print(new_counts.value_counts(), "\n")
print(new_counts.value_counts(normalize=True))

5    257836
1     97200
2     13641
3     11353
4     10016
dtype: int64 

5    0.661040
1    0.249201
2    0.034973
3    0.029107
4    0.025679
dtype: float64


This will have to be good enough.

## Fit a Random Forest "base model"
- Before cross validation, we want to establish a base metric for performance.
- To start, we'll use parameters based on a tuned RF we fit to predict the RACE variable.
    - But, we'll set min_samples_split to 2 to start so our trees in this forest can "find" these smaller classes. 

In [21]:
clf = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=2,
                             min_samples_leaf=1,oob_score = True,random_state=12)

clf.fit(X_train_rs, y_train_rs)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False)

In [22]:
clf_predictions = clf.predict(X_test)
clf_accuracy = accuracy_score(y_test, clf_predictions)
clf_accuracy

0.94017741579063674

In [23]:
clf_report = classification_report(y_test, clf_predictions)
print(clf_report)

             precision    recall  f1-score   support

          1       0.92      0.97      0.94     24196
          2       0.80      0.64      0.71      3456
          3       0.72      0.35      0.47      2959
          4       0.00      0.00      0.00        23
          5       0.96      0.97      0.97     64397

avg / total       0.94      0.94      0.94     95031



In [24]:
pd.crosstab(y_test, clf_predictions, rownames=['Actual ASOURCE'], colnames=['Predicted ASOURCE'])

Predicted ASOURCE,1,2,3,4,5
Actual ASOURCE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23474,56,93,0,573
2,315,2207,58,0,876
3,616,97,1044,0,1202
4,16,0,1,0,6
5,1119,400,256,1,62621


This is a pretty sharp model to start, *except* for class 4. There are 23 observed class 4s in our entire test set of 95,031 observations. So...not sure how realistic it is for our model to find them. In the future, we may want to use anomaly detection models for class = 4.

## Model Tuning
- We want to see if we can improve these F1 scores.
- We'll have to get a smaller set of data so our computer can handle it. This will be a little tricky.
    - First, we'll take a random sample of our real, observed data.
    - Then, we'll impute some data in this new, smaller slice for our smallest minority class.

In [31]:
dfSample = dfFull.sample(n=25000, random_state=12)
small_X = dfSample.drop(['ASOURCE'], axis=1)
small_y = dfSample.ASOURCE

In [32]:
small_y.value_counts()

5    16958
1     6355
2      904
3      775
4        8
Name: ASOURCE, dtype: int64

In [33]:
#well, we are just going to have to create synthetic data for classes 2,3 and 4 and hope it doesn't affect our 
#tuning's applicability to a model that uses synthetic data only for class 4

new_oversample = ADASYN(n_jobs = -1,  ratio = {1:6355, 2:3000, 3:3000, 4:2000, 5:16958})

X_train_red, y_train_red = new_oversample.fit_sample(small_X, small_y)

In [34]:
param_grid = {
    'min_samples_split': [2, 5, 10, 25],
    'max_features': ['log2', 'auto', 100]
}

    
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='f1_micro', n_jobs=-1, 
                      return_train_score=False)

CV_clf.fit(X_train_red, y_train_red)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [2, 5, 10, 25], 'max_features': ['log2', 'auto', 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1_micro', verbose=0)

In [35]:
CV_clf.best_params_

{'max_features': 100, 'min_samples_split': 2}

As suspected, our ideal parameters are the ones we started with.

## Ideal Params Model on Raw Data for Comparison
- Just to make sure we're getting value from our data resample, let's run an model with these ideal params on the (very) unevenly distributed actual training data.

In [36]:
clf_comp = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=2,
                             min_samples_leaf=1,oob_score = True,random_state=12)

clf_comp.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False)

In [37]:
clf_comp_predictions = clf_comp.predict(X_test)
clf_comp_accuracy = accuracy_score(y_test, clf_comp_predictions)
clf_comp_accuracy

0.94009323273458134

In [38]:
clf_comp_report = classification_report(y_test, clf_comp_predictions)
print(clf_comp_report)

             precision    recall  f1-score   support

          1       0.92      0.97      0.94     24196
          2       0.80      0.63      0.71      3456
          3       0.72      0.35      0.47      2959
          4       0.00      0.00      0.00        23
          5       0.96      0.97      0.97     64397

avg / total       0.94      0.94      0.94     95031



There is **very little difference** between a model with oversampled data and the one with true data. The model with undersampled data does .01 better on recall with class = 2. This is very possibly due to random variation. 

If two models have identical performance, choose the simpler one. We choose this tuned Random Forest that uses the observed training data with NO resampling.

## Optimal Settings - Classification of Variable ASOURCE:

RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=2,min_samples_leaf=1,oob_score = True,random_state=12)
                             
**Accuracy: ~94.00%**

             precision    recall  f1-score   support

          1       0.92      0.97      0.94     24196
          2       0.80      0.63      0.71      3456
          3       0.72      0.35      0.47      2959
          4       0.00      0.00      0.00        23
          5       0.96      0.97      0.97     64397

    avg / total       0.94      0.94      0.94     95031