In [159]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
from collections import Counter

### Data Preparation and Cleaning
- Admittedly, this is not the most efficient code. But it gets the job done.

In [2]:
#Read in dataframe
df = pd.read_csv("data_train.csv")

In [3]:
#DATA CLEANING STEPS

#Steps to clean data by dropping columns where the number of empty rows is >= 445,000
dfComplete = df.dropna(1, thresh= 445000)
# dfComplete.shape #Results in (494932,58)

#Steps to clean data by dropping row where the number of empty columns is > 0
dfCompleteAll = dfComplete.dropna(0, how="any")
# dfCompleteAll.shape #results in a dataframe of (476155,58)

# dfCompleteAll.isnull().sum() #no more nulls in the dataset

In [4]:
#Extract Dataframe of Variables for Classification
y = dfCompleteAll[["ASOURCE", "ATYPE", "RACE", "TOTCHG", "ZIPINC_QRTL"]]
y.head()

Unnamed: 0,ASOURCE,ATYPE,RACE,TOTCHG,ZIPINC_QRTL
0,2,2,1,272123,3
1,2,2,1,209246,1
2,2,2,1,305474,1
3,2,2,1,202973,1
4,2,2,1,416072,1


In [5]:
"""The next step is to prepare variables for exploratory data analysis and feature
selection using a random forest. 

Random forest does not require standardization of continuous variables or normalization
of discrete variable. For categorical features, we will need to use pd.get_dummies or 
one hot encoding to create binary dummy variables. 
"""

#First create two dataframes of int and float values to make things easier to work with 
columnNames = dfCompleteAll.columns
dfFloat = pd.DataFrame()
dfInt = pd.DataFrame()
for name in columnNames:
    if dfCompleteAll[name].dtype == float:
        dfFloat = dfFloat.join(dfCompleteAll[name], how = "right")
    else:
        dfInt = dfInt.join(dfCompleteAll[name], how = "right")
        
#Convert all columns in DfFloat, except DISCWT, to integer values. Afterwards nominal features
#will be one-hot encoded to create dummy variables, again we will not normalize or stadardize
# numeric values. 

float_toInt = ['AGE', 'AMONTH', 'AWEEKEND', 'DIED', 'DISPUNIFORM', 'DXCCS1',
       'DXCCS2', 'FEMALE', 'LOS', 'PAY1', 'HOSP_BEDSIZE', 'HOSP_CONTROL',
       'HOSP_LOCTEACH']
for digit in float_toInt:
    dfFloat[digit] = dfFloat[digit].astype(int)

In [6]:
# In dfFloat we have the following columns and feature groupings. 
# For reference use feature_desc or call .unique() method on one of the columns

# CONTINUOUS:
dfFloatContinuous = dfFloat[["AGE", "DISCWT"]]


# NOMINAL:
dfFloatNominal = dfFloat[['AMONTH', 'DISPUNIFORM', 'DXCCS1',
       'DXCCS2', 'PAY1', 'HOSP_CONTROL','HOSP_LOCTEACH']]

# BINARY & ORDINAL
dfBinaryOrdinal = dfFloat[["DIED", "AWEEKEND", "FEMALE", "HOSP_BEDSIZE"]]

# DISCRETE
dfDiscrete = dfFloat[["LOS"]]

In [7]:
#Use pd.get_dummies to turn nominal variables into dummy variables by first setting all 
#values as string, a requirement of pd.get_dummies. 

dfFloatNominal = dfFloatNominal.loc[:].astype(str)
    
dfFloatNominal = pd.get_dummies(dfFloatNominal)
dfFloatNominal.shape 

(475155, 556)

In [8]:
#Normally here, we might turn values in dfFloatContinuous to normalized, however,
#DTs and RFs do not require this. Later if we use a different classifier, we will need
# to standardize or normalize. 

#Normalization: rescaling features to a range of [0, 1], a special case of min-max scaling.

#Standardization: Often more practical, center the feature columns at mean 0 with 
# standard deviation 1 so that the feature columns take the form of a normal distribution, 
#This make it easier to learn the weights.

In [9]:
# We can now recreate the original dfFloat dataframe as dfFloatPreprocessed which 
# will have variables ready for feature selection with RF. Next the same thing must be down
#with dfInt
list_of_dataframes = [dfFloatContinuous, dfBinaryOrdinal, dfDiscrete, dfFloatNominal]

dfFloatPreprocessed = pd.DataFrame()
for frame in list_of_dataframes:
    dfFloatPreprocessed = dfFloatPreprocessed.join(frame, how = "right")
dfFloatPreprocessed.shape

(475155, 563)

In [10]:
#Prepare dfInt for preprocessing, starting with dropping respone variables
dfInt = dfInt.drop(["ASOURCE", "ATYPE", "RACE", "TOTCHG", "ZIPINC_QRTL"], axis= 1)
'''
Uncomment the code below to look at values in each column to see what needs to be dropped 
'''

# columnNamesInt = dfInt.columns

# for name in columnNamesInt:
#     print(name, dfInt[name].unique()) #Any values where 0,1 or ordinal e.g., DQTR we are good

'\nUncomment the code below to look at values in each column to see what needs to be dropped \n'

In [11]:
#CMs are all binary, therefore, create a separate dataframe for these columns: 
dfCm = dfInt[['CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF',
       'CM_ARTH', 'CM_BLDLOSS', 'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG',
       'CM_DEPRESS', 'CM_DM', 'CM_DMCX', 'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY',
       'CM_LIVER', 'CM_LYMPH', 'CM_LYTES', 'CM_METS', 'CM_NEURO', 'CM_OBESE',
       'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH', 'CM_PULMCIRC', 'CM_RENLFAIL',
       'CM_TUMOR', 'CM_ULCER', 'CM_VALVE', 'CM_WGHTLOSS']]

#Update the dfInt dataframe to a new dataframe:
dfIntShort = dfInt.drop(['CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF',
       'CM_ARTH', 'CM_BLDLOSS', 'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG',
       'CM_DEPRESS', 'CM_DM', 'CM_DMCX', 'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY',
       'CM_LIVER', 'CM_LYMPH', 'CM_LYTES', 'CM_METS', 'CM_NEURO', 'CM_OBESE',
       'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH', 'CM_PULMCIRC', 'CM_RENLFAIL',
       'CM_TUMOR', 'CM_ULCER', 'CM_VALVE', 'CM_WGHTLOSS'],axis = 1)

# columnNamesInt = dfIntShort.columns
# for name in columnNamesInt:
#     print(name, dfIntShort[name].unique()) #Any values where 0,1 or ordinal e.g., DQTR we are good

In [12]:
#Continue evaluating values in each column

#Since NDX, NPR, ORPROC, TOTAL_DISC are all either Binary or Discerete variables, create a separate dataframe
dfIntBinaryDiscrete = dfIntShort[["NDX", "NPR", "ORPROC", "TOTAL_DISC"]]

# Since DQTR, HOSPID, MDC, NIS_STRATUM, HOSP_REGION are all nominal variables, create a separate dataframe to 
#turn these into dummy variables. ALSO Drop "KEY" as this is the record id field: 
dfIntToDummies = dfIntShort.drop(["KEY", "NDX", "NPR", "ORPROC", "TOTAL_DISC"], axis= 1)


In [13]:
#Turn values in DQTR, HOSPID, MDC, NIS_STRATUM, HOSP_REGION to string
dfIntToDummies = dfIntToDummies.loc[:].astype(str)

#Use pd.get_dummies to turn nominal string values to binary dummy variables
dfIntToDummies = pd.get_dummies(dfIntToDummies)
# dfIntToDummies.head()

In [14]:
#Recombinet the three dataframes into a new preprocessed dataframe called dfIntPreprocessed
intRecombine = [dfIntToDummies, dfCm, dfIntBinaryDiscrete]

dfIntPreprocessed = pd.DataFrame()
for df in intRecombine:
    dfIntPreprocessed = dfIntPreprocessed.join(df, how = "right")
    
# dfIntPreprocessed.shape

In [15]:
#Combine the now preprocessed dfInt and dfFloat dataframes back in a single dataframe of either discrete, continuous,
#or binary variables. 
dfPreprocessed = dfFloatPreprocessed.join(dfIntPreprocessed, how = "right")
arrayPreprocessed = np.array(dfPreprocessed)
dfPreprocessed.head() #these are our predictors

Unnamed: 0,AGE,DISCWT,DIED,AWEEKEND,FEMALE,HOSP_BEDSIZE,LOS,AMONTH_1,AMONTH_10,AMONTH_11,...,CM_PULMCIRC,CM_RENLFAIL,CM_TUMOR,CM_ULCER,CM_VALVE,CM_WGHTLOSS,NDX,NPR,ORPROC,TOTAL_DISC
0,48,4.671227,0,0,0,1,83,0,0,1,...,0,0,0,0,0,1,18,7,1,409
1,66,4.671227,0,0,0,1,50,0,0,1,...,0,0,0,0,0,1,18,8,0,409
2,53,4.671227,0,0,0,1,65,0,0,1,...,0,0,0,0,0,1,18,6,0,409
3,27,4.671227,0,0,1,1,59,0,0,1,...,0,0,0,0,0,1,18,4,0,409
4,48,4.671227,1,0,0,1,77,0,0,1,...,1,1,0,0,0,1,18,8,0,409


## Impute ZIPINC_QRTL

In [None]:
#before we do anything, we need to plug our DFs back together so other features can help predict race.
dfFull = pd.concat([dfPreprocessed, y], axis=1)

In [131]:
#get our X and y the same way we did with RACE
X_zip = dfFull.drop(['ZIPINC_QRTL'], axis=1) 
y_zip = dfFull.ZIPINC_QRTL

#train test split before we fudge (er, undersample) our data

X_train_zip, X_test_zip, y_train_zip, y_test_zip = train_test_split(X_zip, y_zip, test_size = 0.25, random_state = 12)

**Again, we fit a "base" model before playing around with any of our data.**
- This time, *use optimized settings for imputing race* (seems as good a starting point as any).

In [132]:
zq_clf = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=5,
                             min_samples_leaf=1,oob_score = True, random_state=12)

zq_clf.fit(X_train_zip, y_train_zip)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False)

In [133]:
zq_clf_predictions = zq_clf.predict(X_test_zip)
zq_clf_accuracy = accuracy_score(y_test_zip, zq_clf_predictions)
zq_clf_accuracy

0.62227142243810452

In [134]:
zq_clf_report = classification_report(y_test_zip, zq_clf_predictions)
print(zq_clf_report)

             precision    recall  f1-score   support

          1       0.67      0.71      0.69     30534
          2       0.51      0.42      0.46     20892
          3       0.50      0.41      0.45     27473
          4       0.69      0.80      0.74     39890

avg / total       0.61      0.62      0.61    118789



In [135]:
pd.crosstab(y_test_zip, zq_clf_predictions, rownames=['Actual Zip Inc Qrtl'], colnames=['Predicted Zip Inc Qrtl'])

Predicted Race,1,2,3,4
Actual Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,21723,3511,2996,2304
2,5082,8845,3928,3037
3,3700,3409,11373,8991
4,2047,1611,4254,31978


Honestly, not a bad model to start. Based on previous trial and error with this variable (in separate notebooks), this model is benefitting alot from optizimized settings.

Still, let's see if we can improve it.

In [136]:
#observed variable counts
y_train_zip.value_counts()

4    118800
1     92350
3     82260
2     62956
Name: ZIPINC_QRTL, dtype: int64

In [137]:
#as expected from our F1 scores, 2 and 3 are under-represented.
#let's just make all our classes equal in a dataset of 24,000 (we are going to cross val, so can't go larger)
zip_rs = RandomUnderSampler(ratio = {1:6000, 2:6000, 3:6000, 4:6000}, random_state=12)

#this time, ratio = auto means resample all except minority class
uX_train_zip, uy_train_zip = zip_rs.fit_sample(X_train_zip, y_train_zip)

In [139]:
#we'll try a wider param grid
zip_param_grid = {
    'min_samples_split': [2, 5, 10, 25],
    'max_features': ['log2', 'auto', 100]
}

#zq_clf = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=5,
                             #min_samples_leaf=1,oob_score = True, random_state=12)
    
CV_zip = GridSearchCV(estimator=zq_clf, param_grid=zip_param_grid, scoring='f1_micro', n_jobs=-1,
                        return_train_score=False)

CV_zip.fit(uX_train_zip, uy_train_zip)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [2, 5, 10, 25], 'max_features': ['log2', 'auto', 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1_micro', verbose=0)

Let's make predictions with this GridSearchCV object (which was trained on the resampled data). Then we'll compare this model to one with the same ("ideal") parameters on the raw training data:

In [140]:
CV_zip_predictions = CV_zip.predict(X_test_zip)
CV_zip_accuracy = accuracy_score(y_test_zip, CV_zip_predictions)
CV_zip_accuracy

0.59380919108671681

In [141]:
CV_zip_report = classification_report(y_test_zip, CV_zip_predictions)
print(CV_zip_report)

             precision    recall  f1-score   support

          1       0.67      0.66      0.67     30534
          2       0.43      0.52      0.47     20892
          3       0.46      0.42      0.44     27473
          4       0.74      0.70      0.72     39890

avg / total       0.60      0.59      0.60    118789



In [157]:
pd.crosstab(y_test_zip, CV_zip_predictions, rownames=['Actual Zip Inc Qrtl'], colnames=['Predicted Zip Inc Qrtl'])

Predicted Zip Inc Qrtl,1,2,3,4
Actual Zip Inc Qrtl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,20218,5602,3320,1394
2,4377,10930,3861,1724
3,3362,5628,11660,6823
4,2194,3441,6525,27730


We don't like these results that much; pretty big hit on accuracy (3%) and f-1 scores are no better in minority classes and worse in majority classes. Here, data resampling hasn't helped.

### ZIPINC_QRTL Cross Validation on Normal Data
We didn't pick up much from resampling the data -- probably because these classes have a decent distribution already. So let's cross validate on a 25,000 sample of the normal data.

In [146]:
zip_slice = dfFull.sample(n=25000)
X_zip_red = zip_slice.drop(['ZIPINC_QRTL'], axis=1)
y_zip_red = zip_slice.ZIPINC_QRTL

X_zip_train_red, X_zip_test_red, y_zip_train_red, y_zip_test_red = train_test_split(X_zip_red, y_zip_red,
                                                                                   test_size = 0.25, random_state=12)

In [147]:
#we'll try a wider param grid
zip_param_grid = {
    'min_samples_split': [2, 5, 10, 25],
    'max_features': ['log2', 'auto', 100]
}

#zq_clf = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,min_samples_split=5,
                             #min_samples_leaf=1,oob_score = True, random_state=12)
    
CV_zip_samp = GridSearchCV(estimator=zq_clf, param_grid=zip_param_grid, scoring='f1_micro', n_jobs=-1,
                        return_train_score=False)

CV_zip_samp.fit(X_zip_train_red, y_zip_train_red)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [2, 5, 10, 25], 'max_features': ['log2', 'auto', 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1_micro', verbose=0)

In [148]:
#check best params
CV_zip_samp.best_params_

{'max_features': 100, 'min_samples_split': 25}

I am wary about the best min samples being 25. We may be ramping up F1 scores for common classes. Let's find out:

In [149]:
CV_zip_samp_predictions = CV_zip_samp.predict(X_test_zip)
CV_zip_samp_accuracy = accuracy_score(y_test_zip, CV_zip_samp_predictions)
CV_zip_samp_accuracy

0.61141183106179864

In [151]:
CV_zip_samp_report = classification_report(y_test_zip, CV_zip_samp_predictions)
print(CV_zip_samp_report)

             precision    recall  f1-score   support

          1       0.64      0.72      0.67     30534
          2       0.50      0.39      0.44     20892
          3       0.50      0.40      0.44     27473
          4       0.68      0.79      0.73     39890

avg / total       0.60      0.61      0.60    118789



**This model is WORSE than our "base" model - but it also saw much less of our data** (since we had to reduce observations AND it had to be cross validated)

### Use "Ideal" Params to Fit Model with Full Train Data:
- This should be at least as good as our base model. If not, then using only 25k rows to cross validate is just giving us ideal params for 25k rows of data, not the full dataset.

In [152]:
zip_model_tuned = RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,
                                         min_samples_split=25,min_samples_leaf=1,oob_score = True, random_state=12)

zip_model_tuned.fit(X_train_zip, y_train_zip)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=250, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=12, verbose=0, warm_start=False)

In [153]:
zip_model_tuned_predictions = zip_model_tuned.predict(X_test_zip)
zip_model_tuned_accuracy = accuracy_score(y_test_zip, zip_model_tuned_predictions)
zip_model_tuned_accuracy

0.62305432321174514

In [155]:
zip_model_report = classification_report(y_test_zip, zip_model_tuned_predictions)
print("'Tuned' Model Results")
print(zip_model_report)

'Tuned' Model Results
             precision    recall  f1-score   support

          1       0.67      0.71      0.69     30534
          2       0.51      0.42      0.46     20892
          3       0.51      0.41      0.45     27473
          4       0.69      0.81      0.74     39890

avg / total       0.61      0.62      0.61    118789



**"base" model results**

             precision    recall  f1-score   support

          1       0.67      0.71      0.69     30534
          2       0.51      0.42      0.46     20892
          3       0.50      0.41      0.45     27473
          4       0.69      0.80      0.74     39890

    avg / total       0.61      0.62      0.61    118789


In [156]:
pd.crosstab(y_test_zip, zip_model_tuned_predictions, 
            rownames=['Actual Zip Inc Qrtl'], colnames=['Predicted Zip Inc Qrtl'])

Predicted Zip Inc Qrtl,1,2,3,4
Actual Zip Inc Qrtl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,21759,3493,2989,2293
2,5111,8829,3904,3048
3,3733,3359,11232,9149
4,2040,1568,4090,32192


We see the narrowest of improvements with the tuned model - .01 pickup on precision in class 3 and .01 pickup on recall for class 4. This may just be random chance. Nonetheless, **it uses the full, unaltered training data, and has slighly lower-variance settings. We consider it our optimized model.**

## Optimal Settings - Classification of Variable ZIPINC_QRTL:

RandomForestClassifier(n_jobs=-1,max_features=100,n_estimators=300,max_depth=250,
                                         min_samples_split=25,min_samples_leaf=1,oob_score = True, random_state=12)
                             
**Accuracy: ~62.30%**

**'Tuned' Model Results**

            precision    recall  f1-score   support

          1       0.67      0.71      0.69     30534
          2       0.51      0.42      0.46     20892
          3       0.51      0.41      0.45     27473
          4       0.69      0.81      0.74     39890

    avg / total       0.61      0.62      0.61    118789