### Trying Imputation with MICE on Random Forest Model

In [2]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## week 3 imports
import missingno as msno     # msno.bar(titanic);  or msno.matrix(titanic);
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Linear and general modeling imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Feature Engineering
from sklearn.impute import SimpleImputer   # Imputation 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures   # Scale/transform/feature engineering

import patsy
# y, X = patsy.dmatrices(formula, data=diamonds, return_type='dataframe')

# GridSearch and Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
########from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline, make_pipeline

# Logistic and Classification metrics
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score, classification_report

# K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# naive bayes imports
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# SVMs
from sklearn.svm import LinearSVC, SVC

# Decision Trees
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

# Import Bagging, Boosting, and Random Forests, and ExtraTrees (Extremely Randomized Trees)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor

# NLP imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.download()  --> Download all, and then restart jupyter lab
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, pos_tag
import re

import json


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [16]:
# !pip install impyute

Collecting impyute
  Downloading impyute-0.0.8-py2.py3-none-any.whl (31 kB)
Installing collected packages: impyute
Successfully installed impyute-0.0.8


In [171]:
# imports SMOTE (oversampling) and undersampleing packages 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# imports mice
from impyute.imputation.cs import mice

In [5]:
# Loads in the data
df = pd.read_feather('../../data/h201.feather')
df.head()

Unnamed: 0,DUID,PID,DUPERSID,PANEL,FAMID31,FAMID42,FAMID53,FAMID17,FAMIDYR,CPSFAMID,...,RXPTR17,RXOTH17,PERWT17F,FAMWT17F,FAMWT17C,SAQWT17F,DIABW17F,CSAQW17F,VARSTR,VARPSU
0,10001.0,101.0,10001101,21.0,A,A,A,A,A,A,...,2506.0,0.0,13494.959896,13651.501535,13651.501535,18363.716686,0.0,0.0,1021.0,1.0
1,10001.0,102.0,10001102,21.0,A,A,A,A,A,A,...,0.0,0.0,12031.802435,13651.501535,13651.501535,14279.941801,0.0,0.0,1021.0,1.0
2,10001.0,103.0,10001103,21.0,A,A,A,A,A,A,...,135.0,0.0,12308.91898,13651.501535,13651.501535,0.0,0.0,0.0,1021.0,1.0
3,10001.0,104.0,10001104,21.0,A,A,A,A,A,A,...,0.0,0.0,12280.755977,13651.501535,13651.501535,0.0,0.0,0.0,1021.0,1.0
4,10002.0,101.0,10002101,21.0,A,A,A,A,A,A,...,0.0,0.0,6596.64055,7113.635349,7113.635349,7427.265851,0.0,0.0,1077.0,1.0


In [6]:
# Loads in health status variables dataframe of variable descriptions
deps = pd.read_csv('../../data/Health Status Variables.csv')
deps.head()

Unnamed: 0,VARIABLE,DESCRIPTION,SOURCE,Activate,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49
0,IADLHP31,IADL Screener – RD 3/1,HE 1-3,0.0,,,,,,,...,,,,,,,,,,
1,ADLHLP31,ADL Screener – RD 3/1,HE 4-6,0.0,,,,,,,...,,,,,,,,,,
2,AIDHLP31,Used Assistive Devices – RD 3/1,HE 7-8,1.0,,,,,,,...,,,,,,,,,,
3,WLKLIM31,Limitation in Physical Functioning – RD 3/1,HE 9-10,1.0,,,,,,,...,,,,,,,,,,
4,LFTDIF31,Difficulty Lifting 10 Pounds – RD 3/1,HE 11,0.0,,,,,,,...,,,,,,,,,,


In [7]:
# Subsets the health status variables for the ones that pertain to children
child_vars = list(deps['VARIABLE'][30:97])
# child_vars

In [8]:
# Creates a list of other variables we want to add
other_vars = ['AGELAST']

In [9]:
# Combines the child variables with additional other variables we want to add
child_vars = child_vars + other_vars
child_vars

['CHPMED42',
 'CHPMHB42',
 'CHPMCN42',
 'CHSERV42',
 'CHSRHB42',
 'CHSRCN42',
 'CHLIMI42',
 'CHLIHB42',
 'CHLICO42',
 'CHTHER42',
 'CHTHHB42',
 'CHTHCO42',
 'CHCOUN42',
 'CHEMPB42',
 'CSHCN42',
 'MOMPRO42',
 'DADPRO42',
 'UNHAP42',
 'SCHLBH42',
 'HAVFUN42',
 'ADUPRO42',
 'NERVAF42',
 'SIBPRO42',
 'KIDPRO42',
 'SPRPRO42',
 'SCHPRO42',
 'HOMEBH42',
 'TRBLE42',
 'CHILCR42',
 'CHILWW42',
 'CHRTCR42',
 'CHRTWW42',
 'CHAPPT42',
 'CHNDCR42',
 'CHENEC42',
 'CHLIST42',
 'CHEXPL42',
 'CHRESP42',
 'CHPRTM42',
 'CHHECR42',
 'CHSPEC42',
 'CHEYRE42',
 'MESHGT42',
 'WHNHGT42',
 'MESWGT42',
 'WHNWGT42',
 'CHBMIX42',
 'MESVIS42',
 'MESBPR42',
 'WHNBPR42',
 'DENTAL42',
 'WHNDEN42',
 'EATHLT42',
 'WHNEAT42',
 'PHYSCL42',
 'WHNPHY42',
 'SAFEST42',
 'WHNSAF42',
 'BOOST42',
 'WHNBST42',
 'LAPBLT42',
 'WHNLAP42',
 'HELMET42',
 'WHNHEL42',
 'NOSMOK42',
 'WHNSMK42',
 'TIMALN42',
 'AGELAST']

In [10]:
# Filters the age of children between 5-17
child_df = df.loc[(df['AGELAST'] <= 17) & (df['AGELAST'] >= 5), child_vars]
child_df

Unnamed: 0,CHPMED42,CHPMHB42,CHPMCN42,CHSERV42,CHSRHB42,CHSRCN42,CHLIMI42,CHLIHB42,CHLICO42,CHTHER42,...,BOOST42,WHNBST42,LAPBLT42,WHNLAP42,HELMET42,WHNHEL42,NOSMOK42,WHNSMK42,TIMALN42,AGELAST
2,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,-1.0,-1.0,1.0,1.0,2.0,-1.0,2.0,-1.0,2.0,17.0
3,1.0,1.0,1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,-1.0,-1.0,2.0,-1.0,2.0,-1.0,2.0,-1.0,2.0,14.0
9,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,1.0,3.0,-1.0,-1.0,1.0,1.0,2.0,-1.0,-1.0,10.0
13,1.0,1.0,1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,-1.0,-1.0,2.0,-1.0,2.0,-1.0,2.0,-1.0,2.0,12.0
14,1.0,1.0,1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,2.0,-1.0,-1.0,-1.0,2.0,-1.0,2.0,-1.0,-1.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31867,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,2.0,-1.0,-1.0,-1.0,2.0,-1.0,1.0,2.0,-1.0,6.0
31870,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,1.0,3.0,-1.0,-1.0,1.0,3.0,1.0,3.0,-1.0,9.0
31871,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,1.0,3.0,-1.0,-1.0,1.0,3.0,1.0,3.0,-1.0,5.0
31878,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,...,-1.0,-1.0,2.0,-1.0,2.0,-1.0,2.0,-1.0,1.0,13.0


In [11]:
# Checks the baseline/class balance of target variable
child_df['UNHAP42'].value_counts()

 0.0     4358
 1.0      879
 2.0      499
-1.0      195
 3.0      119
 4.0       65
-7.0       19
-8.0        8
-9.0        7
 99.0       6
Name: UNHAP42, dtype: int64

In [12]:
# filters the target variable to be between 0 and 4
hap_filter =(child_df['UNHAP42'] >= 0) & (child_df['UNHAP42'] <= 4)
child_df = child_df[hap_filter]

In [13]:
# Re-assigns the response variable into binary classes
child_df['UNHAP42'] = child_df['UNHAP42'].map({0:0, 1:1, 2:1, 3:1, 4:1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child_df['UNHAP42'] = child_df['UNHAP42'].map({0:0, 1:1, 2:1, 3:1, 4:1})


In [14]:
# Rechecks the balance/baseline of the target variable
child_df['UNHAP42'].value_counts(normalize=True)

0    0.736149
1    0.263851
Name: UNHAP42, dtype: float64

### Modeling with MICE

In [156]:
# Sets up X and y
X = child_df.drop(columns='UNHAP42')
y = child_df['UNHAP42']

In [157]:
X.shape

(5920, 67)

In [158]:
# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=42)

In [159]:
X_train.shape

(4440, 67)

In [160]:
# Creates the new train dataset with nulls
new_X_train = X_train[X_train > 0]
new_X_test = X_test[X_test > 0]

In [161]:
new_X_train.isnull().sum()

CHPMED42       5
CHPMHB42    3602
CHPMCN42    3663
CHSERV42      12
CHSRHB42    3946
            ... 
WHNHEL42    2426
NOSMOK42      77
WHNSMK42    2005
TIMALN42    2542
AGELAST        0
Length: 67, dtype: int64

In [162]:
# Drops columns that have less than 4400 non-null values
new_X_train.dropna(axis=1, thresh=4400, inplace=True)
new_X_test = new_X_test[new_X_train.columns]

In [163]:
new_X_train.shape

(4440, 10)

In [164]:
new_X_train.isnull().sum()

CHPMED42     5
CHSERV42    12
CHLIMI42     8
CHTHER42     5
CHCOUN42     4
CSHCN42      0
CHILCR42     8
CHRTCR42    25
CHSPEC42    14
AGELAST      0
dtype: int64

In [165]:
# set(X_train.columns).difference(set(new_X_train.columns))

{'ADUPRO42',
 'BOOST42',
 'CHAPPT42',
 'CHBMIX42',
 'CHEMPB42',
 'CHENEC42',
 'CHEXPL42',
 'CHEYRE42',
 'CHHECR42',
 'CHILWW42',
 'CHLICO42',
 'CHLIHB42',
 'CHLIST42',
 'CHNDCR42',
 'CHPMCN42',
 'CHPMHB42',
 'CHPRTM42',
 'CHRESP42',
 'CHRTWW42',
 'CHSRCN42',
 'CHSRHB42',
 'CHTHCO42',
 'CHTHHB42',
 'DADPRO42',
 'DENTAL42',
 'EATHLT42',
 'HAVFUN42',
 'HELMET42',
 'HOMEBH42',
 'KIDPRO42',
 'LAPBLT42',
 'MESBPR42',
 'MESHGT42',
 'MESVIS42',
 'MESWGT42',
 'MOMPRO42',
 'NERVAF42',
 'NOSMOK42',
 'PHYSCL42',
 'SAFEST42',
 'SCHLBH42',
 'SCHPRO42',
 'SIBPRO42',
 'SPRPRO42',
 'TIMALN42',
 'TRBLE42',
 'WHNBPR42',
 'WHNBST42',
 'WHNDEN42',
 'WHNEAT42',
 'WHNHEL42',
 'WHNHGT42',
 'WHNLAP42',
 'WHNPHY42',
 'WHNSAF42',
 'WHNSMK42',
 'WHNWGT42'}

In [166]:
# start the MICE training, by transforming X_train
X_train_imp = mice(new_X_train.values)

In [167]:
# Mice training, transforming/filling nulls in X_test set
X_test_imp = mice(new_X_test.values)

In [168]:
# Instantiates a Random Forest pruned version. 
rf_pruned = RandomForestClassifier(max_depth=10,
                                   min_samples_leaf=2,
                                   min_samples_split=3,
                                   n_estimators=307,
                                   random_state=42)

In [169]:
# Fits to the pruned random forest
rf_pruned.fit(X_train_imp, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=3,
                       n_estimators=307, random_state=42)

In [170]:
# Evaluates the pruned random forest
rf_pruned.score(X_train_imp, y_train), rf_pruned.score(X_test_imp, y_test)

(0.8065315315315316, 0.7804054054054054)

#### Try with SMOTE the imputed data

In [172]:
# Instantiates the pipeline for using SMOTE
pipe = Pipeline([
    ('over', SMOTE(random_state=42)), 
    ('under',RandomUnderSampler(random_state=42)), 
    ('rf', RandomForestClassifier(n_estimators=307, 
                                  max_depth=10, 
                                  min_samples_leaf=2,
                                  min_samples_split=3,
                                  random_state=42))
])

In [173]:
# pipe params
pipe_params = {'over__sampling_strategy':[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
               'under__sampling_strategy' : [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],  
              }

In [174]:
# Instantiates the gridsearch
gs = GridSearchCV(pipe, 
                  param_grid=pipe_params,
                  cv=3, 
                  verbose=1,
                  n_jobs=4)

In [175]:
# Fits to the gridsearch
gs.fit(X_train_imp, y_train)

Fitting 3 folds for each of 49 candidates, totalling 147 fits


 0.73130631        nan 0.78153153 0.775      0.7731982  0.76058559
 0.75067568 0.74099099        nan        nan 0.77725225 0.77297297
 0.76846847 0.76103604 0.7509009         nan        nan        nan
 0.775      0.76846847 0.76441441 0.75540541        nan        nan
        nan        nan 0.76891892 0.76351351 0.76013514        nan
        nan        nan        nan        nan 0.76554054 0.75968468
        nan        nan        nan        nan        nan        nan
 0.76103604]


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('over', SMOTE(random_state=42)),
                                       ('under',
                                        RandomUnderSampler(random_state=42)),
                                       ('rf',
                                        RandomForestClassifier(max_depth=10,
                                                               min_samples_leaf=2,
                                                               min_samples_split=3,
                                                               n_estimators=307,
                                                               random_state=42))]),
             n_jobs=4,
             param_grid={'over__sampling_strategy': [0.4, 0.5, 0.6, 0.7, 0.8,
                                                     0.9, 1.0],
                         'under__sampling_strategy': [0.4, 0.5, 0.6, 0.7, 0.8,
                                                      0.9, 1.0]},
             ver

In [176]:
# Evaluates the grid search's accuracy
gs.score(X_train_imp, y_train), gs.score(X_test_imp, y_test)

(0.8058558558558558, 0.777027027027027)

In [177]:
# Best cross val gridsearch score
gs.best_score_

0.7822072072072072

In [178]:
# Best params
gs.best_params_

{'over__sampling_strategy': 0.4, 'under__sampling_strategy': 0.4}