## Random Forest Modeling using SMOTE and PCA

In [103]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## week 3 imports
import missingno as msno     # msno.bar(titanic);  or msno.matrix(titanic);
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Linear and general modeling imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Feature Engineering
from sklearn.impute import SimpleImputer   # Imputation 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures   # Scale/transform/feature engineering

import patsy
# y, X = patsy.dmatrices(formula, data=diamonds, return_type='dataframe')

# GridSearch and Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
########from imblearn.pipeline import Pipeline, make_pipeline

# Logistic and Classification metrics
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score, classification_report

# K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# naive bayes imports
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# SVMs
from sklearn.svm import LinearSVC, SVC

# Decision Trees
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

# Import Bagging, Boosting, and Random Forests, and ExtraTrees (Extremely Randomized Trees)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor

# NLP imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.download()  --> Download all, and then restart jupyter lab
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, pos_tag
import re

import json


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [104]:
# imports SMOTE (oversampling) and undersampling packages
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [105]:
# imports library for PCA
from sklearn.decomposition import PCA

In [106]:
# Read the two inputs: the feather table that is filtered for modeling and the
# CSV whose 'VARIABLE' column supplies the feature names.
survey_path = '../../data/h201.feather'
health_vars_path = '../../data/Health Status Variables.csv'

df = pd.read_feather(survey_path)
health_vars = pd.read_csv(health_vars_path)

In [107]:
# The cleaning function to filter df by age, and Unhappiness responses.
# Also turns unhappiness target variable into a binary
def clean_df(df):
    """Filter rows by age and unhappiness response, then binarise the target.

    Keeps rows where ``AGELAST`` is between 5 and 17 and ``UNHAP42`` is between
    0 and 4 (both ranges inclusive), then recodes ``UNHAP42`` so that 0 stays 0
    and 1, 2, 3 and 4 all become 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``AGELAST`` and ``UNHAP42`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the kept rows under their original index labels.
        The input frame is not modified.
    """
    keep = df['AGELAST'].between(5, 17) & df['UNHAP42'].between(0, 4)
    # Take an explicit copy before assigning: writing a column into a
    # boolean-filtered slice triggers pandas' SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    cleaned['UNHAP42'] = cleaned['UNHAP42'].map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
    return cleaned

In [108]:
# Apply the age / response filter and target recoding to the raw survey frame.
shaped_df = df.pipe(clean_df)

In [109]:
# Feature groups, each taken as a positional row range of health_vars['VARIABLE'].
variable_names = health_vars['VARIABLE']

# Children with special health care needs screener variables (rows 30-44)
special_needs = list(variable_names.iloc[30:45])

# Columbia impairment scale (rows 45-57)
impairment = list(variable_names.iloc[45:58])

# Consumer Assessment of Healthcare Providers and Systems, CAHPS (rows 58-71)
cahps = list(variable_names.iloc[58:72])

# Physical features
phys = ['CHBMIX42', 'WHNPHY42']

# All children variables (rows 30-96)
all_child = list(variable_names.iloc[30:97])

# Other identifier/demographic variables
other = ['AGELAST', 'SEX', 'RACEV2X', 'FAMINC17', 'ADHDADDX']

In [110]:
# Modeling columns: every child variable followed by the identifier/demographic ones.
child_feat = [*all_child, *other]

In [111]:
# Restrict the cleaned frame to the selected child-related columns (all rows kept).
child_df = shaped_df.loc[:, child_feat]

In [112]:
# Target is the recoded 'UNHAP42' column; every other selected column is a feature.
target_col = 'UNHAP42'
y = child_df[target_col]
X = child_df.drop(columns=[target_col])

# Alternative kept for reference (uses every column of shaped_df):
# X = shaped_df.drop(columns='UNHAP42')
# y = shaped_df['UNHAP42']

In [113]:
# Train/test split, stratified on the target so both splits keep the class
# balance; the fixed seed makes the split repeatable.
split_options = dict(stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Pipeline
* Standard Scale
* SMOTE over sample
* Random under sample (majority class)
* PCA
* Random Forest Classifier

In [114]:
# Learn the scaling statistics from the training split only, then apply that
# same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [115]:
# Rebalance the scaled training split; the test split is not resampled.
#   1. SMOTE over-samples with sampling_strategy=0.8.
#   2. RandomUnderSampler then under-samples with sampling_strategy=0.8.
# NOTE(review): both steps target the same 0.8 ratio, so step 2 presumably has
# very little left to remove after step 1 - confirm this is intended.
smote = SMOTE(sampling_strategy=0.8, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

# Over-sample first, then feed that result to the under-sampler.
X_over, y_over = smote.fit_resample(X_train_sc, y_train)
X_train_sc_res, y_train_res = under.fit_resample(X_over, y_over)

print('Resampled dataset shape %s' % Counter(y_train_res))

Resampled dataset shape Counter({0: 3267, 1: 2614})


In [116]:
# Manual PCA alternative for reducing dimensionality (disabled: PCA is applied
# inside the pipeline below instead)
# pca = PCA(n_components=20)
# Z_train = pca.fit_transform(X_train_sc_res)
# Z_test = pca.transform(X_test_sc)

In [117]:
# PCA -> random forest pipeline. PCA's component count is left at its default
# here; the grid search below supplies the candidate values.
rf_settings = dict(
    n_estimators=307,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=3,
    random_state=42,
)
pipe = Pipeline(steps=[
    ('pca', PCA()),
    ('rf', RandomForestClassifier(**rf_settings)),
])

#### GridSearch to find the optimal number of components for PCA

In [127]:
# Search grid for the 'pca' step: component counts 1, 5, 9, 13 and 17.
pipe_params = dict(pca__n_components=range(1, 20, 4))

In [128]:
# 5-fold grid search over the pipeline's PCA component count. No `scoring` is
# passed, so candidates are ranked by the pipeline's default score.
# NOTE(review): the search is later fit on data that was already resampled, so
# validation folds can contain SMOTE-generated rows and the CV score is likely
# optimistic; consider resampling inside an imblearn Pipeline - confirm.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [129]:
# Run the 5-fold search over the PCA component counts on the scaled, resampled
# training data; the fitted search object is displayed as the cell output.
gs.fit(X_train_sc_res, y_train_res)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pca', PCA(n_components=71)),
                                       ('rf',
                                        RandomForestClassifier(max_depth=10,
                                                               min_samples_leaf=2,
                                                               min_samples_split=3,
                                                               n_estimators=307,
                                                               random_state=42))]),
             param_grid={'pca__n_components': range(1, 20, 4)})

In [130]:
# Mean cross-validated score of the best parameter setting found by the search
# (computed on folds of the resampled training data).
gs.best_score_

0.8022474410324877

In [131]:
# Score of the refit best pipeline on (resampled train, scaled but un-resampled test).
# The train figure is measured on data that includes resampled rows, so it is
# not directly comparable to the test figure.
gs.score(X_train_sc_res, y_train_res), gs.score(X_test_sc, y_test)

(0.9103893895595987, 0.7986486486486486)

In [132]:
# PCA component count selected by the search. If this equals the largest value
# in the grid, the optimum may lie beyond the searched range.
gs.best_params_

{'pca__n_components': 17}

#### Pipeline with best number of components for PCA

In [139]:
# PCA -> random forest pipeline with a fixed PCA component count.
# NOTE(review): n_components=71 is not the value reported by gs.best_params_
# (17) and lies outside the searched grid - confirm which value is intended.
pca_step = PCA(n_components=71)
rf_step = RandomForestClassifier(
    n_estimators=307,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=3,
    random_state=42,
)
pipe = Pipeline([('pca', pca_step), ('rf', rf_step)])

In [140]:
# Fit PCA followed by the random forest on the scaled, resampled training data;
# the fitted pipeline is displayed as the cell output.
pipe.fit(X_train_sc_res, y_train_res)
# Variant without resampling: pipe.fit(X_train_sc, y_train)

Pipeline(steps=[('pca', PCA(n_components=71)),
                ('rf',
                 RandomForestClassifier(max_depth=10, min_samples_leaf=2,
                                        min_samples_split=3, n_estimators=307,
                                        random_state=42))])

In [141]:
# Score of the fitted pipeline on (resampled train, scaled but un-resampled test).
pipe.score(X_train_sc_res, y_train_res), pipe.score(X_test_sc, y_test)

(0.9488182281924843, 0.8081081081081081)