# Modeling

* baseline model
* time series regression analysis - rmse
* random forest/decision tree regression
* knn
* SVM
* gradient boosted trees
* xgboost

Precision = True Positive / Predicted Positives (True Positives + False Positives)

Recall = True Positive / Actual Positives (True Positives + False Negatives)

Accuracy = (True Positive + True Negative) / Total

F1 Score = harmonic mean of precision and recall

## Import Libraries

In [1]:
#Import necessary Libraries

import pandas as pd
import numpy as np

#Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Machine Learning
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')




In [None]:
# For reproducibility: a single seed reused as `random_state` by the models below
seed = 42

## Prepare Data

In [None]:
# Feature matrix: every column of df except the 'aqi' target
X = df.drop(['aqi'], axis=1)
# Target vector: the 'aqi' column
y = df['aqi']

In [2]:
# Hold out a test split. Seeding the shuffle keeps the split, and therefore
# every score reported below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

In [None]:
# Standardize the features. The scaler is fit on the training split only and
# then applied to the test split, so test-set statistics never enter the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Decision Tree

In [None]:
# Instantiate and fit a DecisionTreeClassifier (gini splits, depth capped at 5, seeded)
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed)
tree_clf.fit(X_train, y_train)

In [None]:
# 3-fold cross validation of the decision tree on the training split
tree_cv_score = cross_val_score(
    tree_clf,
    X_train,
    y_train,
    cv=3,
)
# Average the per-fold scores into a single figure
mean_tree_cv_score = tree_cv_score.mean()

print(f"Mean Cross Validation Score: {mean_tree_cv_score :.2%}")

In [None]:
# Display the feature column names of X
X.columns

In [None]:
def plot_feature_importances(model, title='Feature Importance for Model', feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator
        Fitted model exposing a ``feature_importances_`` attribute.
    title : str, optional
        Title placed above the chart.
    feature_names : sequence of str, optional
        One label per importance value. When omitted, the column names of the
        notebook-level ``X`` DataFrame are used.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    # Size the chart from the model itself rather than from a global array,
    # so the bars always match what the model was actually trained on.
    importances = np.asarray(model.feature_importances_)
    if feature_names is None:
        feature_names = X.columns.values
    if len(feature_names) != len(importances):
        raise ValueError(
            "Got {} feature names for {} feature importances".format(
                len(feature_names), len(importances)
            )
        )

    positions = np.arange(len(importances))
    plt.figure(figsize=(8, 8))
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.title(title)

In [3]:
# Chart the feature importances of the fitted decision tree
plot_feature_importances(tree_clf, "Feature Importance for Decision Tree")

In [None]:
# Predict labels for the held-out test split with the decision tree
pred = tree_clf.predict(X_test)

# Confusion matrix first, then the per-class classification report
print(
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep="\n",
)

In [None]:
# Test accuracy as a percentage; "{:.4}" keeps four significant digits
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(y_test, pred) * 100))

In [None]:
# Instantiate a BaggingClassifier: an ensemble of 20 depth-5 gini decision trees,
# seeded so the result is reproducible
bagged_tree =  BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5), 
                                 n_estimators=20, random_state=seed)

In [None]:
# Fit the bagged ensemble to the scaled training data
bagged_tree.fit(X_train, y_train)


In [None]:
# Training accuracy score (scored on the same data the ensemble was fit on)
bagged_tree.score(X_train, y_train)

In [None]:
# Test accuracy score (scored on the held-out split)
bagged_tree.score(X_test, y_test)

## Random Forest

In [None]:
# Instantiate and fit a RandomForestClassifier: 100 trees, a node needs at least
# 50 samples to be split, all CPU cores used, seeded for reproducibility
forest = RandomForestClassifier(n_estimators=100, min_samples_split=50, n_jobs=-1, random_state=seed)
forest.fit(X_train, y_train)

In [None]:
# Training accuracy score of the random forest
forest.score(X_train, y_train)

In [None]:
# Test accuracy score of the random forest
forest.score(X_test, y_test)

In [None]:
# Random-forest predictions for the held-out split
predictions = forest.predict(X_test)

# Train accuracy next to test accuracy, to expose any overfitting gap
print(
    f"Train Accuracy: {accuracy_score(y_train, forest.predict(X_train))}",
    f"Test Accuracy: {accuracy_score(y_test, predictions)}",
    sep="\n",
)

In [None]:
# Chart the feature importances of the fitted random forest
plot_feature_importances(forest, "Feature Importance for Random Forest")


## SMOTE

In [None]:
# Oversample the training split only, so the test split keeps its real class
# balance. `fit_resample` is the current imbalanced-learn API (`fit_sample` was
# removed). np.ravel flattens y_train to the 1-D array that later cells call
# `.ravel()` on.
sm = SMOTE(random_state=seed)
X_train_res, y_train_res = sm.fit_resample(X_train, np.ravel(y_train))

In [None]:
# Compare label counts and array shapes before and after SMOTE oversampling
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

In [None]:
# Instantiate and fit a DecisionTreeClassifier on the SMOTE-resampled training data.
# NOTE: this rebinds tree_clf, replacing the tree fit on the original training split.
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed) 
tree_clf.fit(X_train_res, y_train_res.ravel())

In [None]:
# 3-fold cross validation of the tree on the resampled training data
tree_cv_score = cross_val_score(
    tree_clf,
    X_train_res,
    y_train_res.ravel(),
    cv=3,
)
# Collapse the fold scores into their average
mean_tree_cv_score = tree_cv_score.mean()

print(f"Mean Cross Validation Score: {mean_tree_cv_score :.2%}")

## Regression

In [None]:
# Baseline logistic regression, trained on the original (non-resampled) training split
lr = LogisticRegression()
lr.fit(X_train, y_train.ravel())

# Predict the held-out split
predictions = lr.predict(X_test)

# Classification report first, confusion matrix second
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

In [None]:
# Logistic regression trained on the SMOTE-resampled data,
# still evaluated on the untouched test split
lr1 = LogisticRegression()
lr1.fit(X_train_res, y_train_res.ravel())
predictions = lr1.predict(X_test)

# Classification report first, confusion matrix second
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

In [None]:
# Random Forest Classifier (200 trees) trained on the SMOTE-resampled data.
# Seeded like the other models in this notebook so its metrics are repeatable.
rfc = RandomForestClassifier(n_estimators=200, random_state=seed)
rfc.fit(X_train_res, y_train_res)
pred_rfc = rfc.predict(X_test)

In [None]:
# Test-set report for the random forest: per-class metrics, then the confusion matrix
print(
    classification_report(y_test, pred_rfc),
    confusion_matrix(y_test, pred_rfc),
    sep="\n",
)

In [None]:
# Average 3-fold cross-validation score of the random forest on the resampled data
mean_rf_cv_score = cross_val_score(
    rfc,
    X_train_res,
    y_train_res,
    cv=3,
).mean()

print(f"Mean Cross Validation Score for Random Forest Classifier: {mean_rf_cv_score :.2%}")

## Hyperparameter Tuning

In [None]:
# Exhaustive search over random-forest hyperparameters, scored by 3-fold CV.
# NOTE(review): the folds are cut from already-oversampled data, so synthetic
# neighbours of a validation sample can sit in the training folds; the CV score
# is likely optimistic compared with the test split.
rf_param_grid = {
    'n_estimators': [20, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6]
}

# A seeded base estimator keeps the search reproducible. Every grid combination
# sets n_estimators itself, so nothing else from the earlier `rfc` is needed.
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=seed), rf_param_grid,
                              cv=3, n_jobs=-1, verbose=1)
rf_grid_search.fit(X_train_res, y_train_res)

# best_score_ is the mean cross-validated score of the best parameter set,
# not accuracy on the training data, so it is labelled accordingly.
print(f"Best Mean Cross-Validation Accuracy: {rf_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

In [None]:
# Random Forest Classifier refit with explicit hyperparameters on the resampled data.
# NOTE(review): presumably these values mirror rf_grid_search.best_params_ — confirm.
rfc = RandomForestClassifier(n_estimators=200, min_samples_leaf=3, min_samples_split=5, 
                             random_state=seed, n_jobs=-1)
rfc.fit(X_train_res, y_train_res)
pred_rfc = rfc.predict(X_test)

In [None]:
# Test-set report for the refit random forest: per-class metrics, then the confusion matrix
print(
    classification_report(y_test, pred_rfc),
    confusion_matrix(y_test, pred_rfc),
    sep="\n",
)

In [None]:
# Score the grid search's refit best estimator on the held-out test split
rf_grid_search.score(X_test, y_test)


In [None]:
# Chart the feature importances of the refit random forest
plot_feature_importances(rfc, "Feature Importance for Random Forest")


## Adaboost Classifier

In [None]:
def display_acc_and_f1_score(true, preds, model_name):
    """Print a model's name followed by its accuracy and F1 score.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    preds : array-like
        Labels predicted by the model.
    model_name : str
        Name shown on the first output line.
    """
    # Both metrics are computed up front, before anything is printed
    summary = (
        ("Model: {}", model_name),
        ("Accuracy: {}", accuracy_score(true, preds)),
        ("F1-Score: {}", f1_score(true, preds)),
    )
    for template, value in summary:
        print(template.format(value))

In [None]:
# Instantiate an AdaBoostClassifier with default settings, seeded for reproducibility
adaboost_clf = AdaBoostClassifier(random_state=seed)

In [None]:
# Fit AdaBoost on the SMOTE-resampled training data
adaboost_clf.fit(X_train_res, y_train_res)


In [None]:
# AdaBoost model predictions on the resampled training data and on the test split
adaboost_train_preds = adaboost_clf.predict(X_train_res)
adaboost_test_preds = adaboost_clf.predict(X_test)

In [None]:
# Accuracy and F1 on the (resampled) training data
print("AdaBoost Training Metrics")
display_acc_and_f1_score(y_train_res, adaboost_train_preds, model_name='AdaBoost')
print("")

# The same metrics on the held-out test split
print("AdaBoost Testing Metrics")
display_acc_and_f1_score(y_test, adaboost_test_preds, model_name='AdaBoost')
print("")

# Full per-class report and confusion matrix for the test split
print(
    "AdaBoost Model",
    classification_report(y_test, adaboost_test_preds),
    confusion_matrix(y_test, adaboost_test_preds),
    sep="\n",
)

## Gradient Boosting Classifier

In [None]:
# Instantiate a GradientBoostingClassifier with default settings, seeded for reproducibility
gbt_clf = GradientBoostingClassifier(random_state=seed)

In [None]:
# Fit gradient boosting on the SMOTE-resampled training data
gbt_clf.fit(X_train_res, y_train_res)


In [None]:
# GradientBoosting model predictions on the resampled training data and on the test split
gbt_clf_train_preds = gbt_clf.predict(X_train_res)
gbt_clf_test_preds = gbt_clf.predict(X_test)

In [None]:
# Accuracy and F1 on the (resampled) training data
print("Gradient Boost Training Metrics")
display_acc_and_f1_score(y_train_res, gbt_clf_train_preds, model_name='Gradient Boosted Trees')
print("")

# The same metrics on the held-out test split
print("Gradient Boost Testing Metrics")
display_acc_and_f1_score(y_test, gbt_clf_test_preds, model_name='Gradient Boosted Trees')
print("")

# Full per-class report and confusion matrix for the test split
print(
    "Gradient Boost Model",
    classification_report(y_test, gbt_clf_test_preds),
    confusion_matrix(y_test, gbt_clf_test_preds),
    sep="\n",
)

In [None]:
# Chart the feature importances of the fitted gradient boosting model
plot_feature_importances(gbt_clf, "Feature Importance for Gradient Boosting")

## XGBoost