## Import dependencies and read csv

In [6]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier

In [41]:
#credentials for database

#from sqlalchemy import create_engine
#from config_db import db_password
# Read ml_input table into dataframe
db_string = f”postgresql://postgres:{db_password}@127.0.0.1:5432/stackoverflow”
engine = create_engine(db_string)
# table named ‘ml_input’ will be returned as a dataframe.
ml_input_df = pd.read_sql_table(‘ml_input’, engine)
ml_input_df

SyntaxError: invalid character in identifier (<ipython-input-41-c4bce657f015>, line 6)

In [7]:
#csv for testing our ML models

file_path='../Resources/ML_Input_Jan2021.csv'
df=pd.read_csv(file_path)

In [8]:
df.head()

Unnamed: 0,q_id,accepted_answer_id,q_score,q_score_tier,q_view_count,q_view_count_bin,q_title_char_count,q_title_char_count_bin,q_title_word_count,q_title_word_count_bin,q_body_word_count,q_body_len_bin,q_tags_count,q_day,q_hour,q_hour_min,accepted_answer_duration
0,65526420,65526457,2,Positive Score (>0),62,50-16000,72,Medium (50-100),13,Medium (10-20),116,100-250,3,Friday,0,00:05,0.122066
1,65526423,65526533,2,Positive Score (>0),48,40-50,48,Short (0 - 50),8,Short (0 - 10),58,50-100,2,Friday,0,00:06,0.475172
2,65526490,65526541,2,Positive Score (>0),35,30-40,81,Medium (50-100),13,Medium (10-20),117,100-250,2,Friday,0,00:20,0.287423
3,65526419,65526554,3,Positive Score (>0),351,50-16000,76,Medium (50-100),9,Short (0 - 10),50,<50,4,Friday,0,00:05,0.575997
4,65526523,65526577,2,Positive Score (>0),117,50-16000,82,Medium (50-100),14,Medium (10-20),305,250-500,3,Friday,0,00:30,0.253412


## Data Preprocessing

- Dropped null values and dropped uncessary columns
- Binned data in accepted_answer_duration

In [9]:
#drop identification columns also q_hour_min and q_body_len_bin columns as they are redundant to other columns

filter_df=df[['q_title_word_count','q_title_word_count_bin','q_body_word_count','q_body_len_bin','q_tags_count','q_day','q_hour','accepted_answer_duration']]
filter_df.head()

Unnamed: 0,q_title_word_count,q_title_word_count_bin,q_body_word_count,q_body_len_bin,q_tags_count,q_day,q_hour,accepted_answer_duration
0,13,Medium (10-20),116,100-250,3,Friday,0,0.122066
1,8,Short (0 - 10),58,50-100,2,Friday,0,0.475172
2,13,Medium (10-20),117,100-250,2,Friday,0,0.287423
3,9,Short (0 - 10),50,<50,4,Friday,0,0.575997
4,14,Medium (10-20),305,250-500,3,Friday,0,0.253412


In [14]:
filter_df=filter_df.dropna()

In [15]:
#bin accepted_answer_duration

answer_bins = [0, 24, 6000]
answer_bins_group_names = ["<1D", ">1D"]

# Categorize score based on the bins.
filter_df['accepted_answer_duration_bin'] = pd.cut(filter_df['accepted_answer_duration'], answer_bins, labels=answer_bins_group_names)

In [16]:
filter_df.head()

Unnamed: 0,q_title_word_count,q_title_word_count_bin,q_body_word_count,q_body_len_bin,q_tags_count,q_day,q_hour,accepted_answer_duration,accepted_answer_duration_bin
0,13,Medium (10-20),116,100-250,3,Friday,0,0.122066,<1D
1,8,Short (0 - 10),58,50-100,2,Friday,0,0.475172,<1D
2,13,Medium (10-20),117,100-250,2,Friday,0,0.287423,<1D
3,9,Short (0 - 10),50,<50,4,Friday,0,0.575997,<1D
4,14,Medium (10-20),305,250-500,3,Friday,0,0.253412,<1D


In [17]:
filter_df.dtypes

q_title_word_count                 int64
q_title_word_count_bin            object
q_body_word_count                  int64
q_body_len_bin                    object
q_tags_count                       int64
q_day                             object
q_hour                             int64
accepted_answer_duration         float64
accepted_answer_duration_bin    category
dtype: object

## Create features and encode our features using pd.get_dummies

## Run Models Using Binned Features

In [20]:
# Create our features
X_binned = filter_df.drop(['q_title_word_count','q_body_word_count','accepted_answer_duration','accepted_answer_duration_bin'], axis=1)
X_binned = pd.get_dummies(X_binned)

# Create our target

y = filter_df["accepted_answer_duration_bin"]

X_binned.head()

Unnamed: 0,q_tags_count,q_hour,q_title_word_count_bin_Long (20-30),q_title_word_count_bin_Medium (10-20),q_title_word_count_bin_Short (0 - 10),q_title_word_count_bin_XL (30+),q_body_len_bin_100-250,q_body_len_bin_250-500,q_body_len_bin_50-100,q_body_len_bin_500-10000,q_body_len_bin_<50,q_day_Friday,q_day_Monday,q_day_Saturday,q_day_Sunday,q_day_Thursday,q_day_Tuesday,q_day_Wednesday
0,3,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0
2,2,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0
4,3,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0


In [21]:
# Check the balance of our target values
y.value_counts()

<1D    386497
>1D     61007
Name: accepted_answer_duration_bin, dtype: int64

## Split data to training and testing sets

In [22]:
from sklearn.model_selection import train_test_split
X_binned_train, X_binned_test, y_train, y_test = train_test_split(X_binned,y,random_state=1,stratify=y)

# Random Forest Classifier 
## Fit model

In [23]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc_bin = BalancedRandomForestClassifier(n_estimators=100,random_state=1)
rf_bin = brfc_bin.fit(X_binned_train,y_train)

## Calculate Accuracy

In [24]:
# Calculated the balanced accuracy score
y_pred=rf_bin.predict(X_binned_test)
ba_balanced_forest_binned=balanced_accuracy_score(y_test,y_pred)
ba_balanced_forest_binned

0.5529495476166002

## Display Confusion Matrix


In [26]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)

cm_rf_binned_df=pd.DataFrame(cm,
                  index=["Actual Less Than 24 Hours", "Actual Greater Than 24 Hours"],
                  columns=["Predicted Less Than 24 Hours", "Predicted Greater Than 24 Hours"])
cm_rf_binned_df

Unnamed: 0,Predicted Less Than 24 Hours,Predicted Greater Than 24 Hours
Actual Less Than 24 Hours,52583,44041
Actual Greater Than 24 Hours,6685,8567


## Print additional scores for analysis: precision, recall, and f1
## See summary in comparison section below

In [54]:
#imbalanced classification report
icr_balanced_forest_binned=classification_report_imbalanced(y_test,y_pred)

## Feature Importance

In [30]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_bin.feature_importances_, X_binned.columns), reverse=True)

[(0.6448099311012391, 'q_hour'),
 (0.15952956319103145, 'q_tags_count'),
 (0.030754420450241828, 'q_body_len_bin_500-10000'),
 (0.02255523109476319, 'q_body_len_bin_250-500'),
 (0.017754638309173856, 'q_day_Friday'),
 (0.016938711743055102, 'q_body_len_bin_50-100'),
 (0.012897456760070442, 'q_title_word_count_bin_Short (0 - 10)'),
 (0.012591943934120229, 'q_title_word_count_bin_Medium (10-20)'),
 (0.009981839334289895, 'q_day_Wednesday'),
 (0.009796000398893046, 'q_body_len_bin_100-250'),
 (0.00965609382440849, 'q_day_Thursday'),
 (0.00952906359147217, 'q_day_Monday'),
 (0.009190572168048906, 'q_day_Saturday'),
 (0.00918473984574853, 'q_day_Tuesday'),
 (0.008944454467710064, 'q_title_word_count_bin_Long (20-30)'),
 (0.008418534047194514, 'q_day_Sunday'),
 (0.007151594451761099, 'q_body_len_bin_<50'),
 (0.0003152112867781374, 'q_title_word_count_bin_XL (30+)')]

# Easy Ensemble Classifier
## Fit Model

In [33]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

ensemble_binned=EasyEnsembleClassifier(n_estimators=100,random_state=1)

eec_binned = ensemble_binned.fit(X_binned_train, y_train)

## Calculate Accuracy

In [35]:
# Calculated the balanced accuracy score
y_pred=eec_binned.predict(X_binned_test)
ba_easy_ensemble_binned=balanced_accuracy_score(y_test,y_pred)
ba_easy_ensemble_binned

0.5718322967326592

## Display Confusion Matrix

In [36]:
# Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
 
cm_ee_binned_df=pd.DataFrame(cm,
                  index=["Actual Less Than 24 Hours", "Actual Greater Than 24 Hours"],
                  columns=["Predicted Less Than 24 Hours", "Predicted Greater Than 24 Hours"])

cm_ee_binned_df

Unnamed: 0,Predicted Less Than 24 Hours,Predicted Greater Than 24 Hours
Actual Less Than 24 Hours,58348,38276
Actual Greater Than 24 Hours,7019,8233


## Print additional scores for analysis: precision, recall, and f1
## See comparison section below for summary

In [38]:
#imbalanced classification report
icr_easy_ensemble_binned=classification_report_imbalanced(y_test,y_pred)

# Logistic Regression 

In [40]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

In [43]:
from sklearn.linear_model import LogisticRegression
classifier_binned = LogisticRegression()
classifier_binned

LogisticRegression()

In [44]:
classifier_binned.fit(X_binned_train, y_train)

LogisticRegression()

In [46]:
print(f"Training Data Score: {classifier_binned.score(X_binned_train, y_train)}")
print(f"Testing Data Score: {classifier_binned.score(X_binned_test, y_test)}")

Training Data Score: 0.8636734718199912
Testing Data Score: 0.8636704923307948


In [47]:
print(f'Actual:\t\t{list(y_test[:10])}')
print("Predicted:\t{}".format(list(classifier_binned.predict(X_binned_test[:10]))))

Actual:		['<1D', '>1D', '>1D', '<1D', '<1D', '<1D', '>1D', '<1D', '<1D', '<1D']
Predicted:	['<1D', '<1D', '<1D', '<1D', '<1D', '<1D', '<1D', '<1D', '<1D', '<1D']


## Calculate Accuracy

In [48]:
# Calculated the balanced accuracy score
y_pred=classifier_binned.predict(X_binned_test)
ba_logistic_regression_binned=balanced_accuracy_score(y_test,y_pred)
ba_logistic_regression_binned

0.5

## Display Confusion Matrix

In [50]:
# Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
 
cm_logreg_binned_df=pd.DataFrame(cm,
                  index=["Actual Less Than 24 Hours", "Actual Greater Than 24 Hours"],
                  columns=["Predicted Less Than 24 Hours", "Predicted Greater Than 24 Hours"])

cm_logreg_binned_df

Unnamed: 0,Predicted Less Than 24 Hours,Predicted Greater Than 24 Hours
Actual Less Than 24 Hours,96624,0
Actual Greater Than 24 Hours,15252,0


## Print additional scores for analysis: precision, recall, and f1

In [52]:
#imbalanced classification report
icr_logistic_regression_binned=classification_report_imbalanced(y_test,y_pred)

## Comparison Between the Models using Binned Features

In [56]:
#Summary of findings

print(f'For the Balanced Random Forest Classifier algortihm, using binned features the balanced accuracy score is {ba_balanced_forest_binned}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_balanced_forest_binned}')

print(f'For the Easy Ensemble AdaBoost Classifier algortihm, using binned features the balanced accuracy score is {ba_easy_ensemble_binned}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_easy_ensemble_binned}')

print(f'For the Logistic Regression algortihm, using binned features the balanced accuracy score is {ba_logistic_regression_binned}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_logistic_regression_binned}')

For the Balanced Random Forest Classifier algortihm, using binned features the balanced accuracy score is 0.5529495476166002

and the imbalanced classifcation report is:

                   pre       rec       spe        f1       geo       iba       sup

        <1D       0.86      1.00      0.00      0.93      0.00      0.00     96624
        >1D       0.00      0.00      1.00      0.00      0.00      0.00     15252

avg / total       0.75      0.86      0.14      0.80      0.00      0.00    111876

For the Easy Ensemble AdaBoost Classifier algortihm, using binned features the balanced accuracy score is 0.5718322967326592

and the imbalanced classifcation report is:

                   pre       rec       spe        f1       geo       iba       sup

        <1D       0.89      0.60      0.54      0.72      0.57      0.33     96624
        >1D       0.18      0.54      0.60      0.27      0.57      0.32     15252

avg / total       0.80      0.60      0.55      0.66      0.57      0.33

## Run Models Using Non-Binned Features

In [61]:
# Create our features
X = filter_df.drop(['accepted_answer_duration','accepted_answer_duration_bin','q_body_len_bin','q_title_word_count_bin'], axis=1)
X = pd.get_dummies(X)

# Create our target

y = filter_df["accepted_answer_duration_bin"]

X.head()

Unnamed: 0,q_title_word_count,q_body_word_count,q_tags_count,q_hour,q_day_Friday,q_day_Monday,q_day_Saturday,q_day_Sunday,q_day_Thursday,q_day_Tuesday,q_day_Wednesday
0,13,116,3,0,1,0,0,0,0,0,0
1,8,58,2,0,1,0,0,0,0,0,0
2,13,117,2,0,1,0,0,0,0,0,0
3,9,50,4,0,1,0,0,0,0,0,0
4,14,305,3,0,1,0,0,0,0,0,0


In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)

# Random Forest Classifier
## Fit model

In [64]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100,random_state=1)
rf = brfc.fit(X_train,y_train)

## Calculate Accuracy

In [65]:
# Calculated the balanced accuracy score
y_pred=rf.predict(X_test)
ba_balanced_forest=balanced_accuracy_score(y_test,y_pred)
ba_balanced_forest

0.5416843322951042

## Display Confusion Matrix

In [66]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)

cm_rf_df=pd.DataFrame(cm,
                  index=["Actual Less Than 24 Hours", "Actual Greater Than 24 Hours"],
                  columns=["Predicted Less Than 24 Hours", "Predicted Greater Than 24 Hours"])
cm_rf_df

Unnamed: 0,Predicted Less Than 24 Hours,Predicted Greater Than 24 Hours
Actual Less Than 24 Hours,53789,42835
Actual Greater Than 24 Hours,7219,8033


## Print additional scores for analysis: precision, recall, and f1

In [58]:
#imbalanced classification report
icr_balanced_forest=classification_report_imbalanced(y_test,y_pred)

## Feature Importance

In [70]:
# List the features sorted in descending order by feature importance
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.5179621297158278, 'q_body_word_count'),
 (0.21197150745482093, 'q_title_word_count'),
 (0.21058713070897694, 'q_hour'),
 (0.03322278365703065, 'q_tags_count'),
 (0.0041018542699754875, 'q_day_Thursday'),
 (0.004037448550990621, 'q_day_Monday'),
 (0.003941187315642853, 'q_day_Wednesday'),
 (0.003926222518898671, 'q_day_Friday'),
 (0.003717719825143495, 'q_day_Tuesday'),
 (0.0032999633360740175, 'q_day_Saturday'),
 (0.0032320526466187134, 'q_day_Sunday')]

# Easy Ensemble Classifier
## Fit Model

In [71]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

ensemble=EasyEnsembleClassifier(n_estimators=100,random_state=1)

eec = ensemble.fit(X_train, y_train)

## Calculate Accuracy

In [None]:
# Calculated the balanced accuracy score
y_pred=eec.predict(X_test)
ba_easy_ensemble=balanced_accuracy_score(y_test,y_pred)
ba_easy_ensemble

## Display Confusion Matrix

In [None]:
# Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
 
cm_df=pd.DataFrame(cm,
                  index=["Actual <1D", "Actual >1D"],
                  columns=["Predicted <1D", "Predicted >1D"])

cm_df

## Print additional scores for analysis: precision, recall, and f1

In [None]:
#imbalanced classification report
icr_easy_ensemble=classification_report_imbalanced(y_test,y_pred)

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train, y_train)

In [None]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
print(f'Actual:\t\t{list(y_test[:10])}')
print("Predicted:\t{}".format(list(classifier.predict(X_test[:10]))))

## Calculate Accuracy

In [61]:
# Calculated the balanced accuracy score
y_pred=classifier.predict(X_test)
ba_logistic_regression=balanced_accuracy_score(y_test,y_pred)
ba_logistic_regression

ValueError: X has 21 features per sample; expecting 25

## Display Confusion Matrix

In [62]:
# Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
 
cm_df=pd.DataFrame(cm,
                  index=["Actual <1D", "Actual >1D"],
                  columns=["Predicted <1D", "Predicted >1D"])

cm_df

Unnamed: 0,Predicted <1D,Predicted >1D
Actual <1D,58915,37695
Actual >1D,5982,9263


In [63]:
#imbalanced classification report
icr_logistic_regression=classification_report_imbalanced(y_test,y_pred)

In [67]:
#Summary of findings

print(f'For the Balanced Random Forest Classifier algortihm, the balanced accuracy score is {ba_balanced_forest}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_balanced_forest}')

print(f'For the Easy Ensemble AdaBoost Classifier algortihm, using binned features the balanced accuracy score is {ba_easy_ensemble_binned}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_easy_ensemble_binned}')

print(f'For the Logistic Regression algortihm, the balanced accuracy score is {ba_logistic_regression}' 
      f'\n\nand the imbalanced classifcation report is:\n\n{icr_logistic_regression}')

NameError: name 'icr_balanced_forest' is not defined