In [None]:
import pickle
import pandas as pd
import csv
import re
from pandas import read_csv
import datetime
import numpy as np
import xgboost as xgb

##For Analysis
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import shap

##
## ===> Visualization <===
##
import seaborn as sns
# Initialise seaborn with its defaults, then switch the plot style to 'whitegrid'.
# NOTE(review): the second call only overrides the style part of what sns.set() configured — confirm against the installed seaborn version.
sns.set()
sns.set_style('whitegrid')
import matplotlib.pyplot as plt

# Widen the pandas display limits so wide/long frames are shown without truncation.
display_settings = {
    'display.width': 1000,
    'display.max_columns': 300,
    'display.max_rows': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the cleaned Australian weather dataset (path is relative to the notebook's working directory).
weather_in_aus_cleaned = pd.read_csv('../input/weatheraus-cleaned/weatherAUS_cleaned.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
weather_in_aus_cleaned.info()

# Preview the first 100 rows as the cell's displayed output.
weather_in_aus_cleaned.head(100)

In [None]:
# Separate the response (y) from the candidate predictors (X).
target_column = 'raintomorrow_encoded'
y = weather_in_aus_cleaned[target_column]
X = weather_in_aus_cleaned.drop(columns=[target_column])

# 80% training / 20% testing split, stratified on y so both partitions keep
# the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.20,
    stratify=y,
    random_state=2021,
)

In [None]:
# One-hot encode 'Region'. The split has already happened, so each partition is
# encoded on its own; two safeguards keep the partitions usable downstream:
#   * dtype='uint8' pins the indicator columns to a numeric dtype. pandas >= 2.0
#     defaults to bool, and a frame mixing bool and float columns converts to an
#     object array, which statsmodels rejects.
#   * the test frame is re-aligned to the training columns, so a region that is
#     absent from one partition cannot silently produce mismatched feature sets
#     (a missing indicator column is added and filled with 0).
X_train = pd.get_dummies(X_train, prefix_sep="_", columns=['Region'], dtype='uint8')

X_test = pd.get_dummies(X_test, prefix_sep="_", columns=['Region'], dtype='uint8')
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Keep only the predictors that are evaluated in their original form in the model trials.
# The column list is written once and applied to both partitions; DataFrame.filter
# keeps the listed columns that exist and drops everything else.
model_trial_columns = [
    'MinTemp', 'MaxTemp', 'Evaporation', 'Sunshine', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
    'raintoday_encoded', 'weather_reading_month',
    'WindGustDir_encoded', 'WindDir9am_encoded', 'WindDir3pm_encoded',
    'Region_New South Wales', 'Region_Northern Territory', 'Region_Queensland',
    'Region_South Australia', 'Region_Tasmania', 'Region_Victoria',
    'Region_Western Australia',
]

X_train = X_train.filter(items=model_trial_columns)
X_test = X_test.filter(items=model_trial_columns)


In [None]:
#First Trial will be with Logistic Regression.
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive Feature Elimination (RFE) for logistic regression.
# RFE reports which features are important, together with an importance ranking.
# The wrapped estimator is a liblinear logistic regression with a fixed seed.
logit_estimator = LogisticRegression(random_state=2021, solver='liblinear')

# step=1: one feature is removed per elimination round.
rfe_logit = RFE(logit_estimator, step=1)
rfe_logit.fit(X=X_train, y=y_train)

In [None]:
# Summarise the RFE outcome: one row per training feature, with the boolean
# support_ mask stored in the 'importance' column (True = feature kept by RFE).
logit_feat = pd.DataFrame({
    'feature_name': X_train.columns,
    'importance': rfe_logit.support_,
})

# Raw ranking array, in the same order as X_train.columns.
print(rfe_logit.ranking_)
logit_feat

In [None]:
# Restrict both partitions to the features retained for the logistic regression
# model (the lowest-importance columns are dropped). One shared list keeps the
# training and test frames on identical columns.
logit_trial1_columns = [
    'MaxTemp', 'Sunshine', 'WindGustSpeed', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud3pm', 'raintoday_encoded',
    'Region_New South Wales', 'Region_Northern Territory', 'Region_Tasmania',
    'Region_Victoria', 'Region_Western Australia',
]
X_train_reduced = X_train.filter(items=logit_trial1_columns)
X_test_reduced = X_test.filter(items=logit_trial1_columns)

In [None]:
# Statsmodels logistic regression, fitted for visibility of the coefficients.
# sm.Logit does not add an intercept on its own, so a constant column is added
# explicitly. Without it the fit is forced through the origin and is not
# comparable to sklearn's LogisticRegression, which fits an intercept by default.
# astype(float) hands statsmodels a purely numeric design matrix even when the
# one-hot columns are stored as bool/uint8.
logit_sm = sm.Logit(y_train, sm.add_constant(X_train_reduced.astype(float)))
result_logit_sm = logit_sm.fit()
result_logit_sm.summary()

In [None]:
#checking for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix with a constant column added; it is used for the VIF calculation only.
X_train_constant_vif = sm.add_constant(X_train_reduced)

# One variance inflation factor per column of that design matrix.
vif_design_matrix = X_train_constant_vif.values
vif_column_count = vif_design_matrix.shape[1]
vif = [
    variance_inflation_factor(vif_design_matrix, column_position)
    for column_position in range(vif_column_count)
]

# Skip the first entry (the constant) and show the remaining VIFs as one row
# labelled by feature. Multicollinearity is interpreted as high when VIF > 5;
# per the original note, this only seems to happen for the Pressure features.
pd.DataFrame({'vif': vif[1:]}, index=X_train_reduced.columns).T

In [None]:
# Scikit-learn logistic regression (trial 1), used for prediction.
# Configure the estimator first, then fit it; fit() returns the fitted estimator.
model1 = LogisticRegression(random_state=2021, solver='liblinear')
model1 = model1.fit(X_train_reduced, y_train)

In [None]:
# 10-fold cross-validation accuracy scores for model 1 on the training data.
model1_cvs = cross_val_score(model1, X_train_reduced, y_train, cv=10)

# A cell only displays its final expression, so the bare .max()/.min() calls
# were evaluated and thrown away. Collect all three statistics into a single
# displayed Series instead.
pd.Series(
    {
        'max': model1_cvs.max(),
        'min': model1_cvs.min(),
        'mean': model1_cvs.mean(),
    },
    name='model1_cv_score',
)

In [None]:
# Predict the response for the held-out test features with model 1.
y_pred = model1.predict(X=X_test_reduced)

In [None]:
# Test-set accuracy of model 1.
# Value noted in the original cell: 0.8516483516483516
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Classification report for the logistic regression model (trial 1).
model1_report = classification_report(y_true=y_test, y_pred=y_pred)
print(model1_report)

In [None]:
# Confusion matrix for the logistic regression model (trial 1).
# Layout:
#   upper-left  = True Negatives (TN)    upper-right = False Positives (FP)
#   lower-left  = False Negatives (FN)   lower-right = True Positives (TP)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# ROC AUC for the logistic regression model (trial 1).
# AUC measures how well the model ranks cases, so it needs a continuous score:
# the predicted probability of the positive class, taken from the second column
# of predict_proba (columns follow model1.classes_). Scoring the hard 0/1
# predictions collapses the ROC curve to a single threshold; the value noted
# previously (0.7378248808683444) was computed that way.
y_proba = model1.predict_proba(X_test_reduced)[:, 1]
roc_auc_score(y_test, y_proba)

In [None]:
# F1 score for the logistic regression model (trial 1), binary averaging.
# A poor F1 score is close to 0.0; the best F1 score is close to 1.
# Value noted in the original cell: 0.6133949191685911
f1_score(y_true=y_test, y_pred=y_pred, average='binary')

In [None]:
#Pressure9am and Pressure3pm demonstrate high multicollinearity. Removing one of them and attempting to re-fit the Logit model for a second trial.

In [None]:
# Second trial: same feature set as before but without Pressure9am, to re-evaluate the fit.
# One shared list keeps the training and test frames on identical columns.
logit_trial2_columns = [
    'MaxTemp', 'Sunshine', 'WindGustSpeed', 'Humidity3pm',
    'Pressure3pm', 'Cloud3pm', 'raintoday_encoded',
    'Region_New South Wales', 'Region_Northern Territory', 'Region_Tasmania',
    'Region_Victoria', 'Region_Western Australia',
]
X_train_reduced2 = X_train.filter(items=logit_trial2_columns)
X_test_reduced2 = X_test.filter(items=logit_trial2_columns)

In [None]:
# Statsmodels logistic regression for the second trial, fitted for visibility
# of the coefficients.
# sm.Logit does not add an intercept on its own, so a constant column is added
# explicitly. Without it the fit is forced through the origin and is not
# comparable to sklearn's LogisticRegression, which fits an intercept by default.
# astype(float) hands statsmodels a purely numeric design matrix even when the
# one-hot columns are stored as bool/uint8.
logit_sm2 = sm.Logit(y_train, sm.add_constant(X_train_reduced2.astype(float)))
result_logit_sm2 = logit_sm2.fit()
result_logit_sm2.summary()

In [None]:
# Second multicollinearity check; per the original note, the earlier concern is now resolved.
# Design matrix with a constant column added; it is used for the VIF calculation only.
X_train_constant_vif2 = sm.add_constant(X_train_reduced2)

# One variance inflation factor per column of that design matrix.
vif2_design_matrix = X_train_constant_vif2.values
vif2_column_count = vif2_design_matrix.shape[1]
vif2 = [
    variance_inflation_factor(vif2_design_matrix, column_position)
    for column_position in range(vif2_column_count)
]

# Skip the first entry (the constant) and show the remaining VIFs as one row
# labelled by feature. Multicollinearity is interpreted as high when VIF > 10, or even 5.
pd.DataFrame({'vif': vif2[1:]}, index=X_train_reduced2.columns).T

In [None]:
# Scikit-learn logistic regression (trial 2), used for prediction.
# Configure the estimator first, then fit it; fit() returns the fitted estimator.
model2 = LogisticRegression(random_state=2021, solver='liblinear')
model2 = model2.fit(X_train_reduced2, y_train)

In [None]:
# 10-fold cross-validation accuracy scores for model 2 on the training data.
# The features must be the trial-2 set (X_train_reduced2, without Pressure9am):
# cross-validating on X_train_reduced re-scored the trial-1 feature set and
# therefore did not evaluate model 2 at all.
model2_cvs = cross_val_score(model2, X_train_reduced2, y_train, cv=10)

# A cell only displays its final expression, so the bare .max()/.min() calls
# were evaluated and thrown away. Collect all three statistics into a single
# displayed Series instead.
pd.Series(
    {
        'max': model2_cvs.max(),
        'min': model2_cvs.min(),
        'mean': model2_cvs.mean(),
    },
    name='model2_cv_score',
)

In [None]:
# Predict the response for the held-out test features with model 2.
y_pred2 = model2.predict(X=X_test_reduced2)

In [None]:
# Test-set accuracy of model 2.
# Value noted in the original cell: 0.8456221198156681
accuracy_score(y_true=y_test, y_pred=y_pred2)

In [None]:
# Classification report for the logistic regression model (trial 2).
model2_report = classification_report(y_true=y_test, y_pred=y_pred2)
print(model2_report)

In [None]:
# Confusion matrix for the logistic regression model (trial 2).
# Layout:
#   upper-left  = True Negatives (TN)    upper-right = False Positives (FP)
#   lower-left  = False Negatives (FN)   lower-right = True Positives (TP)
confusion_matrix(y_true=y_test, y_pred=y_pred2)

In [None]:
# ROC AUC for the logistic regression model (trial 2).
# AUC measures how well the model ranks cases, so it needs a continuous score:
# the predicted probability of the positive class, taken from the second column
# of predict_proba (columns follow model2.classes_). Scoring the hard 0/1
# predictions collapses the ROC curve to a single threshold; the value noted
# previously (0.7283298838376321) was computed that way.
y_proba2 = model2.predict_proba(X_test_reduced2)[:, 1]
roc_auc_score(y_test, y_proba2)

In [None]:
# F1 score for the logistic regression model (trial 2), binary averaging.
# A poor F1 score is close to 0.0; the best F1 score is close to 1.
# Value noted in the original cell: 0.5967592592592593
f1_score(y_true=y_test, y_pred=y_pred2, average='binary')