In [None]:
import pickle
import pandas as pd
import csv
import re
from pandas import read_csv
import datetime
import numpy as np
import xgboost as xgb

##For Analysis
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import shap

##
## ===> Visualization <===
##
import seaborn as sns
# Apply seaborn's default theme with the 'whitegrid' axes style in one call.
sns.set_theme(style='whitegrid')
import matplotlib.pyplot as plt

# Widen pandas' console/notebook rendering so wide frames are shown in full.
_display_options = {
    'display.width': 1000,
    'display.max_columns': 300,
    'display.max_rows': 1000,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
#Importing Cleaned Weather Data for Australia
weather_in_aus_cleaned = pd.read_csv('../input/weatheraus-cleaned/weatherAUS_cleaned.csv')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
weather_in_aus_cleaned.info()
# Preview the first 100 rows as the cell's rich output.
weather_in_aus_cleaned.head(100)

In [None]:
#Separating the target (y) from the feature columns (X)
target_column = 'raintomorrow_encoded'
y = weather_in_aus_cleaned.filter(items=[target_column])
X = weather_in_aus_cleaned.drop(columns=[target_column])
# NOTE(review): X still carries the 'RainTomorrow' column (it is read from
# X_train in the categorical summary further down); presumably this is the
# un-encoded target - confirm it is removed before any model is fitted.

##80% training / 20% testing split, stratified on y so both splits keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.20,
    stratify=y,
    random_state=2021,
)

In [None]:
#One Hot Encoding 'Region' (Note: Training and Test Data must be separated and encoded separately as best practice)
X_train = pd.get_dummies(X_train, prefix_sep="_", columns=['Region'])

X_test = pd.get_dummies(X_test, prefix_sep="_", columns=['Region'])

# Encoding the two splits independently only yields identical columns when
# every 'Region' value occurs in both splits. Align the test frame to the
# training columns: dummy columns missing from the test split are added and
# filled with 0, and columns unseen during training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
#Reporting how many rows ended up in the training and testing splits.
print(f'Training data row count: {len(X_train)}')
print(f'Test data row count: {len(X_test)}')

In [None]:
#Counting rows per 'Sunshine' value (a randomly selected feature) in each split, to check that the distributions of the two splits do not deviate from each other.
training_dist, test_dist = (
    split['Sunshine'].value_counts() for split in (X_train, X_test)
)

In [None]:
# Bar chart of the per-value 'Sunshine' counts in the training split.
training_dist.plot.bar()
plt.ylabel('Count')
plt.xlabel('Sunshine')
plt.title('Distribution of Training Data')

In [None]:
# Bar chart of the per-value 'Sunshine' counts in the test split.
test_dist.plot(kind='bar')
plt.xlabel('Sunshine')
plt.ylabel('Count')
# The title previously repeated the x-axis label ('Sunshine'); name the split
# instead so this figure pairs with 'Distribution of Training Data'.
plt.title('Distribution of Test Data')

In [None]:
# Show the column labels of the training features (after 'Region' was one-hot encoded above).
X_train.columns

In [None]:
#Casting 'weather_reading_month' to the pandas 'category' dtype.
# NOTE(review): only the training split is converted in this cell - confirm
# whether X_test needs the same dtype before it is used.
X_train = X_train.astype({'weather_reading_month': 'category'})

In [None]:
#Summary statistics for the categorical columns of the training split
cat_variables = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
    'weather_reading_month',
]
categorical_train = X_train[cat_variables]
categorical_train.describe()

In [None]:
#Summary Statistics for Numerical Variables
num_variables = ['MinTemp', 'MaxTemp', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
# describe() is not the last expression of this cell, so Jupyter would
# silently discard its result; render it explicitly. `display` is provided
# as a built-in by the IPython kernel, so no import is needed in a notebook.
display(X_train[num_variables].describe())

##Generating Boxplot of MinTemp with Respect to 'RainTomorrow'
plt.figure(figsize=(20,10))
cleaned = sns.boxplot(x='RainTomorrow', y='MinTemp', data=weather_in_aus_cleaned)
# Re-apply the existing tick labels rotated by 90 degrees; the trailing ';'
# suppresses the list repr returned by set_xticklabels.
cleaned.set_xticklabels(cleaned.get_xticklabels(), rotation=90);

In [None]:
##RainTomorrow Subplots
# One boxplot per feature, split by 'RainTomorrow', drawn on a 3x3 grid.
# The features are listed in panel order (row by row); the loop replaces nine
# copy-pasted calls that differed only in the y column and the target axes.
raintomorrow_features = [
    'MinTemp', 'MaxTemp', 'Evaporation',
    'WindGustSpeed', 'Sunshine', 'Humidity3pm',
    'Temp3pm', 'Pressure3pm', 'Cloud3pm',
]

f, axes = plt.subplots(3, 3, figsize=(20, 10))

# axes.flat walks the grid in row-major order, matching the list above.
for feature, panel in zip(raintomorrow_features, axes.flat):
    sns.boxplot(x='RainTomorrow', y=feature, data=weather_in_aus_cleaned, orient='v', ax=panel)

In [None]:
##Boxplot of MinTemp grouped by 'RainTomorrow'
plt.figure(figsize=(20, 10))
cleaned = sns.boxplot(data=weather_in_aus_cleaned, x='RainTomorrow', y='MinTemp')
# Re-apply the current x tick labels rotated by 90 degrees (';' hides the returned list).
tick_labels = cleaned.get_xticklabels()
cleaned.set_xticklabels(tick_labels, rotation=90);

In [None]:
##Boxplot of MaxTemp grouped by 'RainTomorrow'
plt.figure(figsize=(20, 10))
cleaned = sns.boxplot(data=weather_in_aus_cleaned, x='RainTomorrow', y='MaxTemp')
# Re-apply the current x tick labels rotated by 90 degrees (';' hides the returned list).
tick_labels = cleaned.get_xticklabels()
cleaned.set_xticklabels(tick_labels, rotation=90);

In [None]:
##Boxplot of Evaporation grouped by 'RainTomorrow'
plt.figure(figsize=(20, 10))
cleaned = sns.boxplot(data=weather_in_aus_cleaned, x='RainTomorrow', y='Evaporation')
# Re-apply the current x tick labels rotated by 90 degrees (';' hides the returned list).
tick_labels = cleaned.get_xticklabels()
cleaned.set_xticklabels(tick_labels, rotation=90);

In [None]:
##Boxplot of WindGustSpeed grouped by 'RainTomorrow'
plt.figure(figsize=(20, 10))
cleaned = sns.boxplot(data=weather_in_aus_cleaned, x='RainTomorrow', y='WindGustSpeed')
# Re-apply the current x tick labels rotated by 90 degrees (';' hides the returned list).
tick_labels = cleaned.get_xticklabels()
cleaned.set_xticklabels(tick_labels, rotation=90);

In [None]:
##Boxplot of Sunshine grouped by 'RainTomorrow'
plt.figure(figsize=(20, 10))
cleaned = sns.boxplot(data=weather_in_aus_cleaned, x='RainTomorrow', y='Sunshine')
# Re-apply the current x tick labels rotated by 90 degrees (';' hides the returned list).
tick_labels = cleaned.get_xticklabels()
cleaned.set_xticklabels(tick_labels, rotation=90);

In [None]:
##Histograms (10 bins each) of every numerical variable of interest, laid out on a 3x5 grid
numeric_train = X_train[num_variables]
numeric_train.hist(layout=(3, 5), figsize=(20, 15), bins=10)

In [None]:
##RainToday Subplots
# One boxplot per feature, split by 'RainToday', drawn on a 3x3 grid.
# The features are listed in panel order (row by row); the loop replaces nine
# copy-pasted calls that differed only in the y column and the target axes.
raintoday_features = [
    'MinTemp', 'MaxTemp', 'Evaporation',
    'WindGustSpeed', 'Sunshine', 'Humidity9am',
    'Temp9am', 'Pressure9am', 'Cloud9am',
]

f, axes = plt.subplots(3, 3, figsize=(20, 10))

# axes.flat walks the grid in row-major order, matching the list above.
for feature, panel in zip(raintoday_features, axes.flat):
    sns.boxplot(x='RainToday', y=feature, data=weather_in_aus_cleaned, orient='v', ax=panel)

In [None]:
#Per-column count of missing values in the training split (expected to be all zeros)
X_train.isna().sum()

In [None]:
##Building a quick check Correlation Matrix for the Numerical Training Data
corr_combined = X_train[num_variables]
act_corr = corr_combined.corr()

# Hide the lower triangle (diagonal included) so each pair is shown once.
# The mask is built from cell positions rather than from the correlation
# values: seaborn treats the mask as boolean, so a value-based mask would
# leave any lower-triangle cell whose correlation is exactly 0 unmasked.
matrix = np.tril(np.ones(act_corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(15,12))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(act_corr, vmax=0.8, annot=True, mask=matrix, ax=ax)

In [None]:
#Counting the number of 0's and 1's in the training target
y_train.loc[:, 'raintomorrow_encoded'].value_counts()

In [None]:
#Counting the number of 0's and 1's of 'raintoday_encoded' in the training features
X_train.loc[:, 'raintoday_encoded'].value_counts()

In [None]:
#Summing 'raintoday_encoded' for each 'weather_reading_month'
#(note: the variable name says "peryear", but the grouping key is the month)
rainy_days_by_month = weather_in_aus_cleaned.groupby(['weather_reading_month'], as_index=False)
rainyday_count_peryear = rainy_days_by_month['raintoday_encoded'].sum()

rainyday_count_peryear

In [None]:
#Plotting the per-month rainy day totals held in 'rainyday_count_peryear'
sns.set_theme(style="whitegrid")

# The frame is already aggregated to one row per month, so there is no spread
# to draw as an error bar. The former ci="sd" argument therefore had no visual
# effect, and the `ci` parameter is deprecated in seaborn 0.12+ (replaced by
# `errorbar`), so it is simply omitted here.
g = sns.catplot(data=rainyday_count_peryear, kind="bar", x="weather_reading_month", y="raintoday_encoded", palette="dark", alpha=.6, height=6)

g.set_axis_labels("Observation Month", "raintoday_encoded count")

In [None]:
#Summing 'raintoday_encoded' for each 'Region'
rainy_days_by_region = weather_in_aus_cleaned.groupby(['Region'], as_index=False)
rainyday_count_per_region = rainy_days_by_region['raintoday_encoded'].sum()

rainyday_count_per_region

In [None]:
#Creating Pie Chart of the sum of rainy days recorded across all Regions from 2007-2017
pie, ax = plt.subplots(figsize=[10,6])

labels = rainyday_count_per_region['Region']

# `explode` needs exactly one offset per wedge; derive its length from the
# data instead of hard-coding 7 so the chart does not fail when the number
# of regions differs.
explode = [0.05] * len(rainyday_count_per_region)

ax.pie(x=rainyday_count_per_region['raintoday_encoded'], autopct="%.1f%%", explode=explode, labels=labels, pctdistance=0.5)

ax.set_title("Rainy Days recorded per Australian Region (2007-2017)", fontsize=14)

ax.axis('equal') #Balancing Aspect Ratio
plt.tight_layout()
plt.show()

#pie.savefig("Rainy Days recorded per Australian Region (2007-2017).png")

#Selecting the training rows where it rained today AND it rains tomorrow
combined_traindata = pd.concat([X_train, y_train], axis=1)

rained_today = combined_traindata["raintoday_encoded"] == 1
rains_tomorrow = combined_traindata["raintomorrow_encoded"] == 1
combined_traindata.loc[rained_today & rains_tomorrow]