In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the raw observations once; `data` stays untouched and `df` is the working copy.
data = pd.read_csv(
    '../input/weather-dataset-rattle-package/weatherAUS.csv'
)
df = data.copy(deep=True)
df.head(n=5)

## Descriptive Analysis

In [None]:
# Summary statistics from describe(), transposed so each variable is a row.
df.describe().transpose()

In [None]:
# Column dtypes and non-null counts of the working frame.
df.info()

##### Insight 
- Most variables contain null values.
- No obvious anomalies can be detected from the summary statistics alone.
- The data is not normally distributed.
- The variables have appropriate datatypes.

### Data Cleaning 

In [None]:
# Share of missing values per column, expressed in percent.
df.isnull().sum().div(len(df)).mul(100)

#### Columns with more than 30% null values relative to the total dataset will be dropped.
- The following columns meet this criterion:
    - Evaporation
    - Sunshine
    - Cloud9am
    - Cloud3pm
    

In [None]:
# Drop the four columns whose share of missing values exceeds 30%.
# The result is derived from the untouched `data` frame rather than from `df`
# itself, so this cell can be re-run without a KeyError on columns that were
# already removed. `df` has not been modified since it was copied from `data`.
df = data.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'])
df.info()

In [None]:
# Split the frame by dtype: float columns vs. object (string) columns.
df_num = df.select_dtypes(include=['float64'])
df_cat = df.select_dtypes(include=['object'])

In [None]:
# Dtypes and non-null counts of the float-only subset.
df_num.info()

In [None]:
# Missing-value count for every numeric column.
numeric_null = df_num.isna().sum()
numeric_null

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with each column's median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# fit_transform returns a bare ndarray. Restore the column labels AND the
# original row index so that later index-based alignment with `df` (e.g. when
# the Date column is re-attached) stays correct even if rows are ever filtered
# before this cell.
df_num = pd.DataFrame(
    imputer.fit_transform(df_num),
    columns=df_num.columns,
    index=df_num.index,
)

In [None]:
# Fill missing values in the object columns with each column's most frequent value.
# NOTE(review): df_cat contains every object column, including 'RainTomorrow',
# which is later used as the prediction target. Mode-imputing the target
# fabricates labels; consider dropping rows with a missing target instead.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Keep the original row index (fit_transform returns a bare ndarray) so the
# frame stays aligned with `df` and the numeric part when they are recombined.
df_cat = pd.DataFrame(
    cat_imputer.fit_transform(df_cat),
    columns=df_cat.columns,
    index=df_cat.index,
)

In [None]:
# Recombine the imputed categorical and numeric parts side by side.
df_new = pd.concat(objs=[df_cat, df_num], axis='columns', join='inner')
df_new

In [None]:
# Confirm dtypes and that no nulls remain after imputation.
df_new.info()

## EDA

In [None]:
# Replace the string dates with parsed datetimes (taken from the pre-imputation frame).
df_new['Date'] = pd.to_datetime(df['Date'], errors='raise')
df_new.info()

In [None]:
# Mean rainfall per wind-gust direction (barplot's default estimator is the mean).
# `ci=None` is the documented way to switch the error bars off. `ci=False` is
# treated as a numeric confidence level, so seaborn still runs the bootstrap
# and draws degenerate zero-width intervals.
# NOTE(review): on seaborn >= 0.12 the non-deprecated spelling is `errorbar=None`.
plt.figure(figsize=(15, 8))
sns.barplot(data=df, x='WindGustDir', y='Rainfall', ci=None)
plt.title('Mean rainfall by wind gust direction')
plt.show()

## Insight:
- The highest amount of rainfall is observed for the southern wind-gust directions.
- For almost all northern wind-gust directions, rainfall is observed to be lowest.
- From this we can infer that rainfall is most strongly associated with the south-western direction.

#### Does the humidity levels in the day impact the rain for the same day & does the humidity level of a later time of the same day impact the next day rains? 

In [None]:
# Left: 9am humidity split by same-day rain. Right: 3pm humidity split by next-day rain.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
for panel, (rain_flag, humidity_col) in zip(
    axes,
    [('RainToday', 'Humidity9am'), ('RainTomorrow', 'Humidity3pm')],
):
    sns.boxplot(data=df_new, x=rain_flag, y=humidity_col, ax=panel)
plt.show()

#### Insight:
- On an average, we can see that if the Humidity levels are between 50 - 80 in the morning there is a less chance of rain the same day.
- Similarly, if there is high humidity, we can expect the rain to pour within the same day.
- On the contrary, if the humidity level on a later time in the same day is around 35 - 60, there is a less chance of rain the next day.
- But, if there is high humidity, there is a high chance of rain

#### How does Pressure, windspeed and Temperature recorded in the day impact Rainfall in the same day?

In [None]:
# One panel per 9am measurement, each split by whether it rained that day.
plt.figure(figsize=(20, 8))
for slot, (measure, shade) in enumerate(
    [('Pressure9am', 'lightgreen'), ('Temp9am', 'lightblue'), ('WindSpeed9am', 'lightgrey')],
    start=1,
):
    plt.subplot(1, 3, slot)
    sns.boxplot(data=df_new, x='RainToday', y=measure, color=shade)
plt.show()

#### Insight:
- These plots show that the three variables do not vary much between days with rain and days without rain.
- Almost the same conditions are observed whether or not it rains.
- However, on closer inspection, the morning pressure and temperature tend to be lower on rainy days than on days without rain.
- The wind speed tends to be relatively higher on rainy days than on days without rain.

#### How does Pressure, Temperature and Windspeed recorded at a later time of the day impact the rains for the next day?

In [None]:
# One panel per 3pm measurement, each split by whether it rains the next day.
plt.figure(figsize=(20, 8))
for slot, (measure, shade) in enumerate(
    [('Pressure3pm', 'lightgreen'), ('Temp3pm', 'lightblue'), ('WindSpeed3pm', 'lightgrey')],
    start=1,
):
    plt.subplot(1, 3, slot)
    sns.boxplot(data=df_new, x='RainTomorrow', y=measure, color=shade)
plt.show()

#### Insight:
- Similar to the previous observation, the Pressure and temperature should be relatively lower than no rainfall days, and the windspeed should be higher than usual

## Bi-Variate Analysis

In [None]:
# Pairwise correlations between the numeric variables.
# df_new also holds string columns and a datetime 'Date' column, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict
# the frame to numeric columns explicitly (older pandas dropped them silently).
plt.figure(figsize=(15, 8))
sns.heatmap(
    data=df_new.select_dtypes(include='number').corr(),
    cmap='coolwarm',
    annot=True,
    fmt='.2g',
)
plt.show()

## Data PreProcessing

##### Checking to See if PCA is necessary

In [None]:
# Numeric part is re-read from the combined frame; the categorical part keeps its object columns.
df_num = df_new.select_dtypes(include=['float64'])
df_cat = df_cat.select_dtypes(include=['object'])

In [None]:
# The factorability tests and PCA below are scale-sensitive, so the numeric
# columns are standardised with StandardScaler first.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_std = scaler.fit(df_num).transform(df_num)
data_std

In [None]:
pip install factor_analyzer

In [None]:
from factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

In [None]:
# Bartlett's sphericity test and the KMO measure, both on the standardised data.
kmo_all, kmo_model = calculate_kmo(data_std)
chi_sq, pvalue = calculate_bartlett_sphericity(data_std)
print(pvalue, kmo_model, sep=' ')

In [None]:
# True when the Bartlett test p-value is below the 0.05 significance level.
pvalue < 0.05

### Insight:
- Since the p-value is less than 0.05, this confirms that at least one pair of variables is correlated.
- The MSA (measure of sampling adequacy) value from the KMO test reveals that there is a need for dimension reduction.

## PCA

In [None]:
## Creating covariance Matrix:
cov_matrix = np.cov(data_std.T)
cov_matrix

In [None]:
## Eigen Value and Vector extraction
eig_vals, eig_vec = np.linalg.eig(cov_matrix)
print(eig_vals)
print('')
print(eig_vec)

In [None]:
# Percentage of total variance carried by each eigenvalue (largest first),
# followed by the running total.
tot = sum(eig_vals)
var_exp = []
for eig_val in sorted(eig_vals, reverse=True):
    var_exp.append((eig_val / tot) * 100)
cum_var_exp = np.cumsum(var_exp)
cum_var_exp

In [None]:
## Scree Plot:

# Rounded cumulative explained variance; only its length is used below.
per_var = np.round(cum_var_exp, decimals = 1)
# One label per principal component: PC1, PC2, ...
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]

plt.figure(figsize = (12,6))
sns.lineplot(y = var_exp, x = range(1,len(var_exp) + 1),marker = 'o')
# Put the component labels on the x axis; without this the labels were
# computed but never shown and the axis used automatic numeric ticks.
plt.xticks(ticks = list(range(1, len(labels) + 1)), labels = labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()

#### Insight :
- Based upon the Scree plot, we can stop at 5 principal Components

In [None]:
from sklearn.decomposition import PCA

# Five components, as chosen from the scree plot; the seed is fixed for repeatability.
pca = PCA(
    n_components=5,
    random_state=123,
)

In [None]:
# Fit the PCA on the standardised data and project it onto the components.
df_pca = pca.fit_transform(X=data_std)
df_pca.T

In [None]:
# Fraction of total variance explained by each retained component.
pca.explained_variance_ratio_

In [None]:
# Wrap the standardised array in a DataFrame labelled with the numeric column names.
data_std = pd.DataFrame(data=data_std, columns=df_num.columns.tolist())
data_std

In [None]:
# Component loadings of the fitted PCA (one row per principal component).
comp = pca.components_

In [None]:
# Loadings table: rows are components, columns are the original numeric variables.
data_loading = pd.DataFrame(data=comp, columns=data_std.columns)
data_loading

In [None]:
from matplotlib.patches import Rectangle

# Loadings heatmap. Cells are shaded by the MAGNITUDE of the loading so that
# strong negative loadings are as visible as strong positive ones; with the
# 0-1 colour scale, signed values would clip every negative loading to the
# lightest shade. The annotations still show the signed loadings.
fig, ax = plt.subplots(figsize=(22, 10), facecolor='w', edgecolor='k')
ax = sns.heatmap(data_loading.abs(), annot=data_loading.values, fmt='.2g', cbar=False, ax=ax,
                 vmax=1.0, vmin=0.0, cmap='Blues',
                 yticklabels=['PC0', 'PC1', 'PC2', 'PC3', 'PC4'])

# For every variable, the component on which it has the largest absolute loading.
column_max = data_loading.abs().idxmax(axis=0)

# Outline that strongest cell in red; the rectangle is anchored at (column, row).
for col, variable in enumerate(data_loading.columns):
    position = data_loading.index.get_loc(column_max[variable])
    ax.add_patch(Rectangle((col, position), 1, 1, fill=False, edgecolor='red', lw=3))
plt.show()

#### Insight:
- Extractable rows are as follows:
    - Temp9am
    - WindGustSpeed
    - Humidity3pm
    - Pressure3pm
    - Rainfall

In [None]:
# Label each component score column after the variable highlighted for it above.
df_pca = pd.DataFrame(
    data=df_pca,
    columns=[
        'pc_Temp9am',
        'pc_WindGustSpeed',
        'pc_Humidity3pm',
        'pc_Pressure3pm',
        'pc_Rainfall',
    ],
)
df_pca

In [None]:
# Inspect the imputed categorical columns before encoding.
df_cat

In [None]:
# The raw date string is not used as a model feature.
df_cat = df_cat.drop(columns=['Date'])

##### Label encoding the variables with more than 2 classes

In [None]:
# Convert the multi-class columns to pandas categoricals ahead of label encoding.
# Every column is read from the imputed `df_cat`. 'Location' was previously
# taken from the un-imputed `df`, which would re-introduce any missing values
# and was inconsistent with the other three columns.
for col in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    df_cat[col] = df_cat[col].astype('category')

In [None]:
# Check which columns now carry the category dtype.
df_cat.info()

In [None]:
# Swap each categorical column for its integer category codes.
for encoded_col in ('Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'):
    df_cat[encoded_col] = df_cat[encoded_col].cat.codes

In [None]:
# Preview the label-encoded columns.
df_cat.head()

In [None]:
# Encode the two Yes/No columns as 0/1 integers.
# get_dummies sorts the categories and drop_first removes the first one, so
# the remaining indicator is 1 for 'Yes' and 0 for 'No'.
# dtype=int pins the result to integers (newer pandas otherwise returns bool,
# older pandas uint8), and .iloc[:, 0] assigns a Series rather than relying on
# a one-column DataFrame being accepted as a column value.
df_cat['RainToday'] = pd.get_dummies(df_cat['RainToday'], drop_first=True, dtype=int).iloc[:, 0]
df_cat['RainTomorrow'] = pd.get_dummies(df_cat['RainTomorrow'], drop_first=True, dtype=int).iloc[:, 0]

In [None]:
# Preview the fully encoded categorical frame.
df_cat.head()

In [None]:
# Modelling table: PCA component scores next to the encoded categorical columns.
df_ml = pd.concat(objs=[df_pca, df_cat], axis='columns', join='inner')
df_ml

### Data Preprocessing for ML

In [None]:
# Separate the features from the target.
# The target is read with plain indexing instead of `pop`: `pop` removes the
# column from df_ml in place, so re-running this cell raised a KeyError and
# df_ml silently lost its target column.
X = df_ml.drop(columns='RainTomorrow')
y = df_ml['RainTomorrow']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, train_label, test_label = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=100,
)


#### Model Evaluators

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, fitted on the training split.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X=X_train, y=train_label)

In [None]:
# Hard class predictions for both splits.
ytrain_pred_log, ytest_pred_log = logreg.predict(X_train), logreg.predict(X_test)

In [None]:
# Logistic regression on the training split: classification report, two blank
# lines, then the confusion matrix.
print(
    classification_report(train_label, ytrain_pred_log),
    '',
    '',
    confusion_matrix(train_label, ytrain_pred_log),
    sep='\n',
)

In [None]:
# Logistic regression on the test split. The report and matrix are computed
# once, kept for the final model comparison, and printed.
logreg_report = classification_report(test_label, ytest_pred_log)
logreg_cm = confusion_matrix(test_label, ytest_pred_log)
print(logreg_report)
print('\n')
print(logreg_cm)

#### ROC - AUC score and plot

In [None]:
# Class-membership probabilities (one column per class) for both splits.
prob_train, prob_test = logreg.predict_proba(X_train), logreg.predict_proba(X_test)

In [None]:
# Keep only the probability of the positive class (second column).
# It is taken straight from the model rather than by re-slicing prob_train /
# prob_test: slicing a variable into itself made this cell fail with an
# IndexError when run a second time, because the arrays were already 1-D.
prob_train = logreg.predict_proba(X_train)[:, 1]
prob_test = logreg.predict_proba(X_test)[:, 1]

In [None]:
# Area under the ROC curve for the training and test splits.
auc_train, auc_test = (
    roc_auc_score(train_label, prob_train),
    roc_auc_score(test_label, prob_test),
)
print(auc_train, auc_test)

In [None]:
# ROC curve on the training split; the dashed diagonal is the chance line.
fpr, tpr, threshold = roc_curve(y_true=train_label, y_score=prob_train)
plt.plot((0, 1), (0, 1), ls='--')
plt.plot(fpr, tpr, ls='dotted')
plt.show()

In [None]:
# ROC curve on the test split; the dashed diagonal is the chance line.
fpr, tpr, threshold = roc_curve(y_true=test_label, y_score=prob_test)
plt.plot((0, 1), (0, 1), ls='--')
plt.plot(fpr, tpr, ls='dotted')
plt.show()

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
from sklearn import tree

In [None]:
# Shallow, regularised decision tree fitted on the training split.
dtc = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, min_samples_leaf = 100, min_samples_split = 300)
dtc.fit(X_train, train_label)
# plot_tree expects class_names in ascending order of the encoded classes.
# The target was encoded with 'No' as the lower class and 'Yes' as the higher
# one, so 'No' must come first; the previous ['Yes', 'No'] order labelled
# every node with the wrong class.
train_char_label = ['No', 'Yes']
plt.figure(figsize = (20,20))
tree.plot_tree(dtc, feature_names = list(X_train), class_names = train_char_label);

In [None]:
# Hard class predictions of the decision tree for both splits.
ytrain_predict_dtc, ytest_predict_dtc = dtc.predict(X_train), dtc.predict(X_test)

### Evaluating the model

In [None]:
# Decision tree on the training split: classification report, two blank
# lines, then the confusion matrix.
print(
    classification_report(train_label, ytrain_predict_dtc),
    '',
    '',
    confusion_matrix(train_label, ytrain_predict_dtc),
    sep='\n',
)

In [None]:
# Decision tree on the test split. The report and matrix are computed once,
# kept for the final model comparison, and printed.
dtc_report = classification_report(test_label, ytest_predict_dtc)
dtc_cm = confusion_matrix(test_label, ytest_predict_dtc)
print(dtc_report)
print('\n')
print(dtc_cm)

### ROC-AUC Curve and Values

In [None]:
# Class-membership probabilities (one column per class) from the decision tree.
prob_train, prob_test = dtc.predict_proba(X_train), dtc.predict_proba(X_test)

In [None]:
# Keep only the probability of the positive class (second column).
# It is taken straight from the tree rather than by re-slicing prob_train /
# prob_test: slicing a variable into itself made this cell fail with an
# IndexError when run a second time, because the arrays were already 1-D.
prob_train = dtc.predict_proba(X_train)[:, 1]
prob_test = dtc.predict_proba(X_test)[:, 1]

In [None]:
# Area under the ROC curve for the decision tree on both splits.
auc_train, auc_test = (
    roc_auc_score(train_label, prob_train),
    roc_auc_score(test_label, prob_test),
)
print(auc_train, auc_test)

In [None]:
# Decision-tree ROC curve on the training split; the dashed diagonal is the chance line.
fpr, tpr, threshold = roc_curve(y_true=train_label, y_score=prob_train)
plt.plot((0, 1), (0, 1), ls='--')
plt.plot(fpr, tpr, ls='dotted')
plt.show()

In [None]:
# Decision-tree ROC curve on the test split; the dashed diagonal is the chance line.
fpr, tpr, threshold = roc_curve(y_true=test_label, y_score=prob_test)
plt.plot((0, 1), (0, 1), ls='--')
plt.plot(fpr, tpr, ls='dotted')
plt.show()

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

#### First Iteration

In [None]:
# Baseline random forest; the out-of-bag score is an internal validation estimate.
# random_state makes the forest, and therefore the OOB score and all later
# predictions, reproducible from run to run. n_jobs=-1 only parallelises tree
# building and does not change the fitted model.
rfc = RandomForestClassifier(n_estimators = 500, oob_score = True,
                             random_state = 100, n_jobs = -1)
rfc.fit(X_train, train_label)
rfc.oob_score_

#### Second Iteration

In [None]:
# Regularised random forest (larger leaves/splits, 6 candidate features per split).
# random_state makes the forest and its OOB score reproducible, so the
# comparison with the first iteration is not affected by sampling noise.
# n_jobs=-1 only parallelises tree building.
rfc1 = RandomForestClassifier(n_estimators = 500, oob_score = True,
                             min_samples_leaf = 30,
                             min_samples_split = 90,
                             max_features = 6,
                             random_state = 100, n_jobs = -1)
rfc1.fit(X_train, train_label)
rfc1.oob_score_ 

In [None]:
# NOTE(review): `rfc` (first iteration) was already fitted on this same data in
# the cell that reports its OOB score, so this is a second full fit. The
# second-iteration model `rfc1` is not the one evaluated below - confirm that
# evaluating `rfc` rather than `rfc1` is intended.
rfc.fit(X_train, train_label)

In [None]:
# Hard class predictions of the random forest for both splits.
ytrain_pred_rfc, ytest_pred_rfc = rfc.predict(X_train), rfc.predict(X_test)

In [None]:
# Test-split report and confusion matrix for the random forest, kept for the comparison below.
rfc_report = classification_report(test_label, ytest_pred_rfc)
rfc_cm = confusion_matrix(test_label, ytest_pred_rfc)

## Final Model Comparison and Selection

In [None]:
# Test-split confusion matrices of the three models, one after another.
for heading, matrix in (
    ('Logistic Regression Confusion matrix: ', logreg_cm),
    ('Decision tree confusion matrix: ', dtc_cm),
    ('Random Forest Confusion Matrix: ', rfc_cm),
):
    print(heading)
    print(matrix)

In [None]:
# Test-split accuracy of the three models, in percent.
for heading, predictions in (
    ('Logistic Regression Accuracy: ', ytest_pred_log),
    ('Decision Tree Accuracy: ', ytest_predict_dtc),
    ('Random Forest Accuracy: ', ytest_pred_rfc),
):
    print(heading, accuracy_score(test_label, predictions) * 100)

In [None]:
# Test-split classification reports of the three models, one after another.
for heading, report in (
    ('Logistic Regression Report: ', logreg_report),
    ('Decision Tree Report: ', dtc_report),
    ('Random Forest Report: ', rfc_report),
):
    print(heading)
    print(report)

### Insight:
- Out of the three models compared, the Random Forest Classifier gives a relatively higher accuracy than the other two.
- The recall is highest for the Random Forest Classifier.