In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Imports

In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Notebook-wide display defaults: show every DataFrame column, and draw
# matplotlib figures at 8x6 inches with axis grids switched off.
pd.set_option("display.max_columns", None)
mpl.rcParams.update({'figure.figsize': (8, 6), 'axes.grid': False})

In [None]:
# Read the weatherAUS CSV from the Kaggle input directory into the DataFrame `df`.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

#### Data Preprocessing and EDA

In [None]:
# Structural overview of the raw frame: its dimensions, the column names,
# which columns contain any null, and the number of distinct values per column.
n_rows, n_cols = df.shape
print("Rows Count     :: ", n_rows)
print("\nColumns Count  :: ", n_cols)
print("\nFeatures       :: ", list(df.columns))
print("\nMissing Values :: \n", df.isna().any())
print("\nUnique Values  :: \n", df.nunique())

In [None]:
# Summary statistics of the columns pandas describes by default.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Parse 'Date' into pandas datetimes; the `.dt` accessors used further down need this.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Number of rows whose target 'RainTomorrow' is missing.
df['RainTomorrow'].isnull().sum()

In [None]:
# Keep only the rows that have a target label.
df = df.dropna(subset=['RainTomorrow'])

In [None]:
# Heatmap of the boolean null mask: highlights which cells of `df` are missing.
sns.heatmap(df.isnull(),cmap='viridis',cbar=False)

In [None]:
# Bar chart of the target classes; each bar is annotated with its share of all rows.
axis = sns.countplot(data=df, x='RainTomorrow')
axis.set_title('Class Distribution for Target Feature', size=18)

total_rows = len(df)
for bar in axis.patches:
    bar_height = bar.get_height()
    share = np.round(bar_height / total_rows * 100, 1)
    axis.text(
        x=bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
        y=bar_height / 2,                      # halfway up the bar
        s=f"{share}%",
        ha='center', size=40, rotation=0, weight='bold', color='white',
    )

axis.set_xlabel('Rain Tomorrow', size=14)
axis.set_ylabel('Count', size=14)
plt.show()

In [None]:
def encode(data, col, max_val):
    """Add sine/cosine encodings of a periodic column to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    in place. Both are computed from the angle ``2 * pi * data[col] / max_val``.

    Parameters
    ----------
    data : frame holding ``col``; it is modified in place.
    col : name of the column to encode.
    max_val : divisor (period) that scales ``data[col]`` to an angle.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    angle = 2 * np.pi * data[col] / max_val
    for suffix, func in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = func(angle)
    return data

# Extract month and day-of-month from 'Date' and add their sine/cosine
# encodings (period 12 for the month, 31 for the day).
for part, period in (('month', 12), ('day', 31)):
    df[part] = getattr(df['Date'].dt, part)
    df = encode(df, part, period)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into 80% train / 20% test with a fixed seed; `stratify` is given the
# target column so the split is stratified on 'RainTomorrow'.
df_train, df_test = train_test_split(df, train_size = 0.8, random_state=101, stratify = df['RainTomorrow'])

### *Cleaning Categorical Feature*

In [None]:
# Collect the object-dtype (categorical) columns of the training split and show
# the percentage of missing values in each. The dtype is read from df_train
# itself so that column names and dtypes come from the same frame.
cat_feature = [feature for feature in df_train.columns if df_train[feature].dtype == 'object']
df_train[cat_feature].isnull().mean()*100

#### *All categorical features with missing values have fewer than 10% nulls, so let's impute them with the mode.*

In [None]:
# Impute missing categorical values with the most frequent value of each column.
# The modes are learned from the training split only and reused for the test
# split, so no statistic of the held-out data enters the imputation.
# fillna() with a per-column mapping returns new frames, which avoids the
# chained `frame[col].fillna(..., inplace=True)` pattern (it may act on a
# temporary copy and leave the frame unchanged).
cat_modes = {feature: df_train[feature].mode()[0] for feature in cat_feature}
df_train = df_train.fillna(cat_modes)
df_test = df_test.fillna(cat_modes)

### Cleaning Numerical Features

In [None]:
# Names of the numeric columns, taken straight from the dtypes of the training
# split instead of computing a full describe() table just to read its header.
# NOTE(review): on recent pandas, describe() appears to include datetime columns
# as well, which would have put 'Date' in this list - confirm against the
# installed version.
num_feature = df_train.select_dtypes(include='number').columns.to_list()
# Percentage of missing values per numeric feature in the training split.
df_train[num_feature].isnull().mean()*100

#### *The features 'Evaporation', 'Sunshine', 'Cloud9am' and 'Cloud3pm' have more than 35% missing values. We need to fill them, so let's do some analysis to find the best fill value for each of these features.*

In [None]:
cols = ['Evaporation','Sunshine','Cloud9am','Cloud3pm']

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so pick whichever name exists.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)

# One row per feature: KDE on the left, histogram on the right.
fig, ax = plt.subplots(len(cols), 2, figsize = (12,8), constrained_layout = True)

for i, num_var in enumerate(cols):
    sns.kdeplot(data = df_train, x = num_var, ax = ax[i][0],
                fill = True, alpha = 0.6, linewidth = 1.5)
    # The feature name is shown as the row label instead of an x-axis label.
    ax[i][0].set_ylabel(num_var)
    ax[i][0].set_xlabel(None)

    sns.histplot(data = df_train, x = num_var, ax = ax[i][1], color='red')
    ax[i][1].set_ylabel(None)
    ax[i][1].set_xlabel(None)

fig.suptitle('Features having high missing values (>35%)', size = 16);

#### *Except for Evaporation, the other three features have widely distributed data, so I am going to impute those three features with the median; for Evaporation I will use the mean to fill the missing values.*

In [None]:
# Fill values for the four sparsely populated features: median for Sunshine and
# the two cloud-cover columns, mean for Evaporation. They are computed on the
# training split only and then applied to both splits, so the held-out data
# does not influence its own imputation.
fill_values = {feature: df_train[feature].median()
               for feature in ['Sunshine','Cloud9am','Cloud3pm']}
fill_values['Evaporation'] = df_train['Evaporation'].mean()

# fillna() with a per-column mapping returns new frames; this replaces the
# chained `frame[col].fillna(..., inplace=True)` calls, which may operate on a
# temporary copy and leave the frame unchanged.
df_train = df_train.fillna(fill_values)
df_test = df_test.fillna(fill_values)

#### *The remaining features now have less than 10% missing values, so I am going to drop those rows. Alternatively, we could also replace these missing values with the mean or median.*

In [None]:
# Drop every row that still contains a null, in both splits.
df_train, df_test = (split.dropna() for split in (df_train, df_test))

### Multicollinearity

#### *Multicollinearity is the occurrence of high intercorrelations among two or more independent variables in a multiple regression model. Multicollinearity can lead to skewed or misleading results when a researcher or analyst attempts to determine how well each independent variable can be used most effectively to predict or understand the dependent variable in a statistical model.*

In [None]:
# Numeric weather measurements whose pairwise correlations are inspected.
numeric_col = ['MinTemp', 'MaxTemp', 'Rainfall','WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
               'Humidity9am','Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
              'Sunshine', 'Cloud9am', 'Cloud3pm', 'Evaporation']

# Correlation matrix computed on the full frame `df` (not on the train split).
corr_matrix = df[numeric_col].corr()

plt.figure(figsize=(16, 12))
axis = sns.heatmap(
    corr_matrix,
    annot=True, fmt='.0%',  # print each coefficient as a whole percentage
    cmap='coolwarm',
    linewidths=3,
    square=True,
)

axis.set_title('Corelation Between the features', size=16)
axis.set_xticklabels(numeric_col, fontsize=12)
axis.set_yticklabels(numeric_col, fontsize=12, rotation=0);

#### Strong Correlations


##### pressure3pm and pressure9am

##### temperature9am and minTemp

##### temperature9am and maxTemp

##### temperature3pm and maxTemp

##### temperature3pm and temperature9am

We should keep only one feature from each group of strongly correlated features.


In [None]:
# Remove the redundant, strongly correlated columns from both splits, in place.
redundant_cols = ['Temp3pm', 'Pressure3pm', 'Temp9am']
df_train.drop(columns=redundant_cols, inplace=True)
df_test.drop(columns=redundant_cols, inplace=True)

### Outliers


In [None]:
# Numeric features left after the correlated columns were dropped.
numeric_col = ['MinTemp', 'MaxTemp', 'Rainfall','WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
               'Humidity9am','Humidity3pm', 'Pressure9am', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Evaporation']

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever name exists.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# One row per feature (row count follows the list instead of a hard-coded 13):
# boxplot on the left, density estimate on the right.
fig, axis = plt.subplots(len(numeric_col), 2, figsize = (12, 24))
for i, num_var in enumerate(numeric_col):

    # Checking for the outliers using boxplot
    sns.boxplot(y = num_var, data = df_train, ax = axis[i][0], color = 'skyblue')

    # Checking for the distribution using kdeplot
    sns.kdeplot(x = num_var, data = df_train, ax = axis[i][1], color = 'skyblue',
               fill = True, alpha = 0.6, linewidth = 1.5)

    # Only the left panel keeps a label: the feature name.
    axis[i][0].set_ylabel(f"{num_var}", fontsize = 12)
    axis[i][0].set_xlabel(None)
    axis[i][1].set_xlabel(None)
    axis[i][1].set_ylabel(None)

fig.suptitle('Analysing Numeric Features', fontsize = 16, y = 1)
plt.tight_layout()

##### *Many numeric features have data points beyond the IQR. I am using a threshold of the 5th percentile for outlier removal, i.e. any point beyond the 95th percentile or below the 5th percentile is considered an outlier and will be removed.*

##### *The 5th-percentile threshold was chosen arbitrarily; you can just as well try other values for the threshold.*

In [None]:
threshold = 0.05
for col in numeric_col:

    # Cut-offs are taken from df_train as it stands at this iteration, i.e.
    # after the filters of the previously processed columns were applied.
    lower_threshold = df_train[col].quantile(threshold)
    upper_threshold = df_train[col].quantile(1 - threshold)

    # Keep only rows inside [lower, upper] (both ends inclusive); the same
    # train-derived bounds are applied to the test split.
    df_train = df_train[df_train[col].between(lower_threshold, upper_threshold)]
    df_test = df_test[df_test[col].between(lower_threshold, upper_threshold)]

In [None]:
# Peek at the first three rows of the filtered training split.
df_train.head(3)

### Feature Transformation

In [None]:
# Encode the target column as integers in both splits: 'Yes' -> 1, 'No' -> 0.
for split in (df_train, df_test):
    split['RainTomorrow'] = split['RainTomorrow'].map({'Yes': 1, 'No': 0})

In [None]:
# Remove, in place, the raw date and the un-encoded day/month helper columns;
# only their sine/cosine encodings stay in the frames.
for split in (df_train, df_test):
    split.drop(columns=['Date', 'day', 'month'], inplace=True)

In [None]:
# Splitting the data into y and X
# pop() removes 'RainTomorrow' from the frame in place and returns it, so after
# each pop the frame holds only the predictors. X_train / X_test are bound to
# the very same objects as df_train / df_test (no copy is made).
y_train = df_train.pop('RainTomorrow')
X_train = df_train

y_test = df_test.pop('RainTomorrow')
X_test = df_test

In [None]:
# One-hot encode the categorical predictors.
# The training split defines the dummy columns (first level of each feature
# dropped). The test split is encoded with all levels and then re-indexed onto
# the training columns: levels unseen in training are discarded, levels missing
# from the test split are added as all-zero columns. Both matrices therefore
# share the same columns in the same order.
# dtype=float keeps the dummies numeric; boolean dummies mixed with float
# columns would turn the frame into an object array when converted to NumPy.
X_train = pd.get_dummies(X_train, drop_first = True, dtype = float).reset_index(drop = True)
X_test = (
    pd.get_dummies(X_test, dtype = float)
    .reindex(columns = X_train.columns, fill_value = 0.0)
    .reset_index(drop = True)
)

In [None]:
# Continuous predictors (these are the columns that get standardised).
numeric_col = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
               'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
               'Humidity3pm', 'Pressure9am', 'Cloud9am', 'Cloud3pm',
               'month_sin', 'month_cos', 'day_sin', 'day_cos']

# Every remaining column of X_train (the one-hot dummies), in frame order.
categorical_col = X_train.columns[~X_train.columns.isin(numeric_col)].tolist()

In [None]:
# Standardise the continuous predictors.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# The scaler is fitted on the training data only; the test data is transformed
# with those same statistics. Calling fit_transform on the test split would
# refit the scaler on held-out data and put the two splits on different scales.
X_train_scale = pd.DataFrame(sc.fit_transform(X_train[numeric_col]), columns=numeric_col)
X_test_scale = pd.DataFrame(sc.transform(X_test[numeric_col]), columns=numeric_col)

In [None]:
# Final design matrices: scaled continuous columns first, then the dummy
# columns, placed side by side.
X_train_final = pd.concat((X_train_scale, X_train[categorical_col]), axis="columns")
X_test_final = pd.concat((X_test_scale, X_test[categorical_col]), axis="columns")

In [None]:
# Shapes of the feature matrices and target vectors (train, test).
for part in (X_train_final, X_test_final, y_train, y_test):
    print(part.shape)

In [None]:
#ML imports
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
import matplotlib

#ANN Imports
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout,  BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping


### *Model Building*

In [None]:
# Early-stopping callback that watches the validation loss (lower is better)
# with a patience of 25 epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=25,
    verbose=1,
)

In [None]:
# Fully connected binary classifier: 1024 -> 512 -> 256 -> 128 ReLU units with
# dropout after the last two hidden layers, and a single sigmoid output unit.
# Only the first layer declares the input width; the hidden layers take their
# input size from the layer before them, so repeating `input_dim` there was
# redundant and misleading.
n_features = X_train_final.shape[1]

model = Sequential()
model.add(Dense(1024, kernel_initializer = 'uniform', activation='relu', input_dim = n_features))
model.add(Dense(512, kernel_initializer = 'uniform', activation='relu'))
model.add(Dense(256, kernel_initializer = 'uniform', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, kernel_initializer = 'uniform', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer = 'uniform', activation='sigmoid'))

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Train the ANN
epochs = 50
batch_size = 64

# Validation uses a 30% slice held back from the training data. The original
# call also passed validation_data=(X_test_final, y_test); Keras then ignores
# validation_split, so early stopping was being tuned on the test set.
# Dropping validation_data keeps the test set untouched until evaluation.
# The frames are converted to plain NumPy arrays so the network receives one
# homogeneous float32 matrix regardless of the dtypes of the dummy columns.
history = model.fit(
    X_train_final.to_numpy(dtype='float32'),
    y_train.to_numpy(),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.3,
    callbacks=[early_stopping],
)

In [None]:
# Per-epoch metrics recorded by fit(), one column per metric.
model_loss = pd.DataFrame(history.history)

In [None]:
# Plot every column of model_loss (the metrics recorded during fit) as a line.
model_loss.plot()

In [None]:
# Sequential.predict_classes() has been removed from tf.keras. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5; the result keeps the (n_samples, 1) shape of the network output.
y_pred = (model.predict(X_test_final.to_numpy(dtype='float32')) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))
print('Accuracy Score : ',accuracy_score(y_test,y_pred))

### Support Vector Machine

In [None]:
# Support vector classifier constructed with no arguments (library defaults).
classifier_svc = SVC()

In [None]:
# Fit the SVC on the final training matrix and labels.
classifier_svc.fit(X_train_final,y_train)

In [None]:
# Predict the test split with the SVC and print the classification report and accuracy.
y_svc = classifier_svc.predict(X_test_final)
print(classification_report(y_true=y_test, y_pred=y_svc))
print('Accuracy Score : ', accuracy_score(y_true=y_test, y_pred=y_svc))


### Random Forest

In [None]:
# Random forest constructed with no arguments: fit on the training data, then predict the test split.
classifier_rf = RandomForestClassifier().fit(X_train_final, y_train)
y_rf = classifier_rf.predict(X_test_final)

In [None]:
# Classification report and accuracy of the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_rf))
print('Accuracy Score : ', accuracy_score(y_true=y_test, y_pred=y_rf))

### AdaBoost

In [None]:
# AdaBoost constructed with no arguments: fit on the training data, then predict the test split.
classifier_ab = AdaBoostClassifier().fit(X_train_final, y_train)
y_ab = classifier_ab.predict(X_test_final)

In [None]:
# Classification report and accuracy of AdaBoost on the test split.
print(classification_report(y_true=y_test, y_pred=y_ab))
print('Accuracy Score : ', accuracy_score(y_true=y_test, y_pred=y_ab))

### Gradient Boosting 

In [None]:
# Gradient boosting constructed with no arguments: fit, predict the test split,
# then print the classification report and accuracy.
classifier_gb = GradientBoostingClassifier().fit(X_train_final, y_train)
y_gb = classifier_gb.predict(X_test_final)
print(classification_report(y_true=y_test, y_pred=y_gb))
print('Accuracy Score : ', accuracy_score(y_true=y_test, y_pred=y_gb))

### Naive Bayes

In [None]:
# Gaussian naive Bayes constructed with no arguments: fit, predict the test
# split, then print the classification report and accuracy.
classifier_nb = GaussianNB().fit(X_train_final, y_train)
y_nb = classifier_nb.predict(X_test_final)
print(classification_report(y_true=y_test, y_pred=y_nb))
print('Accuracy Score : ', accuracy_score(y_true=y_test, y_pred=y_nb))

In [None]:
def _score_frame(model_name, predictions):
    """Return a one-column DataFrame of test-set scores for one model.

    Parameters
    ----------
    model_name : label used as the column name.
    predictions : predicted class labels for the test split; they are scored
        against the notebook-level ``y_test``.

    Returns
    -------
    DataFrame indexed by "F1", "Accuracy", "Recall", "Precision" and
    "ROC AUC Score", with a single column named ``model_name``.
    """
    scores = [
        f1_score(y_test, predictions),
        accuracy_score(y_test, predictions),
        recall_score(y_test, predictions),
        precision_score(y_test, predictions),
        # Computed from hard class labels, as in the rest of this notebook.
        roc_auc_score(y_test, predictions),
    ]
    return pd.DataFrame(data=scores, columns=[model_name],
                        index=["F1","Accuracy", "Recall", "Precision", "ROC AUC Score"])


ann_df = _score_frame('Artificial Neural Network', y_pred)
svc_df = _score_frame('Support Vector Classifier', y_svc)
rf_df = _score_frame('Random Forest', y_rf)
ab_df = _score_frame('Adaboost', y_ab)
# Scored from the naive Bayes predictions (y_nb). The copy-pasted original used
# y_gb here, so the "Naive Bayes" column silently repeated the gradient
# boosting numbers.
nb_df = _score_frame('Naive Bayes', y_nb)
gb_df = _score_frame('Gradient Boosting', y_gb)

# One column per model, rounded to three decimals.
df_models = round(pd.concat([ann_df,svc_df,rf_df,ab_df,nb_df,gb_df], axis=1),3)
# Custom colour ramp for the comparison heatmap.
colors = ["bisque","ivory","sandybrown","steelblue","lightsalmon"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "white"

# Tall figure with a 4x2 grid; the heatmap takes the whole first row.
fig = plt.figure(figsize=(18,26))
fig.patch.set_facecolor(background_color)  # figure background color
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])
ax0.set_facecolor(background_color)

# Models as rows, metrics as columns, each cell annotated as a percentage.
sns.heatmap(
    df_models.T,
    ax=ax0,
    cmap=colormap,
    vmin=0, vmax=0.95,
    annot=True, fmt=".1%", annot_kws={"fontsize":16},
    linewidths=2.5,
    cbar=False,
)

ax0.text(0,-0.5,'Model Comparison',fontsize=20,fontweight='bold',fontfamily='serif')
plt.show()