In [None]:
#import libraries
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sb

import plotly.graph_objects as go
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load weatherAUS.csv from the Kaggle input directory into a DataFrame
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

**Here we can see that some features have a lot of outliers.**

# Missing Values

In [None]:
# Per-column number of missing values, and the percentage of rows they represent
missing = df.isnull().sum()
missing_per = (missing / df.shape[0]) * 100

# Put both measures side by side, one row per column of df
desc = pd.DataFrame({
    'no. of missing values': missing,
    'missing_percentage': missing_per,
})
desc

**We will remove all the features whose missing-value percentage is above 15%.**

In [None]:
# Features discarded because too many of their values are missing
sparse_cols = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=sparse_cols)

**Year and month can give us important information about the rain so we will separate the month and year from date**.

In [None]:
# Parse the 'Date' strings (format YYYY-MM-DD) into datetime values
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [None]:
# Derive year / month / calendar-date columns from 'Date', then discard 'Date' itself
date_parts = df['Date'].dt
df = df.assign(
    year=date_parts.year,
    month=date_parts.month,
    date=date_parts.date,
).drop(columns='Date')

In [None]:
# Separate the column labels by dtype: object columns are treated as categorical,
# float64 columns as numerical (integer columns such as year/month fall in neither).
df_cat = df.select_dtypes(include='object').columns
df_num = df.select_dtypes(include='float64').columns

In [None]:
# Show the labels of the object-dtype (categorical) columns
df_cat

In [None]:
# Show the labels of the float64 (numerical) columns
df_num

**We will fill the missing values of numerical feature using mean**

In [None]:
# Impute missing numerical values with each column's mean.
# The result is assigned back to df rather than calling fillna(inplace=True) on
# df[col]: that form mutates a temporary Series (chained assignment) and is not
# guaranteed to update df -- under pandas Copy-on-Write it silently does nothing.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
df[df_num] = df[df_num].fillna(df[df_num].mean())

**We will fill the missing values of categorical features using mode**

In [None]:
# Rows whose target 'RainTomorrow' is unknown are dropped rather than imputed:
# filling the label with the most frequent class would invent training/test labels.
# .copy() makes the filtered frame independent so the assignments below are unambiguous.
df = df.dropna(subset=['RainTomorrow']).copy()

# Impute the remaining missing categorical values with each column's mode
# (mode() can return several values on ties; [0] takes the first one).
# Assign the result back instead of fillna(inplace=True) on df[col], which acts
# on a temporary Series and does not update df under pandas Copy-on-Write.
for col in df_cat:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Count the missing values left in each column
print(df.isna().sum())

**Our data is free of missing values now**

# Data Exploration

In [None]:
# Work on an independent copy of the cleaned frame for the Plotly chart below
df_copy = df.copy()

In [None]:
# Pie chart of Rainfall split by Location; slice labels are drawn inside the slices
fig = px.pie(
    df_copy,
    names='Location',
    values='Rainfall',
).update_traces(textposition='inside')
fig.show()

In [None]:
# Distribution of MaxTemp for each RainTomorrow class
base_color = sb.color_palette()[0]  # first colour of the current seaborn palette
sb.boxplot(x='MaxTemp', y='RainTomorrow', data=df, color=base_color)

**Max temperature does not have much impact on rain tomorrow**

In [None]:
# Distribution of MinTemp for each RainTomorrow class
base_color = sb.color_palette()[0]  # first colour of the current seaborn palette
sb.boxplot(x='MinTemp', y='RainTomorrow', data=df, color=base_color)

**Min temperature does not have much impact on rain tomorrow**

In [None]:
# Distribution of Humidity9am for each RainTomorrow class
base_color = sb.color_palette()[0]  # first colour of the current seaborn palette
sb.boxplot(x='Humidity9am', y='RainTomorrow', data=df, color=base_color)

**Humidity at 9am in range of 70-90 can cause rain next day with around 80 maximizing the chances**

In [None]:
# Distribution of Humidity3pm for each RainTomorrow class
base_color = sb.color_palette()[0]  # first colour of the current seaborn palette
sb.boxplot(x='Humidity3pm', y='RainTomorrow', data=df, color=base_color)

**Humidity at 3pm in range of around 60-80 can cause rain next day**

In [None]:
# Pairwise correlations between the numerical features, annotated with their values
num_corr = df[df_num].corr()
plt.figure(figsize=(15, 10))
sb.heatmap(num_corr, annot=True)

**Temp9am (88%) and Temp3pm (97%) have a high correlation with MaxTemp, so we will drop them**

In [None]:
# Remove both temperature readings from df in place, in a single call
df.drop(columns=['Temp9am', 'Temp3pm'], inplace=True)

# Feature engineering

In [None]:
# Summary statistics of the numeric columns that remain in df
df.describe()

**The above table shows that Rainfall, WindGustSpeed, WindSpeed9am and WindSpeed3pm have extreme outliers (Evaporation, which also had them, was already dropped with the high-missing-value features)**

In [None]:
# Box plots of the four outlier-heavy features, before any outlier treatment
outlier_cols = ['Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
plt.figure(figsize=(10, 6))
df.boxplot(column=outlier_cols)

**To treat outliers we will find the lower fence and upper fence of each feature and then treat the values that fall outside them**

In [None]:
# Report the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR for each outlier-heavy feature
for col in ['Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - (iqr * 1.5)
    upper_fence = q3 + (iqr * 1.5)
    message = "{} has the upper fence : {:0.2f} & lower fence : {:0.2f}"
    print(message.format(col, upper_fence, lower_fence))

In [None]:
# Independent copy of df; the outlier treatment and encoding below modify df_new only
df_new = df.copy()

In [None]:
# Cap every value above the upper fence (Q3 + 1.5*IQR) AT that fence.
# The earlier version replaced such values with the *lower* fence (e.g. any
# Rainfall above 1.5 became -0.9), which turned the largest observations into
# the smallest ones and produced impossible negative rainfall / wind speeds.
# Fences are computed from the data instead of being hard-coded, and
# Series.clip is vectorised (no per-element apply). Clipping above Q3 leaves
# Q1 and Q3 unchanged, so re-running this cell is idempotent.
for col in ['Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']:
    q1, q3 = df_new[col].quantile([0.25, 0.75])
    upper_fence = q3 + 1.5 * (q3 - q1)
    df_new[col] = df_new[col].clip(upper=upper_fence)

In [None]:
# Box plots of the same four features in df_new, i.e. after the outlier treatment
treated_cols = ['Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
plt.figure(figsize=(10, 6))
df_new.boxplot(column=treated_cols)

In [None]:
# Preview the first 10 rows of the treated frame
df_new.head(10)

In [None]:
# Remove the calendar-date column from df_new in place
df_new.drop(columns='date', inplace=True)

In [None]:
# Display df_new after the 'date' column has been dropped
df_new

***Encoding***

In [None]:
# Categorical columns of the weather data to one-hot encode. The list includes
# the target 'RainTomorrow', so it too ends up as an indicator column.
one_hot_var = ['Location','WindGustDir','WindDir9am','WindDir3pm','RainToday','RainTomorrow']

# Encode all listed columns in one call: each original column is removed and its
# indicator columns, named '<column>_<level>', are appended after the remaining
# columns in the order given above. drop_first=True omits the first level of
# every variable, so e.g. RainTomorrow becomes the single column 'RainTomorrow_Yes'.
df_new = pd.get_dummies(df_new, columns=one_hot_var, prefix_sep='_', drop_first=True)

**Defining the Target Variable**

In [None]:
# Target: the 'RainTomorrow_Yes' indicator; features: every other column of df_new
target_col = 'RainTomorrow_Yes'
y = df_new[target_col]
X = df_new.drop(columns=[target_col])

**Splitting the training and testing data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

**Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both the training and the test features
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the scaled training features
X_train

In [None]:
# Inspect the scaled test features
X_test

In [None]:
# Inspect the training labels
y_train

In [None]:
# Inspect the test labels
y_test

In [None]:
# (rows, columns) of the training feature matrix
X_train.shape

In [None]:
# (rows, columns) of the test feature matrix
X_test.shape

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 entropy-criterion trees with a fixed seed; fit() returns
# the estimator itself, so construction and training are chained
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = classifier.predict(X_test)

**Creating a confusion matrix and printing accuracy score**

In [None]:
 cm = confusion_matrix(y_test, y_pred)
classes = ['No rain', 'Raining']
df_cm = pd.DataFrame(cm, index=classes, columns=classes)
hmap = sb.heatmap(df_cm, annot=True, fmt="d")
hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
plt.ylabel('True label')
plt.xlabel('Predicted label');

accuracy_score(y_test, y_pred)

In [None]:
# Import auc, roc_curve
from sklearn.metrics import auc, roc_curve

# Score each test row with the predicted probability of the positive class
# (column 1 of predict_proba). roc_curve needs continuous scores to sweep over
# thresholds; passing the hard labels from predict() yields a single operating
# point, so the plotted "curve" and its AUC do not reflect the model's ranking
# ability.
y_score = classifier.predict_proba(X_test)[:, 1]

# Get false positive rate, true positive rate and threshold
fpr, tpr, threshold = roc_curve(y_test, y_score)

# Compute Area under curve
area = auc(fpr, tpr)

# plot the roc curve
# Initialize Figure
plt.figure(figsize=(18,5))

plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % area)

# plot the chance-level diagonal for reference
plt.plot([0, 1], [0, 1],color='g', marker='_')

# set plot title, xlabel, ylabel, legend
plt.title('ROC Curve', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=14)
plt.xlabel('False Positive Rate', fontsize=14)
plt.legend(loc = 'lower right', fontsize = 12)

plt.show()

**The confusion matrix, accuracy score and ROC suggest that the model performed well**