## Importing libraries

In [None]:
# For mathematical operations
import numpy as np

# For data analysis
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For encoding categorical variables
from sklearn.preprocessing import LabelEncoder

# For splitting the dataset into train and test set
from sklearn.model_selection import train_test_split

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# For calculating metrics of the model
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score

from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier

# For plotting the decision tree
from sklearn.tree import plot_tree

# For ensemble modelling
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

## Loading dataset

In [None]:
# Load the raw weather observations. The relative path assumes the CSV lives under ../input/
# (NOTE(review): looks like a Kaggle dataset mount — adjust the path when running elsewhere).
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple

df.shape

<b> The dataset has 145,460 records and 23 attributes. </b>

In [None]:
# Datatype and non-null count of each attribute

df.info()

<b> The dataset has 16 float and 7 object columns. </b>

In [None]:
# Parse the Date column into datetime so that the .dt accessor (year / month)
# can be used in the rainfall analysis further down

df['Date'] = pd.to_datetime(df['Date'])

### Attribute Description:-

<ol>
    <li>Date - The date of observation</li>
    <li>Location - The common name of the location of the weather station</li>
    <li>MinTemp - The minimum temperature in degrees celsius</li>
    <li>MaxTemp - The maximum temperature in degrees celsius</li>
    <li>Rainfall - The amount of rainfall recorded for the day in mm</li>
    <li>Evaporation - The so-called Class A pan evaporation (mm) in the 24 hours to 9am</li>
    <li>Sunshine - The number of hours of bright sunshine in the day.</li>
    <li>WindGustDir - The direction of the strongest wind gust in the 24 hours to midnight</li>
    <li>WindGustSpeed - The speed (km/h) of the strongest wind gust in the 24 hours to midnight</li>
    <li>WindDir9am - Direction of the wind at 9am</li>
    <li>WindDir3pm - Direction of the wind at 3pm</li>
    <li>WindSpeed9am - Wind speed (km/hr) averaged over 10 minutes prior to 9am</li>
    <li>WindSpeed3pm - Wind speed (km/hr) averaged over 10 minutes prior to 3pm</li>
    <li>Humidity9am - Humidity (percent) at 9am</li>
    <li>Humidity3pm - Humidity (percent) at 3pm</li>
    <li>Pressure9am - Atmospheric pressure (hpa) reduced to mean sea level at 9am</li>
    <li>Pressure3pm - Atmospheric pressure (hpa) reduced to mean sea level at 3pm</li>
    <li>Cloud9am - Fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eighths. It records how many eighths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast.</li>
    <li>Cloud3pm - Fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cloud9am for a description of the values</li>
    <li>Temp9am - Temperature (degrees C) at 9am</li>
    <li>Temp3pm - Temperature (degrees C) at 3pm</li>
    <li>RainToday - Boolean: 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0</li>
    <li>RainTomorrow - The target variable: a two-valued categorical attribute indicating whether it rained on the following day.</li>
</ol>

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric attributes

df.describe()

<b> Simple statistics such as count, mean, min and max are calculated for the attributes with a numeric datatype.<br>
Some of the conclusions drawn from the above table are:<br> </b>
<ol>
    <li>The average minimum temperature is 12.194 and the average maximum temperature is 23.221 degrees Celsius.</li>
    <li>The mean rainfall is 2.36 mm.</li>
    <li>The average sunshine received is 7.61 hours.</li>
    <li>The average wind gust speed is 40.035 km/hr.</li>
    <li>The minimum temperature recorded is -8.5 degrees Celsius and the maximum recorded temperature is 48.1 degrees Celsius.</li>
    <li>The minimum rainfall recorded for a particular day is 0 mm and maximum is 371 mm.</li>
    <li>The median evaporation is 4.8 mm.</li>
</ol>

In [None]:
# Summary statistics (count, unique, top, freq) for the object attributes

df.describe(include="object")

<b> The statistics displayed for the attributes of 'object' datatype is different from the one displayed for numeric datatypes.<br>
Some of the conclusions drawn from the above table are:<br> </b>
<ol>
    <li>There are total 49 unique locations and 16 unique wind directions.</li>
    <li>RainToday and RainTomorrow attribute has 2 unique values.</li>
    <li>The top location is Canberra, occurring 3436 times.</li>
</ol>

In [None]:
# Collect every row that is an exact repeat of an earlier row
# (an empty frame means the dataset has no duplicates)

duplicate = df.loc[df.duplicated()]
duplicate

<b> No duplicate records are present. </b>

In [None]:
# Count the null values in each column

df.isnull().sum()

<b>Observations:-</b>
<ol>
    <li>Maximum null values are present in Sunshine column followed by Evaporation.</li>
    <li>More than 55,000 null values are present in Cloud9am and Cloud3pm columns.</li>
    <li>Around 15,000 null values are present in Pressure9am and Pressure3pm columns.</li>
    <li>More than 10,000 null values are present in WindGustDir, WindGustSpeed  and WindDir9am columns</li>
    <li>There are many columns having more than 1000 null records.</li>
</ol>

In [None]:
# Drop Sunshine, Evaporation, Cloud9am and Cloud3pm: these are the columns with the
# largest share of null values (see the null counts above).
# errors='ignore' keeps the cell idempotent, so re-running it after the columns are
# already gone does not raise a KeyError.

col = ['Sunshine', 'Evaporation', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=col, errors='ignore')

In [None]:
# Replace missing values in numerical columns with the column median
def replace_numerical(df):
    """Fill the NaNs of every numeric column of ``df`` with that column's median.

    Columns are selected with the generic ``'number'`` selector, so every
    integer/float width is covered rather than only the dtypes that the
    ``'int'`` / ``'float'`` aliases happen to resolve to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be reassigned or chained.
    """
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(df[col].median())
    return df

# Replace missing values in object columns by forward fill, with the mode as a fallback
def replace_object(df):
    """Fill the NaNs of every object-dtype column of ``df``.

    Each gap takes the last valid value above it (forward fill). A forward fill
    cannot fill gaps at the very top of a column, so anything still missing
    afterwards is replaced with the column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be reassigned or chained.
    """
    for col in df.select_dtypes(include='object').columns:
        # .ffill() is the supported spelling; fillna(method='ffill') is deprecated
        filled = df[col].ffill()
        if filled.isnull().any():
            modes = df[col].mode()
            if not modes.empty:  # an all-null column has no mode: leave it untouched
                filled = filled.fillna(modes.iloc[0])
        df[col] = filled
    return df

In [None]:
# Impute the numeric columns first, then the object columns
df = df.pipe(replace_numerical).pipe(replace_object)

In [None]:
# Re-count the nulls per column to confirm that none are left after dropping and imputing

df.isnull().sum()

<b>Hence, no null values remain in the dataset: the sparsest columns were dropped and the remaining gaps were imputed.</b>

## Exploratory Data Analysis

In [None]:
# Plot the correlation matrix of the numeric attributes

# Restrict to numeric columns explicitly: recent pandas no longer drops string and
# datetime columns silently inside DataFrame.corr() and raises instead.
corr = df.select_dtypes(include='number').corr()
# Styler.set_precision() is gone from recent pandas; a format string renders the
# same two decimals on every pandas version.
corr.style.background_gradient(cmap='PuBu').format('{:.2f}')

<b>Observation:-</b>

<ul>
    <li>MaxTemp and Temp3pm have a strong positive correlation of 0.97.</li>
    <li>Pressure9am and Pressure3pm have a strong positive correlation of 0.96.</li>
    <li>MinTemp and Temp9am have a strong positive correlation of 0.90.</li>
    <li>MaxTemp and Temp9am have a strong positive correlation of 0.88.</li>
    <li>Temp9am and Temp3pm have a strong positive correlation of 0.85.</li>
    <li>Humidity and Temperature attributes are negatively correlated, with a coefficient of about -0.50.</li>
</ul>

In [None]:
# matplotlib renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later removed
# the old name, so pick whichever one this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Distribution of location

plt.figure(figsize=(15, 5))
# Pass the column by keyword: newer seaborn releases do not accept the data vector positionally
sns.countplot(x='Location', data=df)
plt.xticks(rotation=90)

<b>Observation:-</b>

<ul>
    <li>The most frequent location is Canberra, followed by Sydney.</li>
    <li>Most of the locations have a frequency near 3000.</li>
    <li>Nhil, Katherine and Uluru occur the least.</li>
</ul>

In [None]:
# Distribution of MinTemp and MaxTemp, drawn side by side

fig, ax = plt.subplots(1, 2, figsize=(15,5))

# (column, panel title) for each subplot, left to right
temp_panels = [
    ('MinTemp', "Minimum Temperature"),
    ('MaxTemp', "Maximum Temperature"),
]
# NOTE(review): sns.distplot is deprecated in recent seaborn; kept here so the figure is unchanged
for panel, (column, panel_title) in zip(ax, temp_panels):
    sns.distplot(df[column], ax=panel)
    panel.set_title(panel_title)

<b>Observation:-</b>

<ul>
    <li>Highest concentration of points for minimum temperature is between 10 and 12 degrees Celsius.</li>
    <li>Highest concentration of points for maximum temperature is between 18 and 22 degrees Celsius.</li>
</ul>

In [None]:
# Average MinTemp and MaxTemp of each location

# A single groupby for both columns; the result is indexed by Location in sorted order
temp_by_loc = df.groupby('Location')[['MinTemp', 'MaxTemp']].mean()
plt.rcParams["figure.figsize"] = (20,10)  # global default: also applies to figures created by later cells

x = np.arange(len(temp_by_loc))

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])

w = 0.3  # bar width; the two bars of a location sit side by side around its tick

ax.bar(x-w/2, temp_by_loc['MinTemp'], label='Average MinTemp by Location', color='maroon', width=w)
ax.bar(x+w/2, temp_by_loc['MaxTemp'], label='Average MaxTemp by Location', color='salmon', width=w)

ax.set_xticks(x)
# Take the labels from the groupby index so they line up with the bars.
# df['Location'].unique() is in order of first appearance, which need not match
# the sorted group order and would put the wrong name under a bar.
ax.set_xticklabels(temp_by_loc.index, rotation=90)

plt.xlabel('Locations', fontsize=20)
plt.ylabel('Average values', fontsize=20)
plt.title('Average MinTemp and MaxTemp based on Location', fontsize=25)
plt.legend(fontsize=15)

<b>Observation:-</b>

<ul>
    <li>The average maximum temperature is above 20 degrees Celsius for most locations.</li>
    <li>The average minimum temperature is between 5 and 15 degrees Celsius for most locations.</li>
</ul>

In [None]:
# Rainfall distribution in each month

# Create a new dataframe rain_df holding the year and month of each observation next to its rainfall
rainfall =[df['Date'].dt.year, df['Date'].dt.month, df['Rainfall']]
headers = ['Year', 'Month', 'Rainfall']
rain_df = pd.concat(rainfall, axis=1, keys=headers)

# Draw on an explicit Axes: DataFrame.plot() without ax= opens a figure of its own,
# which left an empty 8x4 figure behind and ignored the requested size.
fig, ax = plt.subplots(figsize=(8,4))
monthly_rain = rain_df.groupby('Month').agg({'Rainfall':'sum'})
monthly_rain.plot(kind='bar', color='pink', ax=ax)
ax.set_title('Rainfall distribution in each month', fontsize=25)
ax.set_xlabel('Month', fontsize=20)
ax.set_ylabel('Rainfall (in mm)', fontsize=20)
ax.tick_params(axis='x', labelrotation=0)

<b>Observation:-</b>

<ul>
    <li>Maximum rainfall(greater than 35,000 mm) occurs in March.</li>
    <li>January and June also experience high rainfall(nearly 35,000 mm) followed by February.</li>
    <li>Minimum rainfall occurs in October followed by September.</li>
</ul>

In [None]:
# Rainfall distribution in each year (uses rain_df built in the per-month cell)

# Draw on an explicit Axes: DataFrame.plot() without ax= opens a figure of its own,
# which left an empty 8x4 figure behind and ignored the requested size.
fig, ax = plt.subplots(figsize=(8,4))
yearly_rain = rain_df.groupby('Year').agg({'Rainfall':'sum'})
yearly_rain.plot(kind='bar', color='purple', ax=ax)
ax.set_title('Rainfall distribution in each year', fontsize=25)
ax.set_xlabel('Year', fontsize=20)
ax.set_ylabel('Rainfall (in mm)', fontsize=20)
ax.tick_params(axis='x', labelrotation=0)

<b>Observation:-</b>

<ul>
    <li>Maximum rainfall (greater than 40,000 mm) occurred in 2010, followed by 2011 and 2016.</li>
    <li>2009, 2012, 2013, 2014 and 2015 experienced rainfall between 30,000 and 40,000 mm.</li>
    <li>The least rainfall (less than 200 mm) occurred in 2007, followed by 2008 and 2017 (greater than 20,000 mm).</li>
</ul>

In [None]:
# Distribution of WindGustDir, WindDir9am and WindDir3pm

fig, ax = plt.subplots(3, 1, figsize=(15,25))

# (column, palette, panel title) for each subplot, top to bottom
wind_panels = [
    ('WindGustDir', 'ocean', "Wind Gust Direction"),
    ('WindDir9am', 'magma_r', "Wind Direction at 9AM"),
    ('WindDir3pm', 'BuGn_r', "Wind Direction at 3PM"),
]
for panel, (column, palette, panel_title) in zip(ax, wind_panels):
    # Pass the column by keyword: newer seaborn releases do not accept the data vector positionally
    sns.countplot(x=column, data=df, palette=palette, ax=panel)
    panel.set_title(panel_title, fontsize=20)

<b>Observation:-</b>

<ul>
    <li>Wind Gust Direction for maximum records(nearly 17,500) is West.</li>
    <li>Wind Direction at 9AM for maximum records is North followed by North-West and East.</li>
    <li>Wind Direction at 3PM for maximum records is South East.</li>
</ul>

In [None]:
# Distribution of WindGustSpeed (histogram with a density curve)

fig, ax = plt.subplots(figsize=(8,5))
# NOTE(review): sns.distplot is deprecated in recent seaborn; kept here so the figure is unchanged
sns.distplot(df['WindGustSpeed'], hist=True, color='yellow', ax=ax)

<b>Observation:-</b>

<ul>
    <li>Wind Gust Speed ranges from 0 to nearly 140 km/hr.</li>
    <li>Highest concentration of points for Wind Gust Speed is between 38-40 km/hr.</li>
</ul>

In [None]:
# Distribution of WindSpeed9am, WindSpeed3pm, Humidity9am, Humidity3pm, Pressure9am, Pressure3pm, Temp9am, Temp3pm

fig, ax = plt.subplots(4, 2, figsize=(15,25))

# One row per measurement (9AM on the left, 3PM on the right); both panels of a row share a colour.
# Entries are listed in row-major order to match ax.flat.
distribution_panels = [
    ('WindSpeed9am', "Wind Speed at 9AM", 'green'),
    ('WindSpeed3pm', "Wind Speed at 3PM", 'green'),
    ('Humidity9am', "Humidity at 9AM", 'orange'),
    ('Humidity3pm', "Humidity at 3PM", 'orange'),
    ('Pressure9am', "Pressure at 9AM", 'red'),
    ('Pressure3pm', "Pressure at 3PM", 'red'),
    ('Temp9am', "Temperature at 9AM", 'brown'),
    ('Temp3pm', "Temperature at 3PM", 'brown'),
]
# NOTE(review): sns.distplot is deprecated in recent seaborn; kept here so the figure is unchanged
for panel, (column, panel_title, colour) in zip(ax.flat, distribution_panels):
    sns.distplot(df[column], ax=panel, color=colour)
    panel.set_title(panel_title, fontsize=15)

<b>Observation:-</b>

<ul>
    <li>The most frequent wind speed at 9AM is between 10 and 20 km/hr, whereas at 3PM it is between 15 and 22 km/hr.</li>
    <li>Highest concentration of points for humidity at 9AM is between 60-80% whereas at 3PM it's 40-70%.</li>
    <li>Highest concentration of points for pressure at 9AM is between 1015-1018 hPa and at 3PM it's between 1015-1017 hPa.</li>
    <li>The most frequent temperature at 9AM is between 16 and 18 degrees Celsius and at 3PM it's between 21 and 23 degrees Celsius.</li>
</ul>

In [None]:
# Analyzing RainToday and RainTomorrow

# Contingency table: one row per RainToday value, one column per RainTomorrow value
type_plt = pd.crosstab(index=df['RainToday'], columns=df['RainTomorrow'])

plt.rcParams["figure.figsize"] = (8,5)  # global default: also applies to figures created by later cells

# Grouped (not stacked) bars, one group per RainToday value
type_plt.plot.bar(stacked=False)

plt.title('Rain Today - Rain Tomorrow', fontsize=20)
plt.xlabel('Rain Today', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.yticks(fontsize=12)
plt.xticks(rotation=0, fontsize=12)

<b>Observation:-</b>

<ul>
    <li>For maximum records it didn't rain for both days.</li>
    <li>For nearly 20,000 records it didn't rain today but rained tomorrow and rained for both days.</li>
    <li>For nearly 20,000 records it rained today but didn't rain tomorrow.</li>
</ul>

In [None]:
# Look at two rows before the categorical columns are label-encoded
df.head(2)

In [None]:
# Encoding categorical variables

def label_encoder(x):
    """Label-encode column ``x`` of the global ``df`` in place.

    Parameters
    ----------
    x : str
        Name of the column to encode.

    Returns
    -------
    LabelEncoder
        The fitted encoder; its ``classes_`` / ``inverse_transform`` map the
        integer codes back to the original categories.
    """
    le = LabelEncoder()
    df[x] = le.fit_transform(df[x])
    return le

labels = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
# Keep every fitted encoder so that the codes stay interpretable later on.
# Re-running this cell refits on the already-encoded integers and replaces these encoders.
encoders = {column: label_encoder(column) for column in labels}

## Model

### Split the dataset into training and testing set

In [None]:
# Convert Date to its ordinal day number (an integer) so it can be used as a feature.
# The dtype guard makes the cell safe to re-run: feeding the already-converted integers
# back through pd.to_datetime would reinterpret them as offsets from the epoch and
# silently corrupt the column.
if not pd.api.types.is_integer_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date']).apply(lambda x: x.toordinal())

# Select the target by name instead of relying on it being the last column
X = df.drop(columns='RainTomorrow')
y = df['RainTomorrow']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report the shape of each part of the split
split_parts = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for part_name, part in split_parts:
    print(f"{part_name}:", part.shape)

### Decision Tree Classifier

In [None]:
# Fully grown decision tree with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so that repeated runs
# build the same tree and report the same scores.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [None]:
# Accuracy of the decision tree on the data it was trained on and on the held-out data
for split_name, split_X, split_y in (("train", X_train, y_train), ("test", X_test, y_test)):
    print(f"Score of {split_name} data:", dt.score(split_X, split_y))

In [None]:
# Decision-tree predictions for the held-out rows (reused by the metric cells below)
y_pred = dt.predict(X_test)
y_pred

In [None]:
# Weighted F1 score of the decision tree: per-class F1 averaged by class support

dt_f1_score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
dt_f1_score

In [None]:
# Per-class precision, recall and F1 of the decision tree on the test set
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the decision tree: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Visualising the graph without the use of graphviz

# plt.figure(figsize = (20,10))
# dec_tree = plot_tree(decision_tree=dt, feature_names=df.columns, filled=True , precision=4, rounded=True)

In [None]:
# BaggingClassifier with decision tree:
# 500 trees, each fitted on a bootstrap sample of 100 training rows; oob_score=True
# evaluates every tree on the rows it did not see.
# random_state is fixed so the bootstrap samples, and therefore the reported scores, are repeatable.
# NOTE(review): max_samples=100 is a tiny fraction of the training set — confirm it is intended.

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True, random_state=0)
bag_clf.fit(X_train, y_train)

In [None]:
# Bagging-ensemble predictions for the held-out rows
y_pred_bag = bag_clf.predict(X_test)
y_pred_bag

In [None]:
# Out-of-bag score, weighted F1 and accuracy of the bagging ensemble

bag_accuracy = accuracy_score(y_test, y_pred_bag)
bag_f1_score = f1_score(y_test, y_pred_bag, average='weighted')

bagging_metrics = (
    ("oob score", bag_clf.oob_score_),
    ("F1 Score", bag_f1_score),
    ("Accuracy Score", bag_accuracy),
)
for metric_name, metric_value in bagging_metrics:
    print(f"{metric_name}:", metric_value)

<b> According to this oob evaluation, this BaggingClassifier is likely to achieve about 83.61% accuracy on the test set. The accuracy of the test set is 83.69%, which is close enough. </b>

In [None]:
# Per-class precision, recall and F1 of the bagging ensemble on the test set
print(classification_report(y_test, y_pred_bag))

In [None]:
# Confusion matrix of the bagging ensemble, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_bag)

plt.figure(figsize=(5,5))

# fmt='d' prints the cell counts as plain integers; the default format would
# abbreviate large counts into scientific notation.
sns.heatmap(data=cm,linewidths=.5, annot=True, fmt='d', square = True,  cmap = 'Blues')

plt.ylabel('Actual label')
plt.xlabel('Predicted label')

# Score the predictions that already exist instead of calling bag_clf.score(),
# which would run all 500 trees over the test set a second time.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred_bag))
plt.title(all_sample_title, size = 15)

<b>Bagging increased the accuracy of Decision Tree Classifier.</b>

### Ensemble Learning

In [None]:
# Base learners for the hard-voting ensemble (the majority class label wins).
# NOTE(review): LogisticRegression and SVC are fitted on unscaled features here — confirm that is intended.
log = LogisticRegression()
rf = RandomForestClassifier(random_state=0)  # fixed seed so the forest, and therefore the vote, is repeatable
svm = SVC()

voting_clf = VotingClassifier(estimators=[('lr', log), ('rf', rf), ('svc', svm)], voting='hard')
voting_clf.fit(X_train, y_train)

In [None]:
# Accuracy of each classifier on the test set

for clf in (log, rf, svm, voting_clf):
    # voting_clf was fitted in the previous cell, so only the stand-alone base
    # estimators need fitting here; refitting the ensemble would train all three
    # models a second time for no benefit.
    if clf is not voting_clf:
        clf.fit(X_train, y_train)
    # Use a dedicated name: assigning to y_pred would overwrite the decision-tree
    # predictions that the earlier metric cells rely on.
    clf_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, clf_pred))

In [None]:
# Voting-ensemble predictions for the held-out rows
y_pred_voting = voting_clf.predict(X_test)
y_pred_voting

In [None]:
# Weighted F1 and accuracy of the voting ensemble

voting_accuracy = accuracy_score(y_test, y_pred_voting)
voting_f1_score = f1_score(y_test, y_pred_voting, average='weighted')

for metric_name, metric_value in (("F1 Score", voting_f1_score), ("Accuracy Score", voting_accuracy)):
    print(f"{metric_name}:", metric_value)

In [None]:
# Per-class precision, recall and F1 of the voting ensemble on the test set
print(classification_report(y_test, y_pred_voting))

In [None]:
# Confusion matrix of the voting ensemble; each cell is annotated with its name, count and share
cm = confusion_matrix(y_test, y_pred_voting)

# Names follow the row-major order of the flattened 2x2 matrix
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
flat_counts = cm.flatten()
flat_shares = flat_counts/np.sum(cm)
group_counts = [f"{value:0.0f}" for value in flat_counts]
group_percentages = [f"{value:.2%}" for value in flat_shares]
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)
# fmt='' because the annotations are ready-made strings rather than numbers
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')

<b> VotingClassifier gives an accuracy of 77.53%.</b>