# Australia Rain Prediction using a Neural Network Model

### 1. What do we want to achieve?

In this notebook, I will create a classification model using an Artificial Neural Network (ANN) to determine whether or not it will rain tomorrow in Australia.

I've used the "Rain in Australia" dataset for this project.



### 2. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from keras.layers import Dense, BatchNormalization, Dropout, LSTM
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from keras import callbacks
from keras.optimizers import Adam

np.random.seed(0)

### 3. Import Dataset

In [None]:
# Location of the "Rain in Australia" CSV file
data = '../input/weather-dataset-rattle-package/weatherAUS.csv'

# Read the CSV into the working DataFrame
df = pd.read_csv(data)

### 4. EDA (Exploratory Data Analysis)

#### Finding out the general format of the data

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
df.shape

In [None]:
# Preview the first five rows
df.head()

In [None]:
# Keep the column labels in a variable and display them
col_names = df.columns

col_names

In [None]:
# Per-column dtype and non-null count
df.info()

##### Things to note:
- We can see that the dataset contains a mixture of categorical and numerical variables.
- Categorical variables are type 'object'
- Numerical variables are type 'float64'
- There are quite a lot of missing values in the dataset.

In [None]:
# Summary statistics of the numeric columns
df.describe()

#### 5. Univariate Analysis

##### Explore 'RainTomorrow' variable

Check for missing values

In [None]:
# Number of rows where the target value is missing
df['RainTomorrow'].isnull().sum()

Count the unique values, then list them

In [None]:
# Number of distinct target values (nunique ignores missing entries by default)
df['RainTomorrow'].nunique()

In [None]:
# The distinct target values themselves
df['RainTomorrow'].unique()

The two unique values are 'No' and 'Yes'.

Let's visualise this.

In [None]:
# Bar chart showing how often each value of the target occurs
q = sns.countplot(x=df['RainTomorrow'], palette='crest')
q.set(xlabel='Value', ylabel='Count', title='Count of each unique value')
plt.tight_layout()

#### 6. Data Visualisation and Cleanup


**Now I will parse Dates into datetime**

In [None]:
# Sanity check before parsing the dates:
# count how many date strings there are of each length, so that malformed
# entries (strings of an unexpected length) would stand out.
lengths = df["Date"].str.len()
lengths.value_counts()

In [None]:
# The length check above showed no obvious errors, so convert the column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Store the calendar year in its own column
df['year'] = df['Date'].dt.year

# Now we will create a function to encode datetime into cyclic parameters.
# This data will be used in a neural network, therefore having months and days in a cyclic continuous feature will make things easier for us.

def encode(df, col, max_val):
    """Add sine and cosine encodings of a periodic column to ``df``.

    Two columns named ``col + '_sin'`` and ``col + '_cos'`` are written into
    ``df`` in place. They hold the sine and cosine of
    ``2 * pi * df[col] / max_val``.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: Name of the column to encode.
        max_val: Length of one full cycle (12 is used for months).

    Returns:
        The same DataFrame object that was passed in.
    """
    # Position on the unit circle, shared by both encodings
    angle = 2 * np.pi * df[col] / max_val
    df[col + '_sin'], df[col + '_cos'] = np.sin(angle), np.cos(angle)
    return df

# Extract month (cycle of 12) and day of month (cycle of 31) from the date and
# add the sin/cos encoding right after each one, keeping the column order
# month, month_sin, month_cos, day, day_sin, day_cos.
for part, period in (('month', 12), ('day', 31)):
    df[part] = getattr(df['Date'].dt, part)
    df = encode(df, part, period)

df.head()


Next, I will deal with missing values in categorical and numeric attributes separately.

**Categorical Variables**

We're going to fill the missing values in each categorical column with that column's mode (its most frequent value).

In [None]:
# Collect the names of all columns whose dtype is "object"
s = df.dtypes == "object"
object_cols = s[s].index.tolist()

print("Categorical variables:")
print(object_cols)

In [None]:
# Print the number of missing values in each categorical column
categorical_missing = df[object_cols].isnull().sum()
for column_name, n_missing in categorical_missing.items():
    print(column_name, n_missing)

In [None]:
# Filling missing values with the mode
#
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[i]. The in-place form acts on an intermediate
# object: pandas 2.x warns about it, and with Copy-on-Write enabled it leaves
# df unchanged.
# NOTE(review): object_cols presumably still contains the target
# 'RainTomorrow', so missing labels get imputed as well. Confirm that this is
# intended.
for i in object_cols:
    df[i] = df[i].fillna(df[i].mode()[0])

**Numerical variables**

We're going to fill the missing values in each numerical column with that column's median.

In [None]:
# Collect the names of all columns stored as float64
t = df.dtypes == 'float64'
num_cols = t[t].index.tolist()

print('Numerical Variables:')
print(num_cols)

In [None]:
# Print the number of missing values in each numerical column
numerical_missing = df[num_cols].isnull().sum()
for column_name, n_missing in numerical_missing.items():
    print(column_name, n_missing)

In [None]:
# Filling missing values with the median of the column value
#
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[i]. The in-place form acts on an intermediate
# object: pandas 2.x warns about it, and with Copy-on-Write enabled it leaves
# df unchanged.
for i in num_cols:
    df[i] = df[i].fillna(df[i].median())

# Show the per-column non-null counts after imputation
df.info()

#### Data Preprocessing
**Steps involved:**
- Label encoding columns with categorical data
- Scaling the features
- Detecting outliers
- Dropping the outliers based on data analysis

**Label encoding the categorical variables**

In [None]:
# Replace each categorical column with integer codes.
# The single encoder is refitted on every column, so once the loop finishes it
# only holds the classes of the last column it processed.
label_encoder = LabelEncoder()
for column_name in object_cols:
    encoded_values = label_encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values

df.info()

In [None]:
# Build the feature matrix to be scaled.
# The target, the raw date, and the raw day/month columns are dropped; day and
# month remain represented by their sin/cos encodings.
features = df.drop(['RainTomorrow', 'Date', 'day', 'month'], axis=1)

# Defining our target column
target = df['RainTomorrow']

# Standardise every feature column with StandardScaler.
# fit_transform returns a plain array, so the DataFrame is rebuilt afterwards.
# The original row index is passed along explicitly: a later index-aligned
# assignment of `target` to this frame then matches rows by label, not merely
# because both happen to carry a default 0..n-1 index.
col_names = list(features.columns)
s_scaler = preprocessing.StandardScaler()
scaled_values = s_scaler.fit_transform(features)
features = pd.DataFrame(scaled_values, columns=col_names, index=features.index)

# Transposed summary: one row per feature
features.describe().T

In [None]:
# Detecting outliers in the data by looking at the scaled features
# One boxen plot per column of `features`, drawn on a 20x10 figure.

plt.figure(figsize=(20,10))
sns.boxenplot(data = features,palette = 'pastel')
# Rotate the x tick labels (column names) so they do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Attach the target so that it is filtered together with the features
features['RainTomorrow'] = target

# Dropping outliers
# Each entry is (lower, upper): a row is kept only if the column value lies
# strictly between the two bounds. None means there is no bound on that side.
# The thresholds apply to the scaled feature values.
outlier_bounds = {
    "MinTemp": (-2.3, 2.3),
    "MaxTemp": (-2, 2.3),
    "Rainfall": (None, 4.5),
    "Evaporation": (None, 2.8),
    "Sunshine": (None, 2.1),
    "WindGustSpeed": (-4, 4),
    "WindSpeed9am": (None, 4),
    "WindSpeed3pm": (None, 2.5),
    "Humidity9am": (-3, None),
    "Humidity3pm": (-2.2, None),
    "Pressure9am": (-2.7, 2),
    "Pressure3pm": (-2.7, 2),
    "Cloud9am": (None, 1.8),
    "Cloud3pm": (None, 2),
    "Temp9am": (-2, 2.3),
    "Temp3pm": (-2, 2.3),
}

for column_name, (lower, upper) in outlier_bounds.items():
    if upper is not None:
        features = features[features[column_name] < upper]
    if lower is not None:
        features = features[features[column_name] > lower]

features.shape

In [None]:
# Looking at the scaled features without outliers
# One boxen plot per column of `features`, drawn on a 20x10 figure.

plt.figure(figsize = (20,10))
sns.boxenplot(data = features, palette = 'pastel')
# Rotate the x tick labels (column names) so they do not overlap
plt.xticks(rotation = 90)
plt.show()

We're now looking at a consistent dataframe which is perfect for building a neural network.

## Model Building

**Method of approach** 
- Assigning the features (attributes) to X and the target labels to y
- Splitting test and training sets
- Initialising the neural network
- Defining the architecture by adding layers to the network
- Compiling the neural network
- Training the neural network

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = features["RainTomorrow"]
X = features.drop(columns=["RainTomorrow"])

# Hold out 20% of the rows as the test set; the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X.shape

In [None]:
# Early stopping: end training once the monitored quantity has not improved
# by at least min_delta for `patience` consecutive epochs.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,  # minimum amount of change to count as an improvement
    patience=20,  # how many epochs to wait before stopping
    restore_best_weights=True,  # roll back to the best epoch's weights
)

# Initialising the NN
model = Sequential()

# Layers
# The width of the input layer is read from the training data rather than
# hard-coded (it was 26), so the network still matches X if the set of
# feature columns changes.
n_features = X_train.shape[1]

model.add(Dense(units=32, kernel_initializer='uniform', activation='relu', input_dim=n_features))
model.add(Dense(units=32, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=16, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(units=8, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.5))
# A single sigmoid unit outputs a value between 0 and 1 for the binary target
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compiling the ANN
opt = Adam(learning_rate=0.00009)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN
# validation_split=0.2 sets aside 20% of the training data for validation
history = model.fit(X_train, y_train, batch_size=32, epochs=150, callbacks=[early_stopping], validation_split=0.2)

In [None]:
# Per-epoch metrics recorded by model.fit, one column per metric
history_df = pd.DataFrame(history.history)

# Draw the training and validation loss curves on the same axes
loss_curves = (
    ('loss', "#BDE2E2", 'Training loss'),
    ('val_loss', "#C2C4E2", 'Validation loss'),
)
for metric, colour, curve_label in loss_curves:
    plt.plot(history_df[[metric]], colour, label=curve_label)

plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="best")

plt.show()

In [None]:
# Per-epoch metrics recorded by model.fit, one column per metric
history_df = pd.DataFrame(history.history)

# Draw the training and validation accuracy curves on the same axes
accuracy_curves = (
    ('accuracy', "#BDE2E2", 'Training accuracy'),
    ('val_accuracy', "#C2C4E2", 'Validation accuracy'),
)
for metric, colour, curve_label in accuracy_curves:
    plt.plot(history_df[[metric]], colour, label=curve_label)

plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Conclusions

**Concluding the model with:**
- Testing on the test set
- Evaluating the confusion matrix
- Evaluating the classification report

In [None]:
# Predict on the test set and threshold the model outputs at 0.5,
# which turns them into boolean class predictions
y_pred = model.predict(X_test) > 0.5

In [None]:
# Confusion matrix, displayed as each cell's share of all test samples
cmap1 = sns.diverging_palette(260, -10, s=50, l=75, n=5, as_cmap=True)
cf_matrix = confusion_matrix(y_test, y_pred)
cf_share = cf_matrix / cf_matrix.sum()

plt.subplots(figsize=(12, 8))
sns.heatmap(cf_share, cmap=cmap1, annot=True, annot_kws={'size': 15})

In [None]:
# Print the classification report for the thresholded test-set predictions
print(classification_report(y_test, y_pred))