In [None]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [None]:
#Read the weather CSV into a dataframe called weather
#(the path is relative, so it is resolved against the current working directory)
weather = pd.read_csv('../input/australian-weather-dataset/weatherAUS.csv')

In [None]:
#Print the dataframe summary: columns, non-null counts and dtypes
weather.info()

In [None]:
#Preview the first rows of the dataset
weather.head()

**EXPLORATORY DATA ANALYSIS**

In [None]:
#Bar chart of how many rows fall into each RainTomorrow class
sns.countplot(data=weather, x='RainTomorrow')
plt.show()

The target is not well balanced: for most observations it did not rain the next day.

**FEATURE ENGINEERING**

In [None]:
#The Date column is in string format; convert it to datetime so that its
#.dt accessor (year, month, day) can be used
weather['Date'] = pd.to_datetime(weather['Date'])

In [None]:
#Extract the calendar year from Date into its own column
weather['Year'] = weather['Date'].dt.year

In [None]:
# function to encode datetime into cyclic parameters. 
#As I am planning to use this data in a neural network I prefer the months and days in a cyclic continuous feature. 

def encode(data, col, max_val):
    """Add sine/cosine encodings of a periodic column to ``data``.

    The values of ``data[col]`` are mapped onto a circle of period
    ``max_val`` and stored in two new columns named ``<col>_sin`` and
    ``<col>_cos``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode. It is modified in place.
    col : str
        Name of the column to encode.
    max_val : number
        Period of the cycle (e.g. 12 for months).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns added.
    """
    # Angle of each value on the unit circle, in radians.
    angle = 2 * np.pi * data[col] / max_val
    data[f'{col}_sin'] = np.sin(angle)
    data[f'{col}_cos'] = np.cos(angle)
    return data

In [None]:
#Pull the month number and the day of the month out of the Date column
date_parts = weather['Date'].dt
weather['month'] = date_parts.month
weather['day'] = date_parts.day

In [None]:
#Add sin/cos columns for month (period 12) and day of month (period 31)
for cyclic_column, period in (('month', 12), ('day', 31)):
    weather = encode(weather, cyclic_column, period)

In [None]:
#Plot the day-of-month value of the first 360 rows to show how it cycles
section = weather.iloc[:360]
tm = section['day'].plot()
tm.set(title='Year and Month Curve',
       xlabel='Days in Year',
       ylabel='Days in Month')
plt.show()

In [None]:
#Collect the names of the object-dtype columns and display them
cat_columns = weather.select_dtypes(include=['object']).columns.tolist()
cat_columns

In [None]:
#Count the missing values in each categorical column
weather[cat_columns].isna().sum()

In [None]:
#Fill the null values in each categorical column with that column's mode.
#The result is assigned back to the frame: fillna(inplace=True) on
#weather[column] is an in-place call on a selected column (chained
#assignment), which pandas warns about and which no longer updates `weather`
#under Copy-on-Write.
#NOTE(review): cat_columns is built before RainTomorrow is mapped to 0/1, so
#the target presumably gets imputed here as well - confirm that is intended.
for column in cat_columns:
    weather[column] = weather[column].fillna(weather[column].mode()[0])

In [None]:
#List the float64/int64 columns and count the missing values in each of them
num_columns = weather.select_dtypes(include=['float64', 'int64']).columns.tolist()
weather[num_columns].isna().sum()

In [None]:
#Fill the null values in each numerical column with that column's median.
#The result is assigned back to the frame: fillna(inplace=True) on
#weather[column] is an in-place call on a selected column (chained
#assignment), which pandas warns about and which no longer updates `weather`
#under Copy-on-Write.
for column in num_columns:
    weather[column] = weather[column].fillna(weather[column].median())

In [None]:
#Check the mean wind gust speed for each year.
#The column is selected before aggregating: groupby('Year').mean() over the
#whole frame also tries to average the text columns, which raises a TypeError
#on pandas >= 2.0 (and wastes work on older versions).
bar = weather.groupby('Year')['WindGustSpeed'].mean().plot(kind='bar')
bar.set_ylabel('WindGustSpeed')
plt.show()

In [None]:
#Check the mean rainfall for each year.
#The column is selected before aggregating: groupby("Year").mean() over the
#whole frame also tries to average the text columns, which raises a TypeError
#on pandas >= 2.0 (and wastes work on older versions).
rainfall = weather.groupby("Year")['Rainfall'].mean().plot()
rainfall.set_ylabel('Rainfall')
plt.show()

In [None]:
#Replace each Location name with an integer code
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
#Learn the set of location names first, then convert the column to their codes
label_encoder.fit(weather['Location'])
weather['Location'] = label_encoder.transform(weather['Location'])

In [None]:
#Turn the Yes/No rain flags into 1/0
yes_no_to_int = {'Yes':1, 'No':0}
for flag_column in ('RainToday', 'RainTomorrow'):
    weather[flag_column] = weather[flag_column].map(yes_no_to_int)

In [None]:
#One-hot encode the columns that are still categorical; drop_first=True
#removes the first level of each encoded column
weather = pd.get_dummies(weather, drop_first=True)

In [None]:
#Check that no object-dtype columns are left (the displayed Index should be empty)
weather.select_dtypes(['object']).columns

**BUILDING DEEP LEARNING MODEL**

In [None]:
#Features: every column except the target, the raw Date and the un-encoded day/month
#Target: RainTomorrow
non_feature_columns = ['RainTomorrow', 'Date', 'day', 'month']
X = weather.drop(columns=non_feature_columns)
y = weather['RainTomorrow']

In [None]:
#Split into training and testing data (80% / 20%, fixed seed).
#stratify=y keeps the RainTomorrow class ratio the same in both parts, which
#matters because the target is imbalanced.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
#Check the statistical summary of the dataset (count, mean, std, min, quartiles, max)
weather.describe()

The summary shows that the columns span very different value ranges, so I will rescale the features with a min-max scaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler

#Learn the per-feature min/max from the training data only, then apply the
#same scaling to both the training and the test features
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Show the shape of the scaled training matrix
X_train.shape

In [None]:
# Import the sequential model and dense layer
from keras.models import Sequential
from keras.layers import Dense

# Create a sequential model
model = Sequential()

# Add five hidden layers of 200 ReLU units each.
# The input width is read from the training matrix instead of being
# hard-coded to 68, so the model still builds if the feature set changes.
n_features = X_train.shape[1]
model.add(Dense(200, input_shape=(n_features,), activation='relu'))
for hidden_index in range(4):
    model.add(Dense(200, activation='relu'))

#Add output layer: a single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Import the early stopping callback
from keras.callbacks import EarlyStopping

# Stop once the validation loss has not improved for 5 epochs, and roll the
# model back to the weights of its best epoch instead of keeping the last one
early_stop = EarlyStopping(monitor='val_loss', mode='min',
                       patience=5, restore_best_weights=True)

# Train the model using the early stopping callback.
# Validation uses the last 20% of the training data (validation_split) rather
# than the test set, so early stopping never sees the data used for the final
# evaluation. history.history['val_loss'] therefore tracks that held-out
# slice of the training data, not the test set.
history = model.fit(X_train, y_train,
           epochs=30, validation_split=0.2,
          batch_size=32, callbacks= [early_stop])

In [None]:
#Plot the training and validation loss per epoch.
#The second curve is Keras's `val_loss`, i.e. the loss on the validation data
#given to model.fit, so it is labelled 'Validation'. Labels are attached to
#each plot call so they cannot get out of step with the curve order.
plt.figure()
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Score the model on the test set; the model was compiled with a single
# metric, so evaluate returns [loss, accuracy]
test_metrics = model.evaluate(X_test, y_test)
accuracy = test_metrics[1]

In [None]:
#Report the accuracy measured on the test set
print('Accuracy:', accuracy)