In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the weatherAUS CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summary of the raw frame: column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
# Histograms of the frame's columns on a single 20x15 figure.
df.hist(figsize=(20,15))

# Dealing with Null values

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Two imputers that differ only in their initial fill strategy.
imp1 = IterativeImputer(initial_strategy='mean')
imp2 = IterativeImputer(initial_strategy='median')

# Column -> imputer, listed in the order the columns are processed.
# NOTE(review): every fit sees a single column, so there are no other features to
# regress on — presumably this reduces to plain mean/median filling; confirm
# against the IterativeImputer documentation.
imputer_for = {
    'Evaporation': imp2,
    'Cloud3pm': imp2,
    'Sunshine': imp1,
    'Cloud9am': imp2,
    'Pressure9am': imp1,
    'Pressure3pm': imp1,
}
for column, imputer in imputer_for.items():
    df[[column]] = imputer.fit_transform(df[[column]])

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

In [None]:
# Split the date into numeric Year / Month / Day features and drop the raw column.
# The vectorised `.dt` accessor replaces the per-row Python lambdas, and the drop
# reassigns `df` instead of mutating it in place.
parsed_dates = pd.to_datetime(df['Date'])
df['Year'] = parsed_dates.dt.year
df['Month'] = parsed_dates.dt.month
df['Day'] = parsed_dates.dt.day
df = df.drop(columns='Date')
df.head()

In [None]:
# Summary of the frame after the date column was replaced by Year / Month / Day.
df.info()

In [None]:
# Discard every row that still contains a missing value, then summarise what is left.
df = df.dropna()
df.info()

In [None]:
# Draw one count plot per categorical (object-dtype) column on a two-column grid.
object_cols = [col for col in df.columns if df[col].dtype == 'object']
# Ceiling division: enough rows for every column instead of a hard-coded 3.
n_rows = -(-len(object_cols) // 2)

plt.figure(figsize=(20,10))
for i, col in enumerate(object_cols, start=1):
    plt.subplot(n_rows, 2, i)
    # Pass the series as `x=`: seaborn's first positional parameter is `data`, not `x`.
    sns.countplot(x=df[col])

# Encoding the categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column. One encoder instance is refit for each
# column, so after the loop `le` only holds the classes of the last column encoded.
le = LabelEncoder()
cat_cols = [name for name in df.columns if df[name].dtype == 'object']

for name in cat_cols:
    encoded = le.fit_transform(df[name])
    df[name] = encoded

df.head()

In [None]:
# Heatmap of the pairwise correlations between all columns (cells not annotated).
correlations = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlations, annot=False)

In [None]:
# Absolute correlation of every column with the target, strongest first.
df.corr()['RainTomorrow'].abs().sort_values(ascending=False)

In [None]:
# Class counts of the target. The series is passed as `x=` because seaborn's first
# positional parameter is `data`, not `x`.
sns.countplot(x=df['RainTomorrow'])

In [None]:
# Class counts of RainToday. The series is passed as `x=` because seaborn's first
# positional parameter is `data`, not `x`.
sns.countplot(x=df['RainToday'])

# Sampling the data
Since the two target classes are heavily imbalanced, we use `resample` to upsample the minority class so that both classes end up with the same number of samples.

In [None]:
from sklearn.utils import resample, shuffle
# Balance the target: rows with RainTomorrow == 1 are resampled with replacement
# to the size of the RainTomorrow == 0 group, then the combined frame is shuffled.
# random_state is fixed so that Restart & Run All reproduces the same sample.
# NOTE(review): this runs before the train/test split further down, so copies of
# the same row can end up in both sets — consider resampling the training set only.
zero = df[df['RainTomorrow']==0]
one = df[df['RainTomorrow']==1]

upsampled = resample(one, replace=True, n_samples=zero.shape[0], random_state=42)

df = pd.concat([zero, upsampled])
df = shuffle(df, random_state=42)
df.head()

In [None]:
# Target class counts after resampling. The series is passed as `x=` because
# seaborn's first positional parameter is `data`, not `x`.
sns.countplot(x=df['RainTomorrow'])

In [None]:
# Equalise the RainToday groups: rows with RainToday == 1 are resampled with
# replacement to the size of the RainToday == 0 group, then the frame is shuffled.
# random_state is fixed so that Restart & Run All reproduces the same sample.
# NOTE(review): this second resampling changes the row mix, so the balance of other
# columns (e.g. RainTomorrow) is presumably not preserved — confirm this is intended.
zero = df[df['RainToday']==0]
one = df[df['RainToday']==1]

upsampled = resample(one, replace=True, n_samples=zero.shape[0], random_state=42)

df = pd.concat([zero, upsampled])
df = shuffle(df, random_state=42)
df.head()

In [None]:
# RainToday counts after resampling. The series is passed as `x=` because
# seaborn's first positional parameter is `data`, not `x`.
sns.countplot(x=df['RainToday'])

In [None]:
# Histograms of the frame's columns after resampling, on a single 20x20 figure.
df.hist(figsize=(20,20))

In [None]:
# Separate the target column from the feature columns.
target_col = 'RainTomorrow'
y = df[target_col]
X = df.drop(columns=[target_col])

# Selecting the best features

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 15 features that score highest under the f_classif scoring function.
skb = SelectKBest(f_classif, k=15)
skb.fit(X, y)
X_new = skb.transform(X)

X_new.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified on the target. The split is
# seeded so that Restart & Run All yields the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training split only and apply the same scaling to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Building our Model

In [None]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

In [None]:
# Fully connected binary classifier: three ReLU hidden layers (1024, 512, 256 units)
# followed by a single sigmoid output unit.
n_features = X_new.shape[1]

model = Sequential([
    Dense(1024, activation='relu', input_dim=n_features),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Callback that watches the validation loss with a patience of 15 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=15)

# Binary cross-entropy loss, Adam optimiser, accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 100 epochs in batches of 100, using 10% of the training data for
# validation. The `early_stopping` callback defined earlier in the notebook was
# never passed to fit(), so it had no effect; it is wired in here.
history = model.fit(
    X_train,
    np.array(y_train),
    validation_split=0.1,
    batch_size=100,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch; labels and a legend tell the curves apart.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch; labels and a legend tell the curves apart.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

As we can see, the validation loss decreases at first but then starts to rise again, which usually indicates overfitting. However, since the validation accuracy is still increasing, we are going to ignore it for now.

In [None]:
# Loss and accuracy of the trained model on the scaled training split.
model.evaluate(X_train, y_train)

In [None]:
# Loss and accuracy of the trained model on the held-out, scaled test split.
model.evaluate(X_test,y_test)

In [None]:
# Raw model outputs (from the sigmoid output unit) for the test split.
predictions= model.predict(X_test)

In [None]:
# Peek at the first ten raw outputs.
predictions[:10]

In [None]:
# Turn the raw outputs into hard labels: below 0.5 -> 0, otherwise 1.
predict = [0 if score < 0.5 else 1 for score in predictions]
predict[:10]

# Final Accuracy

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the thresholded predictions with the true test labels.
report = classification_report(y_test, predict)
print(report)

# Upvote and Comment if you liked my notebook :)