In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the weather CSV into `df` and preview the first rows.
DATA_PATH = '../input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summary statistics of the loaded frame (pandas `describe`).
df.describe()

In [None]:
# Concise per-column overview of the loaded frame (pandas `info`).
df.info()

In [None]:
#

# NULL VALUES

In [None]:
# Missing values per column: absolute count (column 0) and percentage of all rows (column 1).
missing_count = df.isnull().sum()
missing_pct = missing_count / len(df) * 100
pd.concat([missing_count, missing_pct], axis=1)

In [None]:
# Histogram grid for the frame's columns (pandas `DataFrame.hist`), drawn without grid lines.
df.hist(figsize=(20,15),grid=False)

# Grouping Num and Cat Cols

In [None]:
df.info()
# Split the column names by dtype: everything that is not `object` is treated as
# numeric, every `object` column as categorical.
num_col = list(df.select_dtypes(exclude='object').columns)
cat_col = list(df.select_dtypes(include='object').columns)

In [None]:
# Inspect the categorical columns, skipping the first two entries of cat_col
# (presumably Date and Location -- confirm against the dataset's column order).
# For each column print: name, number of nulls, number of distinct values,
# followed by the distinct values themselves.
for name in cat_col[2:]:
    series = df[name]
    print(f'{name} ----->{series.isnull().sum()}------->{series.nunique()}')
    print(series.unique())
# Next steps:
#   - convert the date column
#   - convert the Yes/No columns to 0s and 1s


# Preprocessing Cat Cols

In [None]:
# Parse the date strings and expose month / day / year as separate columns.
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt
df['month'] = date_parts.month
df['day'] = date_parts.day
df['year'] = date_parts.year

# Encode the two Yes/No columns as 1/0. Values that are not keys of `rain`
# become NaN after `map`.
rain = {'No': 0, 'Yes': 1}
for flag_col in ('RainToday', 'RainTomorrow'):
    df[flag_col] = df[flag_col].map(rain)


In [None]:
# Count plot of each wind-direction column, split by the RainTomorrow label.
wind = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
fig, ax = plt.subplots(1, 3, figsize=(20, 5), constrained_layout=True)

for i, col in enumerate(wind):
    # Name the columns and pass the frame via data=...: a vector passed
    # positionally is not interpreted as `x` by recent seaborn releases.
    sns.countplot(data=df, x=col, hue='RainTomorrow', ax=ax[i])
    ax[i].set_ylabel(col)
    ax[i].set_xlabel(None)
    
    

In [None]:
# RainToday counts split by the RainTomorrow label. Columns are passed by
# keyword because recent seaborn releases do not accept a positional vector as `x`.
sns.countplot(data=df, x='RainToday', hue='RainTomorrow')

# Imputing Null Values

In [None]:

# Rows without a RainTomorrow label cannot be used for supervised training.
# Drop them instead of inventing a label through imputation. `.copy()` gives an
# independent frame so the column assignments below do not write into a view.
df = df.dropna(subset=['RainTomorrow']).copy()

# Fill categorical columns (everything in cat_col after its first entry) with
# their most frequent value. The target is skipped: it has no nulls left.
for col in cat_col[1:]:
    if col == 'RainTomorrow':
        continue
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary column object and is not guaranteed
    # to update `df` on recent pandas versions.
    df[col] = df[col].fillna(df[col].mode()[0])

# Fill numeric columns with their median.
for col in num_col:
    df[col] = df[col].fillna(df[col].median())

In [None]:
plt.figure(figsize=(25, 10))
# Restrict the correlation to numeric columns explicitly: `df` still holds
# string columns (e.g. Location, dropped only later), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm")


# Author's notes from the heatmap (meaning not stated -- presumably the
# strongly correlated features; TODO confirm):
#   MinTemp, MaxTemp
#   Pressure9am
#   Temp9am

In [None]:
from warnings import simplefilter
# Ignore all FutureWarnings. This filter is process-wide and stays active for
# the cells that follow.
simplefilter(action='ignore', category=FutureWarning)

# One box plot per numeric column, laid out on a grid that is sized from the
# number of columns instead of a fixed 4x4 (which fails past 16 columns).
fig = plt.figure(figsize=(14, 10))
n_grid_cols = 4
n_grid_rows = -(-len(num_col) // n_grid_cols)  # ceiling division
for position, name in enumerate(num_col, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    # Pass the vector as x=...: recent seaborn releases do not treat a
    # positional vector as `x`.
    sns.boxplot(x=df[name], linewidth=1)
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
plt.show()


# Outliers Removal

In [None]:
# Outlier removal with the 1.5 * IQR rule.
# Thresholds are computed on the full frame `df`; rows are filtered cumulatively
# from `df_train`, so a row is kept only if it is inside the bounds for every
# listed column.
df_train = df.copy()
outlier_cols = [
    'MinTemp', 'MaxTemp', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
]
for col in outlier_cols:
    # Quartiles and the resulting lower / upper thresholds.
    q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
    print(f'{col}----->{q1}----->{q3}')
    iqr = q3 - q1
    lower_threshold, upper_threshold = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    print(f'{col}----->{lower_threshold}----->{upper_threshold}')
    # `between` is inclusive on both ends, matching >= lower and <= upper.
    in_range = df_train[col].between(lower_threshold, upper_threshold)
    df_train = df_train[in_range]
    print(df_train.shape)
    

In [None]:
# The raw Date column is no longer needed in the training frame
# (month / day / year columns were added from it earlier).
df_train = df_train.drop(columns=['Date'])

# Chi2 to check for dependence

In [None]:
# NOTE(review): this bare expression is followed by further statements in the
# same cell, so its value is discarded (only a cell's last expression is shown).
df_train[cat_col[1:]]
import scipy.stats as stats
def chi2(data, col, alpha=0.05):
    """Chi-square test of independence between ``data[col]`` and ``data['RainTomorrow']``.

    Builds the contingency table with ``pd.crosstab``, runs
    ``scipy.stats.chi2_contingency`` on it, and prints the statistic, the
    p-value and a one-line verdict.

    NOTE: a later cell in this notebook imports ``chi2`` from
    ``sklearn.feature_selection``, which rebinds this name once that cell runs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col`` and a ``'RainTomorrow'`` column.
    col : str
        Name of the column to test against the target.
    alpha : float, default 0.05
        Significance level; a p-value above it is reported as "no effect".

    Returns
    -------
    tuple of (float, float)
        The chi-square statistic and its p-value, so callers can collect the
        results instead of only reading the printed output.
    """
    contingency = pd.crosstab(data[col], data['RainTomorrow'])

    # Degrees of freedom and expected frequencies are returned by scipy but not used here.
    stat, pvalue, _dof, _expected = stats.chi2_contingency(contingency)

    print('stat=%.3f, p=%.3f' % (stat, pvalue))
    if pvalue > alpha:
        print(f'No effect of {col} on target')
    else:
        print(f'There is a effect of {col} on target')
    return stat, pvalue
        
# Test every categorical column (cat_col without its first entry) against the
# target. RainTomorrow itself is part of cat_col, and testing the target
# against itself is trivially "dependent", so it is skipped.
for col in cat_col[1:]:
    if col == 'RainTomorrow':
        continue
    chi2(df_train, col)

# getting dummies

In [None]:
# One-hot encode the three wind-direction columns (first level dropped), attach
# the indicator columns to the training frame by index, then remove the
# original wind columns together with Location.
wind_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
dummies = pd.get_dummies(df[wind_cols], drop_first=True)
df_train = df_train.join(dummies).drop(columns=['Location'] + wind_cols)

# Data Splitting

In [None]:
from keras.layers import Dense, BatchNormalization, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from keras import callbacks
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
# Features are every column except the target.
X = df_train.drop(columns=["RainTomorrow"])
y = df_train["RainTomorrow"]

# Hold out 20% for validation. The split is stratified on the target so both
# parts keep the same class ratio (the classes are imbalanced).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using statistics of the training part only, then apply the same
# transformation to the validation part.
scaler = StandardScaler()
X_trn = scaler.fit_transform(X_train)
X_val = scaler.transform(X_valid)


In [None]:
# Early stopping: patience of 20 epochs, improvements smaller than 0.001 do not
# count, and the best weights are restored when training stops.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

# Take the input width from the scaled training matrix instead of hard-coding
# it, so the network still builds when the feature set changes.
n_features = X_trn.shape[1]

# Dense 32 -> 16 -> 8 -> 4 -> 1 with batch normalization between the layers;
# the sigmoid output is the predicted probability for the positive class.
model = Sequential()
model.add(Dense(units=32, kernel_initializer='he_uniform', activation='relu', input_dim=n_features))
for hidden_units in (16, 8, 4):
    model.add(BatchNormalization())
    model.add(Dense(units=hidden_units, kernel_initializer='he_uniform', activation='relu'))
model.add(BatchNormalization())
model.add(Dense(units=1, kernel_initializer='he_uniform', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# validation_split=0.3 holds out part of the training data for the validation
# metrics used by early stopping.
history = model.fit(X_trn, y_train, epochs=100, validation_split=0.3,
                    callbacks=[early_stopping], batch_size=30)

In [None]:
# Training vs. validation loss per epoch, taken from the Keras History object.
history_df = pd.DataFrame(history.history)

loss_curves = (
    ('loss', "#BDE2E2", 'Training loss'),
    ('val_loss', "#C2C4E2", 'Validation loss'),
)
for metric, colour, caption in loss_curves:
    plt.plot(history_df.loc[:, [metric]], colour, label=caption)
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="best")

In [None]:
# Turn the predicted probabilities into hard labels with a 0.5 cut-off.
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5)

cm_fig, cm_ax = plt.subplots(figsize=(12, 8))
cf_matrix = confusion_matrix(y_valid, y_pred)
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts. Axis labels make the matrix readable on its own
# (confusion_matrix puts true labels on rows, predictions on columns).
sns.heatmap(cf_matrix, annot=True, fmt='d', annot_kws={'size': 20}, ax=cm_ax)
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')


In [None]:
# Per-class precision / recall / F1 on the validation split.
report = classification_report(y_valid, y_pred)
print(report)

# Author's observation: the recall rate is poor.
# Class balance of the target in the training frame (author's note: about 80:20).
class_counts = df_train['RainTomorrow'].value_counts()
print(class_counts)

In [None]:
# Column names of the feature frame X.
X.columns

# Checking for Features

In [None]:
from sklearn.feature_selection import f_classif,SelectKBest,chi2,mutual_info_classif 

# Score every numeric column against the target with mutual information.
# k='all' keeps all columns; the selector is used only for its scores.
fs = SelectKBest(score_func=mutual_info_classif, k='all')
numeric_features = df_train[num_col]
# The selector is fitted on the whole of df_train (not on a train split).
fs.fit(numeric_features, df_train['RainTomorrow'])
# Transformed copy of the same numeric columns.
X_train_fs = fs.transform(numeric_features)



In [None]:
# Raw scores of the fitted selector, one per column of df_train[num_col].
fs.scores_


In [None]:
# Print each score next to its positional feature index.
for position, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (position, score))


In [None]:
# Scores indexed by feature name (single column labelled 0), plotted as a
# horizontal bar chart after sorting by score in descending order.
feature_names = df_train[num_col].columns
feat_imp = pd.DataFrame(data=fs.scores_, index=feature_names)
feat_imp_sorted = feat_imp.sort_values(by=0, ascending=False)
feat_imp_sorted.plot(kind='barh')

In [None]:
# Names of the features whose score exceeds 0.015.
# The mask must be built from the score column (labelled 0): indexing with the
# DataFrame-wide mask `feat_imp > 0.015` only blanks the failing cells to NaN
# and keeps every row, so `.index` would list all features.
feat_imp[feat_imp[0] > 0.015].index

In [None]:
# Feature frame restricted to the listed numeric weather measurements.
selected_features = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
    'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]
X_new = df_train[selected_features]
X_new

In [None]:
# Rebuild the wind-direction indicator columns from `df` (first level dropped)
# and attach them to the selected numeric features by index.
wind_direction_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
dummies = pd.get_dummies(df[wind_direction_cols], drop_first=True)
X_new = X_new.join(dummies)
X_new

In [None]:
y_new = df_train['RainTomorrow']
# Hold out 30% for validation, stratified on the target so both parts keep the
# same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_new, y_new, test_size=0.3, random_state=42, stratify=y_new
)

# Scale with MinMaxScaler fitted on the training part only, then apply the
# same transformation to the validation part.
scaler = MinMaxScaler()
X_trn = scaler.fit_transform(X_train)
X_val = scaler.transform(X_valid)

# Early stopping: patience of 20 epochs, improvements smaller than 0.001 do not
# count, and the best weights are restored when training stops.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

# Take the input width from the data instead of hard-coding it, so the network
# matches whatever columns X_new contains.
n_features = X_trn.shape[1]

# Dense 32 -> 16 -> 8 -> 4 -> 1 with batch normalization between the layers.
model = Sequential()
model.add(Dense(units=32, kernel_initializer='he_uniform', activation='relu', input_dim=n_features))
for hidden_units in (16, 8, 4):
    model.add(BatchNormalization())
    model.add(Dense(units=hidden_units, kernel_initializer='he_uniform', activation='relu'))
model.add(BatchNormalization())
model.add(Dense(units=1, kernel_initializer='he_uniform', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_trn, y_train, epochs=100, validation_split=0.3,
                    callbacks=[early_stopping], batch_size=30)

# Training vs. validation loss per epoch.
history_df = pd.DataFrame(history.history)

plt.plot(history_df.loc[:, ['loss']], "#BDE2E2", label='Training loss')
plt.plot(history_df.loc[:, ['val_loss']], "#C2C4E2", label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="best")

In [None]:
# Hard labels from the predicted probabilities (0.5 cut-off), then the
# confusion matrix on the validation split, shown as the cell output.
y_pred = model.predict(X_val) > 0.5
cf_matrix = confusion_matrix(y_valid, y_pred)
cf_matrix

# Since the classes are imbalanced, use undersampling

In [None]:
# Disabled experiment: the statements below sit inside a triple-quoted string,
# so none of them run; the cell only evaluates the string itself.
# The text describes mapping each Location to the per-location sum of
# RainTomorrow and storing it as 'Loc_train' / 'Loc_freq'.
# NOTE(review): the following cells reference 'Location' and drop the last
# column positionally, which looks like it assumed this code had been executed
# -- confirm before re-enabling or deleting.
'''
mapping=df.groupby(['Location'])['RainTomorrow'].sum().to_dict()
df_train['Location']=df['Location'].copy()
df_train['Loc_train']=df_train['Location'].map(mapping)
X_new['Loc_freq']=df_train['Loc_train']
'''

In [None]:
# Feature frame for the undersampling experiment: everything except the target.
# 'Location' was already removed from df_train when the wind dummies were
# joined, so selecting it raised KeyError on a fresh run. errors='ignore'
# drops it only if it is (still or again) present.
X = df_train.drop(columns=['Location', 'RainTomorrow'], errors='ignore')
y = df_train['RainTomorrow']

In [None]:
from imblearn.under_sampling import NearMiss
from collections import Counter
# Undersample the majority class with NearMiss. The ratio is passed by keyword:
# recent imbalanced-learn releases do not accept it positionally.
undersample = NearMiss(sampling_strategy=0.7)
X_nm, y_nm = undersample.fit_resample(X, y)
# Compare the class counts of the data that was actually resampled (`y`),
# not a variable left over from an earlier experiment.
print(f'before sampling y_shape {Counter(y)}')
print(f'after sampling y_shape {Counter(y_nm)}')

In [None]:
# Remove the location-frequency helper column by name, and only when it exists.
# Dropping the last column positionally (`iloc[:, :-1]`) silently removes a
# real feature whenever the helper column was never added.
X_nm = X_nm.drop(columns=['Loc_train', 'Loc_freq'], errors='ignore')


In [None]:
# Split the undersampled data 70/30, stratified on the target so both parts
# keep the same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_nm, y_nm, test_size=0.3, random_state=42, stratify=y_nm
)

# Scale with MinMaxScaler fitted on the training part only, then apply the
# same transformation to the validation part.
scaler = MinMaxScaler()
X_trn = scaler.fit_transform(X_train)
X_val = scaler.transform(X_valid)

# Early stopping: patience of 10 epochs, improvements smaller than 0.001 do not
# count, and the best weights are restored when training stops.
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

# Take the input width from the data instead of hard-coding it, so the network
# matches the columns that are really present in X_nm.
n_features = X_trn.shape[1]

# Dense 64 -> 32 -> 16 -> 4 -> 1 with batch normalization between the layers.
model = Sequential()
model.add(Dense(units=64, kernel_initializer='he_uniform', activation='relu', input_dim=n_features))
for hidden_units in (32, 16, 4):
    model.add(BatchNormalization())
    model.add(Dense(units=hidden_units, kernel_initializer='he_uniform', activation='relu'))
model.add(BatchNormalization())
model.add(Dense(units=1, kernel_initializer='he_uniform', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_trn, y_train, epochs=50, validation_split=0.3,
                    callbacks=[early_stopping], batch_size=30)

# Training vs. validation loss per epoch.
history_df = pd.DataFrame(history.history)

plt.plot(history_df.loc[:, ['loss']], "#BDE2E2", label='Training loss')
plt.plot(history_df.loc[:, ['val_loss']], "#C2C4E2", label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="best")

In [None]:
# Hard labels from the predicted probabilities (0.5 cut-off), then the
# confusion matrix and the per-class report on the validation split.
predicted_probs = model.predict(X_val)
y_pred = predicted_probs > 0.5
cf_matrix = confusion_matrix(y_valid, y_pred)
print(classification_report(y_valid, y_pred))

# NOTE(review): the original remark here ("recall rate is poor") matches the
# one in the first evaluation cell, while the closing note of the notebook
# says recall improved -- confirm against the report printed above.

In [None]:
# Even though accuracy is lower than on the original (non-resampled) data set, recall has improved significantly.
# Thank you.

# Upvote it if you find it helpful.