<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Do I Need An Umbrella tomorrow?
</p>
</div> 

### <span style="font-family: Arial;font-size:1.2em;color:#333333">About data set:
* <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">This data set is about daily weather observations from many locations across Australia; we need to predict whether it will rain tomorrow or not 

### <span style="font-family: Arial;font-size:1.2em;color:#333333">Objective:- 
* <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">Predict next-day rain by training classification models on the target variable RainTomorrow

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 20px;
          color:white;
          text-align:center;">Index
</p>
</div>

* <a href="#Packages">Packages</a> 
* <a href="#Missing-values">Missing values</a>
* <a href="#EDA">EDA</a>
* <a href="#Data-cleaning">Data cleaning</a>
* <a href="#Null-values-imputation">Null values imputation</a>
* <a href="#Outliers">Outliers</a>
* <a href="#Normalize-data">Normalize data</a>
* <a href="#Models">Models</a>
* <a href="#Decision-Tree-based-models">Decision Tree based models</a>
* <a href="#models-comparison">models comparison</a>

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 20px;
          color:white;
          text-align:center;">Packages
</p>
</div>

# Packages

In [None]:
# packages

import numpy as np 
import pandas as pd 

# visual
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# missing data
import missingno as msno

# data imputation
from sklearn.impute import KNNImputer

# to split dataset to train and test
from sklearn.model_selection import train_test_split

# Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Class imbalance (oversampling)
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE 

# ANN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.neural_network import MLPClassifier

# ML
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from catboost import CatBoostClassifier, Pool, cv

# ML classifier model Evaluation
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score ,roc_curve,auc
from sklearn.metrics import accuracy_score, recall_score, precision_score, plot_confusion_matrix

# MISC
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# read data: load the weatherAUS CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
df.head(4)

In [None]:
df.info()

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Missing values in data set
</p>
</div>

# Missing values

#### <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">

In [None]:
# percentage of missing values per column, highest first
null_share = df.isnull().sum().mul(100).div(len(df))
missing_value = null_share.reset_index()
missing_value.columns = ['variables', 'missing values in percentage']
missing_value = missing_value.sort_values(by='missing values in percentage', ascending=False)

# bar chart of the missing-value share, each bar labelled with its percentage
fig = px.bar(
    missing_value,
    x='variables',
    y='missing values in percentage',
    text='missing values in percentage',
    title='Missing values % in each column',
    template='none',
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### <span style="font-family: Arial;font-size:1.2em;color:#333333"> Inference

* <span style="font-family: Arial;font-size:1.2em;color:#333333">Most of the columns have missing values (NaNs), and in some of them more than 40% of the values are missing
* <span style="font-family: Arial;font-size:1.2em;color:#333333"> Sunshine variable has 48% of missing values in the data set	
* <span style="font-family: Arial;font-size:1.2em;color:#333333">Either we can use a simple imputer to fill the columns with the mean or median, or we can use KNN 
* <span style="font-family: Arial;font-size:1.2em;color:#333333">The k-nearest neighbors (KNN) uses the distance metrics to predict the missing value and impute the values


<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">EDA
</p>
</div>

# EDA


In [None]:
# stats: summary statistics of the numeric columns, rounded to two decimals
round(df.describe(),2)

In [None]:
# numeric subset plus the target, so the pair plot can be coloured by outcome
pairplot_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Sunshine', 'WindGustSpeed',
                    'WindSpeed3pm', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                    'RainTomorrow']
plot_rain = df[pairplot_columns]

sns.pairplot(data=plot_rain, hue='RainTomorrow', diag_kind='kde');

## <span style="font-family: Arial;font-size:1.2em;color:#333333"> Inference

* <span style="font-family: Arial;font-size:1.2em;color:#333333">Most of the variables are skewed and closely related to each other
* <span style="font-family: Arial;font-size:1.2em;color:#333333">Some of the variables appear to follow an exponential distribution, in other words they are time related 

# Skewness

* <span style="font-family: Arial;font-size:1.2em;color:#333333"> Skewness refers to a distortion of data, that deviates from the normal distribution or a bell curve

In [None]:
# sample skewness of each numeric weather feature, printed one per line

features = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
]

for name in features:
    print(name, 'skewness is', df[name].skew())

In [None]:
# Histogram, box plot and violin plot for every column in `features`
# (the list defined in the skewness cell above).
# Blue vertical line = mean, red vertical line = median.

# plot styling is global, so set it once before any figure is created
sns.set_style('white')
sns.set_context(context = 'notebook',font_scale=1)

for i in features:

    fig, ax = plt.subplots(1,3,figsize=(12, 3))

    # The series is passed as x= so that the box and violin plots are drawn
    # horizontally and share the value axis the mean/median lines refer to.
    # `bins` is a histogram option only, so it is not passed to violinplot.
    sns.distplot(df[i],bins=10,color='red',kde=False,ax=ax[0])
    sns.boxplot(x=df[i], ax=ax[1],color='#ff4e50')
    sns.violinplot(x=df[i],color='#1ebbd9',ax=ax[2])

    # computed once per feature instead of once per panel
    mean_value = df[i].mean()
    median_value = df[i].median()

    for panel in ax:
        panel.title.set_text(i)
        panel.axvline(mean_value, color='b', linewidth=1)
        panel.axvline(median_value, color='r', linewidth=1)

    plt.tight_layout()

## <span style="font-family: Arial;font-size:1.2em;color:#333333">Inference

* <span style="font-family: Arial;font-size:1.2em;color:#333333">In the box plots above we can observe that outliers are present in the dataset
* <span style="font-family: Arial;font-size:1.2em;color:#333333">Most of the variables are positively skewed
* <span style="font-family: Arial;font-size:1.2em;color:#333333">Scaling is required for the dataset 
* <span style="font-family: Arial;font-size:1.2em;color:#333333">The blue line indicates the mean and the red line the median of the data; we can observe that a few variables are normally distributed and the rest are skewed

## Correlation

In [None]:
# Pairwise correlation of the numeric columns, shown as a lower triangle.
# Non-numeric (object) columns are excluded explicitly: recent pandas versions
# raise on them instead of silently dropping them.
corr_matrix = df.select_dtypes(include='number').corr()

# boolean mask hiding everything strictly above the diagonal; built from ones
# so that a correlation of exactly 0 is masked like any other cell
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

plt.figure(figsize=(14,9))
sns.heatmap(corr_matrix,annot=True,cmap='Blues',mask=upper_triangle);
plt.title('Correlation');
plt.tight_layout()

*  <span style="font-family: Arial;font-size:1.2em;color:#333333">MaxTemp variable is highly correlated with Temp9am and with Temp3pm

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Data cleaning
</p>
</div>

# Data cleaning
## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">   


In [None]:
# visual check to ensure null values are not represented with other values like -999, -1, ?, -111


#for feature in df.columns:
#    print('*******','Column name:',feature,'*******')
#    print(df[feature].unique())
#    print('***********-end-***********')
#    print(' ')

In [None]:
# to date_time: parse the Date column into pandas datetime values
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# checking for duplicates: `sim` flags every row that repeats an earlier row,
# and the sum is the number of such duplicated rows

sim = df.duplicated() 
sim.sum()

In [None]:
# keep only the rows where the target RainTomorrow is known
# (the author reports these nulls as roughly 2.2% of the data)
df = df[df['RainTomorrow'].notnull()]

In [None]:
# keep only the rows where RainToday is known
# (the author reports these nulls as roughly 2.2% of the data)
df = df[df['RainToday'].notnull()]

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Null values imputation
</p>
</div>

# Null values imputation
    
    
#### <span style="font-family: Arial;font-size:1.2em;color:#0e92ea"> Null values imputation based on the percentage of missing values we can impute or drop some null values in variables
* <span style="font-family: Arial;font-size:1.2em;color:#333333"> For 'RainToday' and 'RainTomorrow' we can drop the rows with null values, as they make up just around 4.4% of the data set


In [None]:
# knn imputer: fills a missing value from its 3 nearest neighbours
knn_imputer = KNNImputer(n_neighbors=3)

In [None]:
# impute missing values in the numeric columns using KNNImputer
#
# NOTE(review): the imputer is fitted on one column at a time, so it has no
# other feature to measure neighbour distance with. Per the scikit-learn docs
# it then falls back to the column average, i.e. this is effectively mean
# imputation. Fitting once on df[list_impute] would give real KNN imputation,
# at a noticeably higher runtime cost.

# each column appears exactly once ('Evaporation' used to be listed twice,
# which made this loop and the outlier loop process it a second time)
list_impute = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Temp9am',
       'Temp3pm','Cloud9am', 'Cloud3pm']

for i in list_impute:

    df[i] = knn_imputer.fit_transform(df[[i]])

In [None]:
# fill the gaps in each categorical column with that column's most frequent value

cat_impute = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Location']

for col in cat_impute:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# null values check: remaining null count per column
# Sunshine is left untreated because it is not used as a model feature later on
df.isnull().sum()

In [None]:
# replace every object (string) column by the integer codes of its categories

object_columns = [name for name in df.columns if df[name].dtype == 'object']

for feature in object_columns:
    df[feature] = pd.Categorical(df[feature]).codes

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Outliers
</p>
</div>

# Outliers
#### <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">Outliers in a data set are extreme values in the variables 


In [None]:
# box plots of the columns BEFORE the outliers are capped
# (the title used to read 'Outlier treated', which only applies to the second plot)
df.plot(kind='box',figsize=(12,6))
plt.xticks(rotation=70);
plt.title('Outliers in df before treatment');

In [None]:

def remove_outlier(col):
    """Return the 1.5 * IQR outlier fences for a numeric column.

    Despite its name this function removes nothing: it only computes the
    bounds, and the caller decides what to do with values outside them.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse. NaN entries are ignored.

    Returns
    -------
    tuple of (float, float)
        ``(lower_range, upper_range)``, where
        ``lower_range = Q1 - 1.5 * IQR`` and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # nanpercentile instead of percentile: a single NaN would otherwise turn
    # both fences into NaN. No sorting is needed to compute percentiles.
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# cap values that lie outside the IQR fences at the nearest fence

for column in df[list_impute].columns:
    lr, ur = remove_outlier(df[column])
    df[column] = df[column].clip(lower=lr, upper=ur)

In [None]:
# box plots again, now that the extreme values have been capped
box_ax = df.plot(kind='box', figsize=(12, 6))
plt.xticks(rotation=70);
box_ax.set_title('Outlier treated in df');

In [None]:
# 'MaxTemp' is left out because it is highly correlated with the other
# temperature columns, and 'Sunshine' because about 48% of its values are missing

X = df[['MinTemp', 'Rainfall', 'Evaporation',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm','WindGustDir', 'WindDir9am', 'WindDir3pm','Location','RainToday']]

# plain column access instead of df.pop(): pop deletes the column from df, so
# running this cell a second time failed with a KeyError
y = df['RainTomorrow']


In [None]:
# Data split: 70% train / 30% test with a fixed seed, stratified on y so both
# parts keep the same class ratio

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=0,stratify=y)

In [None]:
X_train.shape, X_test.shape

In [None]:
# class proportions of the training target (the positional 1 is normalize=True)
y_train.value_counts(1)

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Scaling
</p>
</div>

# Normalize data
#### <span style="font-family: Arial;font-size:1.2em;color:#0e92ea"> Scaling is the process of standardizing the independent features present in the data

In [None]:
# Min-max scaling: the scaler learns each feature's range from the training
# split only and the same transform is then applied to the test split
sc = MinMaxScaler()
sc.fit(X_train)

x_train = sc.transform(X_train)
x_test = sc.transform(X_test)

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:250%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 15px;
          color:white;
          text-align:center;">Models
</p>
</div>

# Models

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">i. Naive Bayes 

In [None]:
# fit Gaussian Naive Bayes on the scaled training data
NaiveBayse_model = GaussianNB().fit(x_train, y_train)

y_predict_NaiveBayse = NaiveBayse_model.predict(x_test)

# mean accuracy on each split
NaiveBayse_model_score_train = NaiveBayse_model.score(x_train, y_train)
NaiveBayse_model_score = NaiveBayse_model.score(x_test, y_test)

print('Accuracy on Test set', NaiveBayse_model_score)
print('Accuracy on Train set', NaiveBayse_model_score_train)

# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=y_test, y_pred=y_predict_NaiveBayse))



In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

sns.set_style('dark')
sns.set_context(context='notebook', font_scale=1)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'NaiveBayse on Test'),
    (ax[1], x_train, y_train, 'NaiveBayse on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(NaiveBayse_model, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.grid(False)
plt.tight_layout();

In [None]:
# probability assigned to the second class (column 1 of predict_proba)
NB_probs_train = NaiveBayse_model.predict_proba(x_train)[:, 1]
NB_probs_test = NaiveBayse_model.predict_proba(x_test)[:, 1]

# ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, NB_probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, NB_probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
roc_panels = (
    (ax1, fpr_train, tpr_train, 'ROC_Curve on training set for Naive Bayse'),
    (ax2, fpr_test, tpr_test, 'ROC_Curve on test set for Naive Bayse'),
)
for roc_ax, roc_fpr, roc_tpr, roc_title in roc_panels:
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.plot(roc_fpr, roc_tpr, marker='.')
    roc_ax.set_title(roc_title)

# area under the test-set ROC curve
auc_NaiveBayse_test = auc(fpr_test, tpr_test)
print('AUC on test set', auc_NaiveBayse_test)

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">ii. LogisticRegression

In [None]:
# logistic regression with a raised iteration cap, fitted on the scaled training data
LR_model = LogisticRegression(max_iter=1000).fit(x_train, y_train)

y_predict_LR = LR_model.predict(x_test)

# mean accuracy on each split
LR_model_score_train = LR_model.score(x_train, y_train)
LR_model_score = LR_model.score(x_test, y_test)

print('Accuracy on Test set', LR_model_score)
print('Accuracy on Train set', LR_model_score_train)
print(confusion_matrix(y_true=y_test, y_pred=y_predict_LR))

In [None]:
# probability assigned to the second class (column 1 of predict_proba)
probs_train = LR_model.predict_proba(x_train)
probs_train = probs_train[:, 1]

probs_test = LR_model.predict_proba(x_test)
probs_test = probs_test[:, 1]

# plot: ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(12,4))

ax1.plot([0, 1], [0, 1], linestyle='--');
ax1.plot(fpr_train, tpr_train, marker='.');
ax1.set_title('ROC_Curve on training set for logistic Regression');

ax2.plot([0, 1], [0, 1], linestyle='--');
ax2.plot(fpr_test, tpr_test, marker='.');
ax2.set_title('ROC_Curve on test set for logistic Regression');

# stored under its own name: this cell used to overwrite auc_NaiveBayse_test
# with the logistic-regression value
auc_LR_test = auc(fpr_test, tpr_test)
print('AUC on test set',auc_LR_test)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

sns.set_style('dark')
sns.set_context(context='notebook', font_scale=1)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'logistic Regression on Test'),
    (ax[1], x_train, y_train, 'logistic Regression on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(LR_model, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.grid(False)
plt.tight_layout();

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">iii. Linear Discriminant Analysis

In [None]:
# linear discriminant analysis fitted on the scaled training data
LDA_model = LinearDiscriminantAnalysis().fit(x_train, y_train)

y_pred_LDA = LDA_model.predict(x_test)

# mean accuracy on each split
LDA_model_score_train = LDA_model.score(x_train, y_train)
LDA_model_score = LDA_model.score(x_test, y_test)

print('Accuracy on Test set', LDA_model_score)
print('Accuracy on Train set', LDA_model_score_train)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_LDA))

In [None]:
# probability assigned to the second class (column 1 of predict_proba)
probs_train = LDA_model.predict_proba(x_train)[:, 1]
probs_test = LDA_model.predict_proba(x_test)[:, 1]

# ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
roc_panels = (
    (ax1, fpr_train, tpr_train, 'ROC_Curve on training set for LDA_model'),
    (ax2, fpr_test, tpr_test, 'ROC_Curve on test set for LDA_model'),
)
for roc_ax, roc_fpr, roc_tpr, roc_title in roc_panels:
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.plot(roc_fpr, roc_tpr, marker='.')
    roc_ax.set_title(roc_title)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

sns.set_style('dark')
sns.set_context(context='notebook', font_scale=1)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'LDA_model on Test'),
    (ax[1], x_train, y_train, 'LDA_model on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(LDA_model, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.grid(False)
plt.tight_layout();

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 20px;
          color:white;
          text-align:center;">Decision Tree based models
</p>
</div>

# Decision Tree based models

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">i. Random Forest

In [None]:
# random forest hyper-parameters, gathered in one place
rf_params = dict(
    random_state=0,
    max_depth=10,
    max_features=5,
    min_samples_leaf=30,
    min_samples_split=100,
    n_estimators=500,
)
RF_model = RandomForestClassifier(**rf_params)
RF_model.fit(x_train, y_train)

y_pred_RF = RF_model.predict(x_test)

# mean accuracy on each split
model_score_RF_train = RF_model.score(x_train, y_train)
model_score_RF = RF_model.score(x_test, y_test)

print('Accuracy on Test set', model_score_RF)
print('Accuracy on Train set', model_score_RF_train)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_RF))

In [None]:
# probability assigned to the second class (column 1 of predict_proba)
probs_train = RF_model.predict_proba(x_train)[:, 1]
probs_test = RF_model.predict_proba(x_test)[:, 1]

# ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
roc_panels = (
    (ax1, fpr_train, tpr_train, 'ROC_Curve on training set for Random Forest'),
    (ax2, fpr_test, tpr_test, 'ROC_Curve on test set for Random Forest'),
)
for roc_ax, roc_fpr, roc_tpr, roc_title in roc_panels:
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.plot(roc_fpr, roc_tpr, marker='.')
    roc_ax.set_title(roc_title)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

sns.set_style('dark')
sns.set_context(context='notebook', font_scale=1)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'Random Forest on Test'),
    (ax[1], x_train, y_train, 'Random Forest on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(RF_model, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.grid(False)
plt.tight_layout();

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">ii. Catboost

In [None]:
# wrap the scaled training data and its labels in a CatBoost Pool
M_pool = Pool(data=x_train, label=y_train)

# 500 boosting iterations at learning rate 0.01, with training output silenced
cb_params = {'verbose': False, 'iterations': 500, 'learning_rate': 0.01}
model_cb = CatBoostClassifier(**cb_params)
model_cb.fit(M_pool, plot=False, silent=True);

# mean accuracy on each split
model_cb_score_train = model_cb.score(x_train, y_train)
model_cb_score = model_cb.score(x_test, y_test)

print('Accuracy on Test set', model_cb_score)
print('Accuracy on Train set', model_cb_score_train)

In [None]:
# probability assigned to the second class (column 1 of predict_proba)
probs_train = model_cb.predict_proba(x_train)[:, 1]
probs_test = model_cb.predict_proba(x_test)[:, 1]

# ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
roc_panels = (
    (ax1, fpr_train, tpr_train, 'ROC_Curve on training set for Catboost'),
    (ax2, fpr_test, tpr_test, 'ROC_Curve on test set for Catboost'),
)
for roc_ax, roc_fpr, roc_tpr, roc_title in roc_panels:
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.plot(roc_fpr, roc_tpr, marker='.')
    roc_ax.set_title(roc_title)
    

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

sns.set_style('dark')
sns.set_context(context='notebook', font_scale=1)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'Catboost on Test'),
    (ax[1], x_train, y_train, 'Catboost on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(model_cb, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.grid(False)
plt.tight_layout();

<div style="color:white;
           display:fill;
           border-radius:15px;
           background-color:#189ad3;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.5px">

<p style="padding: 20px;
          color:white;
          text-align:center;">Artificial neural network ANN
</p>
</div>

## <span style="font-family: Arial;font-size:1.2em;color:#0e92ea">i. Mlpclassifier
#### <span style="font-family: Arial;font-size:1.2em;color:#333333">A multilayer perceptron (MLP) is a class of feedforward artificial neural network

In [None]:
# MLP classifier: one hidden layer of 484 units trained with the Adam solver
mlp_settings = dict(
    hidden_layer_sizes=484,
    max_iter=5000,
    solver='adam',
    verbose=False,
    random_state=1,
    tol=0.001,
)
mlpcl = MLPClassifier(**mlp_settings)

# train on the scaled training split (the fitted estimator is the cell output)
mlpcl.fit(x_train, y_train)

In [None]:
# class predictions for both splits
mlpcl_test_predict = mlpcl.predict(x_test)
mlpcl_train_predict = mlpcl.predict(x_train)

# mean accuracy on each split
mlp_model_score_train = mlpcl.score(x_train, y_train)
mlp_model_score = mlpcl.score(x_test, y_test)

print('Accuracy on Test set', mlp_model_score)
print('Accuracy on Train set', mlp_model_score_train)

In [None]:

# probability assigned to the second class (column 1 of predict_proba)
mlp_probs_train = mlpcl.predict_proba(x_train)[:, 1]
mlp_probs_test = mlpcl.predict_proba(x_test)[:, 1]

# ROC curves for both splits, with the diagonal as a dashed reference line
plt.style.use('seaborn-whitegrid')
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, mlp_probs_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, mlp_probs_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
roc_panels = (
    (ax1, fpr_train, tpr_train, 'ROC_Curve on training set for MLP'),
    (ax2, fpr_test, tpr_test, 'ROC_Curve on test set for MLP'),
)
for roc_ax, roc_fpr, roc_tpr, roc_title in roc_panels:
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.plot(roc_fpr, roc_tpr, marker='.')
    roc_ax.set_title(roc_title)
    

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4));

sns.set_style('dark');
sns.set_context(context='notebook', font_scale=1);
plt.grid(False)

# row-normalised confusion matrices: test split on the left, training split on the right
cm_panels = (
    (ax[0], x_test, y_test, 'MLP on Test'),
    (ax[1], x_train, y_train, 'MLP on Train'),
)
for cm_ax, cm_X, cm_y, cm_title in cm_panels:
    plot_confusion_matrix(mlpcl, cm_X, cm_y, cmap='Blues', normalize='true', ax=cm_ax)
    cm_ax.title.set_text(cm_title)

plt.tight_layout();

# ii. ANN

In [None]:
# early stopping: halt once the validation loss has not improved for 22 epochs.
# The monitored quantity is a loss, so "improved" has to mean "got smaller"
# (mode='min'); with mode='max' a falling loss was counted as no improvement.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=22)

# ANN
model =  Sequential()

# ANN layers: four ReLU hidden layers followed by one sigmoid output unit
model.add(Dense(units=21,activation='relu'))

model.add(Dense(units=21,activation='relu'))

model.add(Dense(units=15,activation='relu'))

model.add(Dense(units=5,activation='relu'))

model.add(Dense(units=1,activation='sigmoid'))

# compile ANN
model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Train ANN
# NOTE(review): the test split doubles as validation data here, so early
# stopping is driven by the same data the model is scored on afterwards
model.fit(x=x_train, 
          y=y_train, 
          epochs=120,
          validation_data=(x_test, y_test), verbose=1,
          callbacks=[early_stop]
          )


In [None]:
# per-epoch metrics recorded during training, as DataFrames
loss_plot = pd.DataFrame(model.history.history)
accuracy_plot = pd.DataFrame(model.history.history)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# left panel: training vs validation loss
ax1.plot(loss_plot[['loss']], label='Training loss')
ax1.plot(loss_plot[['val_loss']], label='Validation loss')
ax1.set(title='Training and Validation loss', xlabel='epochs', ylabel='Loss')
ax1.legend(loc="best");

# right panel: training vs validation accuracy
ax2.plot(accuracy_plot[['accuracy']], label='Training_accuracy')
ax2.plot(accuracy_plot[['val_accuracy']], label='Validation_accuracy')
ax2.set(title='Training_and_Validation_accuracy', xlabel='epochs', ylabel='accuracy')
ax2.legend(loc="best");

In [None]:
# network outputs for the test split, flattened to one dimension
y_pred_keras = np.ravel(model.predict(x_test))

# ROC curve and the area under it
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)
auc_keras = auc(fpr_keras, tpr_keras)

print("AUC on test set for ANN", auc_keras)

# models comparison

In [None]:
# Side-by-side comparison of the fitted classifiers, best test accuracy first.
# The 'AUC' column used to be an empty list, which makes the DataFrame
# constructor fail because every column must have the same length. It now
# holds the test-set ROC AUC computed from each model's class-1 probabilities.
compared_models = {
    'MLP': mlpcl,
    'Catboost': model_cb,
    'Random Forest': RF_model,
    'LDA': LDA_model,
    'Logistic Regression': LR_model,
    'Naive Bayse': NaiveBayse_model,
}

Accuracy_score = pd.DataFrame({
    'Models': list(compared_models),
    'Accuracy on Test set': [mlp_model_score, model_cb_score, model_score_RF,
                             LDA_model_score, LR_model_score, NaiveBayse_model_score],
    'Accuracy on Training set': [mlp_model_score_train, model_cb_score_train, model_score_RF_train,
                                 LDA_model_score_train, LR_model_score_train, NaiveBayse_model_score_train],
    'AUC': [roc_auc_score(y_test, fitted.predict_proba(x_test)[:, 1])
            for fitted in compared_models.values()],
})

Accuracy_score = Accuracy_score.sort_values('Accuracy on Test set',ascending=False)

In [None]:
Accuracy_score

## Thanks!!
## upvote if you like it and feel free to post any suggestions