# Import Libraries

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

# Problem Statement

Predict whether it is going to rain tomorrow or not based on today's weather report

# Import Dataset

In [None]:
# Load the weather observations CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks environment-specific (presumably a Kaggle
# dataset mount) — adjust it when running the notebook elsewhere.
raw_df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
raw_df

In [None]:
raw_df.info()

Total Columns = 23<br>
Date is irrelevant feature<br>
RainTomorrow is the target feature<br>
So, 21 features and one target feature<br>
In the dataset, there are missing values for some rows which can be preprocessed.<br>

<b>Note :</b>There are also missing values in the target column 'RainTomorrow', <br>
so those rows for which there are null values in target column 'RainTomorrow' are to be removed.<br>

Also, the feature 'RainToday' is likely to be very closely related to the target variable.<br>
So, we consider this hypothesis and also remove the rows where 'RainToday' is null

In [None]:
# Keep only the rows where both 'RainToday' and the target 'RainTomorrow' are known.
raw_df = raw_df.dropna(subset=['RainToday', 'RainTomorrow'])

In [None]:
raw_df.info()

The number of rows are reduced from 145460 to 140787 <br>
Also, no null values in the columns RainToday, RainTomorrow

# Data Visualization & Analysis

In [None]:
# Configure the global visualisation style: seaborn dark grid, larger font,
# 10x6 figures and a fully transparent figure background (alpha = 00).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

## Location Distribution

In [None]:
# Number of observations per location, split by whether it rained on that day.
fig = px.histogram(
    raw_df,
    x='Location',
    color='RainToday',
    marginal='box',
    title='Distribution of Location',
)
fig.update_layout(bargap=0.1)
fig.show()

Uniform Distribution<br>
Above 20% there were rain in all cities<br>
Nhil, Katherine, Uluru has lesser values than other cities.<br>
This might be probably due to no weather stations or data lost, or any other factors<br>

Therefore, Location is a factor for rainfall

## Minimum Temperature

In [None]:
fig = px.histogram(raw_df, x='MinTemp', title='Minimum Temperature Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the minimum temperature is, there is more possible of ***no rain*** on the next day.<br>
There is less possibility to rain on the next day if the minimum temperature on the previous day is around 5 to 25<br>

## Maximum Temperature

In [None]:
fig = px.histogram(raw_df, x='MaxTemp', title='Maximum Temperature Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the maximum temperature is, there is more possible of ***no rain*** on the next day.<br>
There are rain on the next day if the maximum temperature on the previous day is around 10 to 35 <br>

## Rainfall Distribution - Rain Tomorrow Vs Rain Today
We assumed a hypothesis that if there is rain on a day it is more likely to rain on the next day too<br>

In [None]:
fig = px.histogram(raw_df, x='RainTomorrow', title='Rain Tomorrow Vs Rain Today', marginal='box', color='RainToday')
fig.update_layout(bargap=0.1)
fig.show()

Out of 140787 samples, around 110k had no rain the next day<br>
but only around 30k had rain the next day.<br>
This is class imbalance, i.e. the classes of the target variable are not equally represented, which may lead to bias towards predicting no rain tomorrow.

There were 92.728k cases with no rain today and tomorrow<br>
There were 16.858k cases with rain today and no rain tomorrow<br>
There were 16.604k cases with no rain today and rain tomorrow<br>
There were 14.597k cases with rain today and tomorrow<br>

So, there is a high chance that it will not rain tomorrow if it didn't rain today<br>
But it is not the case that it will rain tomorrow if it did rain today<br>
i.e., when it rains today, rain and no rain tomorrow are almost equally likely (14.597k vs 16.858k cases).<br>

<b>So it is easy to predict rain tomorrow as No but not easy to predict rain tomorrow as Yes (when rain today is no)</b>

## Evaporation Distribution

In [None]:
# Evaporation histogram coloured by the next-day rain outcome, with a box-plot marginal.
fig = px.histogram(raw_df, x='Evaporation', title='Evaporation Distribution', color='RainTomorrow', marginal='box')
# Disabled alternative: the same plot restricted to Evaporation < 10 (zoomed-in view).
#fig = px.histogram(raw_df[raw_df.Evaporation < 10], x='Evaporation', title='Evaporation Distribution', color='RainTomorrow', marginal='box')
# Smaller bar gap than the other histograms in this notebook (0.01 instead of 0.1).
fig.update_layout(bargap=0.01)
fig.show()

Whatever the evaporation rate, there is more possibility of ***no rain*** on the next day<br>
Also most commonly evaporation rate in which there is a low probability to rain on the next day is in the range is 0.6 - 8.

## Sunshine Distribution

In [None]:
# Sunshine histogram coloured by the next-day rain outcome, with a box-plot marginal.
fig = px.histogram(raw_df, x='Sunshine', title='Sunshine Distribution', color='RainTomorrow', marginal='box')
# Disabled alternative: the same plot restricted to Sunshine < 8 (zoomed-in view).
#fig = px.histogram(raw_df[raw_df.Sunshine <8], x='Sunshine', title='Sunshine Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

From the histogram, as the luminous intensity of the sun increases it is more probable that it will ***not rain*** on the next day.<br>
And when the luminous intensity of the sun is low it is more probable ***to rain*** on the next day

## Wind Gust Direction
This is a categorical value with 16 categories of direction

In [None]:
fig = px.histogram(raw_df, x='WindGustDir', title='Wind Gust Direction Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind gust direction is, there is more possibility of ***no rain*** on the next day<br>
Also for every of the 16 directions, there is lower probability that there will be rain on the next day.

## Wind Gust Speed Distribution
A sudden burst in wind speed is called the wind gusts

In [None]:
fig = px.histogram(raw_df, x='WindGustSpeed', title='Wind Gust Speed Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind gust speed, there is more possibility of ***no rain*** on the next day<br>
Also most commonly wind gust speed is in the range is 24 - 65 for which there is low probability to rain on the next day.

## Wind Direction@9AM
This is a categorical value with 16 categories of direction

In [None]:
fig = px.histogram(raw_df, x='WindDir9am', title='Wind Direction@9am Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind direction at 9am is, there is more possibility of ***no rain*** on the next day<br>
Also for every of the 16 directions, there is lower probability that there will be rain on the next day.

## Wind Direction@3PM
This is a categorical value with 16 categories of direction

In [None]:
fig = px.histogram(raw_df, x='WindDir3pm', title='Wind Direction@3pm Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind direction at 3pm is, there is more possibility of ***no rain*** on the next day<br>
Also for every of the 16 directions, there is lower probability that there will be rain on the next day.

## WindSpeed@9am Distribution

In [None]:
fig = px.histogram(raw_df, x='WindSpeed9am', title='WindSpeed@9am Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind speed at 9am, there is more possibility of ***no rain*** on the next day.<br>
Also most commonly wind speed at 9am is in the range is 0 - 28 for which there is low probability to rain on the next day

## WindSpeed@3PM Distribution

In [None]:
fig = px.histogram(raw_df, x='WindSpeed3pm', title='WindSpeed@3pm Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the wind speed at 3pm, there is more possibility of ***no rain*** on the next day.<br>
Also most commonly wind speed at 3pm is in the range is 7 - 31 for which there is low probability to rain on the next day

## Humidity@9AM Distribution

In [None]:
fig = px.histogram(raw_df, x='Humidity9am', title='Humidity@9am Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the humidity at 9am, there is more possibility of ***no rain*** on the next day.<br>
Also most commonly humidity at 9am is in the range is 60 - 100 for which there is low probability to rain on the next day

## Humidity@3PM Distribution

In [None]:
fig = px.histogram(raw_df, x='Humidity3pm', title='Humidity@3pm Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whenever the humidity at 3pm is in the range 0 - 78, there is more possibility of ***no rain*** on the next day.<br>
Whenever the humidity at 3pm is above 78, there is more possibility of ***raining*** on the next day.<br>

## Pressure@9AM Distribution

In [None]:
fig = px.histogram(raw_df, x='Pressure9am', title='Pressure@9am Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the pressure at 9am, there is more possibility of ***no rain*** on the next day.<br>
Also most commonly pressure at 9am is in the range is 1000 - 1030 for which there is low probability to rain on the next day

## Pressure@3PM Distribution

In [None]:
fig = px.histogram(raw_df, x='Pressure3pm', title='Pressure@3pm Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the pressure at 3pm, there is more possibility of ***no rain*** on the next day.<br>
Also most commonly pressure at 3pm is in the range is 1000 - 1030 for which there is low probability to rain on the next day

## Cloud@9AM Distribution

In [None]:
fig = px.histogram(raw_df, x='Cloud9am', title='Cloud@9am Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the cloud at 9am, there is more possibility of ***no rain*** on the next day.<br>
As cloud at 9am(6-8) increases probability of raining on the next day increases

## Cloud@3PM Distribution

In [None]:
fig = px.histogram(raw_df, x='Cloud3pm', title='Cloud@3pm Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the cloud at 3pm, there is more possibility of ***no rain*** on the next day.<br>
As cloud at 3pm increases probability of raining on the next day increases.<br>
For cloud at 3pm = 8, there is more probability of raining on the next day

## Temperature@9AM Distribution

In [None]:
fig = px.histogram(raw_df, x='Temp9am', title='Temperature@9AM Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Whatever the Temperature at 9am, there is more possibility of no rain on the next day.<br>
Also most commonly temperature at 9am is in the range is 10 - 30 for which there is low probability to rain on the next day.

## Temperature@3PM Distribution

In [None]:
fig = px.histogram(raw_df, x='Temp3pm', title='Temperature@3PM Distribution', color='RainTomorrow', marginal='box')
fig.update_layout(bargap=0.1)
fig.show()

Gaussian / Normal Distribution<br>
It seems for Lower the temperature, more cases of rain the next day<br>
Also there are some cases when temperature is high and there was rain the next day

## Min Temp Vs Max Temp

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.scatter(raw_df.sample(2000, random_state=42), x='MinTemp', y='MaxTemp', title='MinTemp Vs MaxTemp', color='RainToday')

***Note***<br>
For RainToday=Yes, Minimum temperature and maximum temperature are nearer.<br>
For RainToday=No, Minimum temperature and maximum temperature are not nearer.<br>

When there is rain, variation in temperature is small (i.e,maximum temperature is nearer to minimum temperature)

## Rainfall Vs Evaporation

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.scatter(raw_df.sample(2000, random_state=42), x='Rainfall', y='Evaporation', title='Rainfall Vs Evaporation', color='RainToday')

As rainfall increases, evaporation decreases<br>
When rainfall in a day is less than or equal to 1, the day is marked RainToday = No<br>
When rainfall in a day is more than 1, the day is marked RainToday = Yes<br>

## Sunshine Vs Evaporation

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.scatter(raw_df.sample(2000, random_state=42), x='Sunshine', y='Evaporation', title='Sunshine Vs Evaporation', color='RainToday')

With increase in sunshine, the evaporation rate is almost constant<br>
With increase in sunshine, there is a more possibility to rain on the next day as<br>
the number of samples where it rained the next day increases with increase in sunshine<br>

## Temp@9AM Vs Humidity@9AM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Temp9am', y='Humidity9am', title='Temp@9AM Vs Humidity@9AM', color='RainTomorrow')

When temperature@9am increases Humidity@9am decreases for both rainTomorrow=Yes or No<br>
For higher temperature and lower humidity it is more probable that it will not rain the next day

## Temp@3PM Vs Humidity@3PM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Temp3pm', y='Humidity3pm', title='Temp@3PM Vs Humidity@3PM', color='RainTomorrow')

When temperature@3pm increases Humidity@3pm decreases for both rainTomorrow=Yes or No<br>
But during rainTomorrow=Yes, Humidity is more compared to rainTomorrow=No<br>

When temperature@3pm is low & humidity@3pm is high, there is high chance that it will rain tomorrow

## Pressure@9AM Vs Humidity@9AM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Pressure9am', y='Humidity9am', title='Pressure@9AM Vs Humidity@9AM', color='RainTomorrow')

When Pressure@9am increases Humidity@9am increases<br>
Most samples for which there will be no rain the next day have humidity > 20<br>
Most samples for which there will be rain the next day have humidity > 50<br>

## Pressure@3PM Vs Humidity@3PM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Pressure3pm', y='Humidity3pm', title='Pressure@3PM Vs Humidity@3PM', color='RainTomorrow')

Most of the samples for which the next day will rain have humidity > 30<br>
Most of the samples for which the next day will not rain have humidity > 0<br>

## Temp@9AM Vs Pressure@9AM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Temp9am', y='Pressure9am', title='Temp@9AM Vs Pressure@9AM', color='RainTomorrow')

With increase in temperature@9am pressure@9am decreases slightly<br>
For less temperature and high pressure, it is more probable that it will not rain the next day

## Temp@3PM Vs Pressure@3PM

In [None]:
# Plot a 2000-row random sample to keep the figure light; the fixed random_state makes the
# sample (and therefore the plot the notes below describe) reproducible on re-run.
px.strip(raw_df.sample(2000, random_state=42), x='Temp3pm', y='Pressure3pm', title='Temp@3PM Vs Pressure@3PM', color='RainTomorrow')

With increase in temperature@3pm pressure@3pm decreases slightly<br>
For lesser temperature and pressure it is more likely to rain the next day<br>
For higher temperature and lower pressure it is not likely to rain on the next day

# Working With Sample

In [None]:
# Optional switch for fast experimentation: work on a random fraction of the rows.
use_sample = False
sample_fraction = 0.1
if use_sample:
    # The first positional argument of DataFrame.sample is `n` (a row count), so the
    # fraction has to be passed as `frac=`; random_state keeps the subsample reproducible.
    raw_df = raw_df.sample(frac=sample_fraction, random_state=42).copy()

# Training, Validation, Test Sets

In [None]:
# Random split: hold out 20% of the rows for testing, then 25% of the remaining 80%
# for validation, i.e. 60/20/20 overall. The fixed random_state keeps the split reproducible.
# These frames are reassigned further down by the year-based split.
train_val_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
print('train_df shape :',train_df.shape)
print('val_df shape :',val_df.shape)
print('test_df shape :',test_df.shape)

***Note :*** While working with dates (time-series data), it's often a better idea to separate the training, validation and test sets<br> with time, so that the model is trained on data from the past and evaluated on data from the future.<br>

Let us analyse the years from which the dataset are from.

In [None]:
plt.title('No. of Rows per year')
sns.countplot(x=pd.to_datetime(raw_df.Date).dt.year);

We'll use data up to 2014 (inclusive) for training, 2015 for validation and 2016 & 2017 for testing,<br>
even though this does not maintain a 60-20-20 ratio.

Data from 2018, 2019 can be used for deployment (future).

In [None]:
year = pd.to_datetime(raw_df.Date).dt.year
year

In [None]:
# Chronological split on the observation year:
# before 2015 -> training, 2015 -> validation, after 2015 -> test.
train_df = raw_df.loc[year.lt(2015)]
val_df = raw_df.loc[year.eq(2015)]
test_df = raw_df.loc[year.gt(2015)]

In [None]:
print('train_df shape :',train_df.shape)
print('val_df shape :',val_df.shape)
print('test_df shape :',test_df.shape)

# Identify Input & Target Columns

In [None]:
raw_df

***Note :***
Because of the year-based split above, the dates in the validation and test sets never occur in the training set.<br> So the model cannot learn anything useful from the Date column, and it is not used as an input column.<br>

RainTomorrow is target.<br>
All other than these are input columns.

In [None]:
# Column roles are picked by position: the first column is skipped (presumably Date — see the
# note above), the last column is the target, and everything in between is a model input.
cols = train_df.columns.tolist()
target_cols = cols[-1]
input_cols = cols[1:-1]

#### Training Set

In [None]:
X_train = train_df[input_cols].copy()
Y_train = train_df[target_cols].copy()

#### Validation Set

In [None]:
X_val = val_df[input_cols].copy()
Y_val = val_df[target_cols].copy()

#### Test Set

In [None]:
X_test = test_df[input_cols].copy()
Y_test = test_df[target_cols].copy()

#### Identify Numerical & Categorical Columns

In [None]:
# Split the input columns by dtype: numeric ones get imputed and scaled,
# object (string) ones get one-hot encoded.
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

In [None]:
numeric_cols,categorical_cols

#### Statistics of Numerical Data in Training Set

In [None]:
X_train.describe()

#### Unique Categories of Categorical Data in Training Set

In [None]:
X_train[categorical_cols].nunique()

# Imputing Missing Numeric Data

Since there are some missing values, we'll get error when performing some steps in ML.

<img src="https://i.imgur.com/W7cfyOp.png" width="480">

So, we'll replace those values with average value from the column.<br>

We'll compute average from the entire set and fill it in train, val, test set individually

#### Find Total Number of NaN in Numeric Columns

In [None]:
raw_df[numeric_cols].isna().sum()

In [None]:
imputer = SimpleImputer(strategy='mean')

In [None]:
# Learn the per-column means used to fill missing numeric values.
# NOTE(review): fitting on raw_df means the means also include validation/test rows
# (data leakage); fitting on X_train[numeric_cols] alone would avoid that — confirm intent.
imputer.fit(raw_df[numeric_cols])   # fitted on raw_df, not on X_train / X_val

In [None]:
imputer.statistics_

In [None]:
# Fill the missing numeric values of every split with the fitted column means.
for split_df in (X_train, X_val, X_test):
    split_df[numeric_cols] = imputer.transform(split_df[numeric_cols])

Values Replaced by Average 

In [None]:
X_train[numeric_cols].isna().sum()

> **EXERCISE**: Apply some other imputation techniques and observe how they change the results of the model. You can learn more about other imputation techniques here: https://scikit-learn.org/stable/modules/impute.html

# Feature Scaling

In [None]:
scaler = MinMaxScaler()

In [None]:
# Learn the per-column minimum and maximum used to rescale numeric features.
# NOTE(review): fitting on raw_df means the min/max also come from validation/test rows
# (data leakage); fitting on X_train[numeric_cols] alone would avoid that — confirm intent.
scaler.fit(raw_df[numeric_cols])  # fitted on the unsplit data set

#### Scale Training Data Set

In [None]:
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_train[numeric_cols]

In [None]:
X_train[numeric_cols].describe()

#### Scaling Validation Data Set

In [None]:
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
X_val

#### Scaling Test Data Set

In [None]:
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
X_test

# Encoding Categorical Values

Since machine learning models can only be trained with numeric data, we need to convert categorical data to numbers by using techniques like one-hot encoding for categorical columns.

<img src="https://i.imgur.com/n8GuiOO.png" width="640">

One hot encoding involves adding a new binary (0/1) column for each unique category of a categorical column. 

In [None]:
raw_df[categorical_cols].nunique()

#### Encoding Location Column

In [None]:
raw_df.Location.unique()

In [None]:
# Dense one-hot output; categories not seen during fit are encoded as all zeros instead of raising.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new name first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
encoder.fit(raw_df[categorical_cols])    #fit with entire dataset

In [None]:
categorical_cols

In [None]:
encoder.categories_

#### Get Feature Names for the Encoded Columns

In [None]:
# Names of the generated one-hot columns (e.g. '<column>_<category>').
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
else:
    encoded_cols = list(encoder.get_feature_names(categorical_cols))
encoded_cols

#### Training Data Set

In [None]:
X_train[encoded_cols] = encoder.transform(X_train[categorical_cols])

#### Validation Data Set

In [None]:
X_val[encoded_cols] = encoder.transform(X_val[categorical_cols])

#### Test Data Set

In [None]:
X_test[encoded_cols] = encoder.transform(X_test[categorical_cols])

***Note :***<br>
In these data frames, both categorical columns and corresponding encoded columns are also present

# Save Processed Data

In [None]:
print('X_train:', X_train.shape)
print('Y_train:', Y_train.shape)
print('X_val:', X_val.shape)
print('Y_val:', Y_val.shape)
print('X_test:', X_test.shape)
print('Y_test:', Y_test.shape)

#### Store DataFrames as parquet format

In [None]:
X_train.to_parquet('X_train.parquet')
X_val.to_parquet('X_val.parquet')
X_test.to_parquet('X_test.parquet')

In [None]:
# Parquet stores tables, so each target Series is wrapped in a one-column frame before writing.
Y_train.to_frame().to_parquet('Y_train.parquet')
Y_val.to_frame().to_parquet('Y_val.parquet')
Y_test.to_frame().to_parquet('Y_test.parquet')

#### Read those stored data

In [None]:
X_train = pd.read_parquet('X_train.parquet')
X_val = pd.read_parquet('X_val.parquet')
X_test = pd.read_parquet('X_test.parquet')

In [None]:
Y_train = pd.read_parquet('Y_train.parquet')[target_cols]
Y_val = pd.read_parquet('Y_val.parquet')[target_cols]
Y_test = pd.read_parquet('Y_test.parquet')[target_cols]

In [None]:
print('X_train:', X_train.shape)
print('Y_train:', Y_train.shape)
print('X_val:', X_val.shape)
print('Y_val:', Y_val.shape)
print('X_test:', X_test.shape)
print('Y_test:', Y_test.shape)

# Training a Logistic Regression Model

In [None]:
model = LogisticRegression(solver ='liblinear') #liblinear optimization

In [None]:
%%time
model.fit(X_train[numeric_cols + encoded_cols], Y_train)

In [None]:
# Pair every model input column with its learned logistic-regression coefficient.
weight_df = pd.DataFrame(
    list(zip(numeric_cols + encoded_cols, model.coef_[0].tolist())),
    columns=['feature', 'weight'],
)

In [None]:
plt.figure(figsize=(10,50))
sns.barplot(data=weight_df, x='weight', y='feature');

## Top 10 Important Features

In [None]:
# Rank features by the absolute value of their coefficient: a strongly negative weight
# influences the prediction as much as a strongly positive one, so it counts as important too.
top_features = weight_df.loc[weight_df['weight'].abs().nlargest(10).index]
sns.barplot(data=top_features, x='weight', y='feature');

# Making Predictions & Evaluations

In [None]:
X_train = X_train[numeric_cols + encoded_cols]
X_val = X_val[numeric_cols + encoded_cols]
X_test = X_test[numeric_cols + encoded_cols]

In [None]:
train_pred = model.predict(X_train)
train_probs = model.predict_proba(X_train)

In [None]:
train_pred

In [None]:
train_probs

In [None]:
Y_train

#### Accuracy

In [None]:
print('Accuracy =',accuracy_score(Y_train, train_pred))

The model achieves an accuracy of 85.1% on the training set. We can visualize the breakdown of correctly and incorrectly classified inputs using a confusion matrix.

<img src="https://i.imgur.com/UM28BCN.png" width="480">

#### Confusion Matrix - Training Data Set

In [None]:
cf = confusion_matrix(Y_train, train_pred, normalize='true')

In [None]:
print('Accuracy =',accuracy_score(Y_train, train_pred)*100)
plt.figure();
sns.heatmap(cf, annot=True);
plt.title('Training Confusion Matrix');
plt.xlabel('Prediction');
plt.ylabel('Target');

#### Confusion Matrix - Validation Data Set

In [None]:
X_val

In [None]:
val_pred = model.predict(X_val)
cf = confusion_matrix(Y_val, val_pred, normalize='true')
cf

In [None]:
print('Accuracy =',accuracy_score(Y_val, val_pred)*100)
plt.figure();
sns.heatmap(cf, annot=True);
plt.title('Validation Confusion Matrix');
plt.xlabel('Prediction');
plt.ylabel('Target');

#### Confusion Matrix - Test Data Set

In [None]:
test_pred = model.predict(X_test)
cf = confusion_matrix(Y_test, test_pred, normalize='true')

In [None]:
# Report test accuracy as a percentage (matching the training/validation cells) and
# draw the row-normalised confusion matrix computed in the previous cell.
print('Accuracy =', accuracy_score(Y_test, test_pred)*100)
plt.figure();
sns.heatmap(cf, annot=True);
plt.title('Testing Confusion Matrix');
plt.xlabel('Prediction');
plt.ylabel('Target');

The accuracy of the model on the test and validation set are above 84%, which suggests that our model generalizes well to data it hasn't seen before. 

But how good is 84% accuracy? While this depends on the nature of the problem and on business requirements, a good way to verify whether a model has actually learned something useful is to compare its results to a "random" or "dumb" model.

Let's create two models: one that guesses randomly and another that always returns "No". Both of these models completely ignore the inputs given to them.

In [None]:
def random_model(inputs):
    """Baseline that ignores the feature values and guesses 'No' or 'Yes' at random.

    Returns one label per row of ``inputs`` as a NumPy array.
    """
    n_rows = len(inputs)
    labels = ['No', 'Yes']
    return np.random.choice(labels, size=n_rows)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
print('Accuracy on Random Model on Validation Data Set =', accuracy_score(Y_val, random_model(X_val)))

Therefore, just by predicting randomly accuracy is 50 %

In [None]:
def all_no(inputs):
    """Baseline that ignores the feature values and answers 'No' for every row.

    Returns one label per row of ``inputs`` as a NumPy array.
    """
    n_rows = len(inputs)
    return np.full(n_rows, 'No')
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
print('Accuracy on All_No Model on Test Data Set =', accuracy_score(Y_test, all_no(X_test)))

Therefore, just by predicting all as 'No' gives accuracy as 77 %.<br>
***Note :*** Accuracy for All_No model is 77% and our model has accuracy as 84% for test data set<br>

This is because the test data set is skewed towards 'No' as seen below:

In [None]:
Y_test.value_counts()

> **EXERCISE**: Initialize the `LogisticRegression` model with different arguments and try to achieve a higher accuracy. The arguments used for initializing the model are called hyperparameters (to differentiate them from weights and biases - parameters that are learned by the model during training). You can find the full list of arguments here: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 

> **EXERCISE**: Train a logistic regression model using just the numeric columns from the dataset. Does it perform better or worse than the model trained above?

> **EXERCISE**: Train a logistic regression model using just the categorical columns from the dataset. Does it perform better or worse than the model trained above?

> **EXERCISE**: Train a logistic regression model without feature scaling. Also try a different strategy for missing data imputation. Does it perform better or worse than the model trained above?

# Making Prediction on Single Input

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

***Note :*** The input also contains a NaN value (Sunshine)

In [None]:
new_input = pd.DataFrame([new_input])

In [None]:
new_input

We must now apply the same transformations applied while training the model:

1. Imputation of missing values using the `imputer` created earlier (Average values for numeric columns only)
2. Scaling numerical features using the `scaler` created earlier
3. Encoding categorical features using the `encoder` created earlier

In [None]:
new_input[numeric_cols] = imputer.transform(new_input[numeric_cols])
new_input[numeric_cols] = scaler.transform(new_input[numeric_cols])
new_input[encoded_cols] = encoder.transform(new_input[categorical_cols])

In [None]:
X_inp = new_input[numeric_cols + encoded_cols]

In [None]:
print(model.predict(X_inp))

In [None]:
model.predict_proba(X_inp)

It seems our model is not confident about its prediction

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
def predict_input(single_input):
    """Predict the target for one observation given as a dict of raw feature values.

    The observation is passed through the fitted imputer, scaler and encoder
    (in that order) before being handed to the model.

    Returns a ``(label, probability)`` tuple, where ``probability`` is the model's
    probability for the predicted label.
    """
    features = pd.DataFrame([single_input])
    # Numeric columns: fill missing values first, then rescale.
    for transformer in (imputer, scaler):
        features[numeric_cols] = transformer.transform(features[numeric_cols])
    features[encoded_cols] = encoder.transform(features[categorical_cols])
    model_input = features[numeric_cols + encoded_cols]
    label = model.predict(model_input)[0]
    class_probs = model.predict_proba(model_input)[0]
    # predict_proba columns follow model.classes_, so look up the predicted label's position.
    label_position = list(model.classes_).index(label)
    return label, class_probs[label_position]

In [None]:
predict_input(new_input)

# Saving & Loading Model

We can save the parameters (weights and biases) of our trained model to disk, so that we needn't retrain the model from scratch each time we wish to use it. Along with the model, it's also important to save imputers, scalers, encoders and even column names. Anything that will be required while generating predictions using the model should be saved.

We can use the `joblib` module to save and load Python objects on the disk. 

Let's first create a dictionary containing all the required objects.

In [None]:
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_cols,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

#### Save All

In [None]:
joblib.dump(aussie_rain,'aussie_rain.joblib')

#### Load

In [None]:
aussie_rain = joblib.load('aussie_rain.joblib')

In [None]:
aussie_rain

In [None]:
# Check that the model reloaded from disk can score the test set.
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
accuracy_score(Y_test, aussie_rain['model'].predict(X_test))