# Importing Packages

In [None]:
%matplotlib inline

import pandas as pd
import missingno as mno
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Dataset analysis

### Reading CSV Dataset

In [None]:
df = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")

### Checking first 5 rows

In [None]:
df.head()

## Checking shape of Dataframe

In [None]:
df.shape

# Variable Identification

## Checking Dataframe Info

In [None]:
df.info()

There are 16 *float64* columns (numerical) and 7 *object* columns (categorical).



## Checking null value count for each column

In [None]:
df.isnull().sum()

There are a lot of null values in every column except Date and Location.

## Visualizing Null values using missingno

In [None]:
mno.matrix(df, figsize = (15, 6))

It can be seen visually that Evaporation, Sunshine, Cloud9am and Cloud3pm have a lot of missing values.

# Data Preprocessing

## Filtering numerical and categorical columns

In [None]:
numerical_df_cols = df.columns[df.dtypes != object]

In [None]:
numerical_df_cols

In [None]:
categorical_df_cols = df.columns[df.dtypes == object]

In [None]:
categorical_df_cols

## Filtering columns with null values into a dataframe

In [None]:
null_cols_df =  df[df.columns[df.isnull().any()]].copy()

In [None]:
null_cols_df

## Data Preprocessing for Numerical Columns

### Checking if Central tendencies can be used for imputing missing values.

In [None]:
def draw_histplot_between_num_features():
    """Draw one histogram per numerical column of ``df`` on a shared grid.

    Reads the notebook-level ``df`` and ``numerical_df_cols``. The grid is
    four plots wide and gets as many rows as needed, so the function works
    for any number of numerical columns (the layout for 16 columns is the
    same 4 x 4, row-by-row arrangement as before).
    """
    plots_per_row = 4
    # Ceiling division; keep at least one row so plt.subplots never gets 0.
    n_rows = max(1, -(-len(numerical_df_cols) // plots_per_row))
    fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(50, 50), squeeze=False)
    flat_axes = axes.ravel()  # row-major order: fill each row left to right
    for ax, col in zip(flat_axes, numerical_df_cols):
        sns.histplot(x=col, data=df, ax=ax)
    # Hide the grid cells that did not receive a plot.
    for ax in flat_axes[len(numerical_df_cols):]:
        ax.set_visible(False)
    plt.show()

In [None]:
draw_histplot_between_num_features()

Though most of the features seem to follow a normal distribution, we cannot use the mean to impute missing values because of outliers. We can verify the outliers using a boxplot.

### Boxplot to visualize outlier

In [None]:
def draw_boxplot_for_num_features():
    """Draw one boxplot per numerical column of ``df`` on a shared grid.

    Reads the notebook-level ``df`` and ``numerical_df_cols``. The grid is
    four plots wide and gets as many rows as needed, so the function works
    for any number of numerical columns (the layout for 16 columns is the
    same 4 x 4, row-by-row arrangement as before).
    """
    plots_per_row = 4
    # Ceiling division; keep at least one row so plt.subplots never gets 0.
    n_rows = max(1, -(-len(numerical_df_cols) // plots_per_row))
    fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(50, 50), squeeze=False)
    flat_axes = axes.ravel()  # row-major order: fill each row left to right
    for ax, col in zip(flat_axes, numerical_df_cols):
        sns.boxplot(x=col, data=df, ax=ax)
    # Hide the grid cells that did not receive a plot.
    for ax in flat_axes[len(numerical_df_cols):]:
        ax.set_visible(False)
    plt.show()

In [None]:
draw_boxplot_for_num_features()

It can be seen that most outliers are in **Rainfall** and **Evaporation**.

### Scatter plot between numerical features

In [None]:
sns.pairplot(df[numerical_df_cols])

It can be seen that most features follow a linear relationship with the other features, so we can try using Linear Regression to impute missing values.

#### Numerical Null value columns

In [None]:
numberical_null_value_cols = null_cols_df.columns[null_cols_df.dtypes != object]

In [None]:
numberical_null_value_cols

## Regression to impute missing value

Our dataset has a lot of missing values, and even the dependent variable will have missing values, so we can't use the columns directly.
* First we have to create a copy of all numerical columns with missing values.
* Then we can fill some random values using Simple Random Imputation. *(Simple Random Imputation)* Check [this](#Filling-NaN-with-Simple-Random-Imputation) section.
* Then we will use the regression to fill the value in actual columns, one by one. *(Deterministic Regression Imputation)*. Check [this](#Applying-Deterministic-Regression-Imputation) section.

**Before using regression, we have to find the highly correlated columns for each feature, so that we use only highly correlated columns to impute the values.**

### Simple Random Imputation

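It is used to fill the missing values of every numerical column with randomly sampled observed values of that column, so that we can fit regression models on complete data. The values are filled into a copy of the dataframe (`df_imp`) under the original column names; no *'_imp'*-suffixed columns are created.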

In [None]:
def simple_random_imputation(df, features=None):
    """Fill missing values by sampling at random from each column's observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    features : iterable of str, optional
        Columns to impute. Defaults to the notebook-level
        ``numberical_null_value_cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every missing entry of the selected columns
        is replaced by a value drawn (with replacement) from the observed
        entries of the same column. Columns without any observed value are
        left untouched because there is nothing to sample from.
    """
    if features is None:
        features = numberical_null_value_cols

    dataset = df.copy()
    for feature in features:
        missing_mask = dataset[feature].isnull()
        number_missing = int(missing_mask.sum())
        if number_missing == 0:
            continue
        observed_values = dataset.loc[~missing_mask, feature].to_numpy()
        if observed_values.size == 0:
            # np.random.choice raises on an empty population; keep the NaNs.
            continue
        dataset.loc[missing_mask, feature] = np.random.choice(
            observed_values, number_missing, replace=True
        )
    return dataset
       

### Deterministic Regression Imputation

It is used to replace each missing value with the exact regression output, without considering the error term. It may result in overfitting because the error term is not considered. **The function works on a copy of the dataframe and writes the imputed values back into the original column (no 'det_'-prefixed column remains in the result).** Later in this notebook we will implement stochastic regression imputation, which overcomes this issue of Deterministic Regression Imputation.

In [None]:
def deterministic_regression_imputation(df, feature, correlated_cols, imputed_df=None):
    """Impute the missing values of ``feature`` with linear-regression predictions.

    A ``LinearRegression`` model is fitted on the randomly imputed data
    (complete, so every row can be used) with ``correlated_cols`` as
    predictors and ``feature`` as target. Its predictions replace only the
    entries of ``feature`` that are missing in ``df``; observed entries are
    kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose ``feature`` column still contains missing values. It is
        not modified.
    feature : str
        Name of the column to impute.
    correlated_cols : iterable of str
        Predictor columns; must be non-empty.
    imputed_df : pandas.DataFrame, optional
        Complete (randomly imputed) frame with the same row order as ``df``.
        Defaults to the notebook-level ``df_imp``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing values of ``feature`` filled in.
    """
    source = df_imp if imputed_df is None else imputed_df
    dataset = df.copy()
    parameters = list(correlated_cols)

    # Positional boolean mask: `source` is expected to have the same row order.
    missing_mask = dataset[feature].isnull().to_numpy()
    if not missing_mask.any():
        # Nothing to impute; also avoids predicting on an empty selection.
        return dataset

    # Linear regression model to estimate the missing data.
    model = LinearRegression()
    model.fit(X=source[parameters], y=source[feature])

    # Predict only for the rows that actually need a value.
    dataset.loc[missing_mask, feature] = model.predict(source.loc[missing_mask, parameters])
    return dataset

        

#### Filling NaN with Simple Random Imputation

In [None]:
df_imp = simple_random_imputation(df)

#### Checking newly added Columns

In [None]:
df_imp.columns

In [None]:
df_imp.isnull().sum()

## Applying Deterministic Regression Imputation

* First we will find the highly positively and negatively correlated columns.
* Then we use the randomly imputed copy of those correlated columns, created in [simple random imputation](#Filling-NaN-with-Simple-Random-Imputation), for Regression Imputation.

In [None]:
def find_correlated_cols(df, cols=None, min_corr=0.2, max_corr=0.9):
    """List, for each target column, the columns moderately to highly correlated with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to compute pairwise correlations on. Non-numeric columns are
        ignored. The frame is not modified.
    cols : iterable of str, optional
        Target columns. Defaults to the notebook-level
        ``numberical_null_value_cols``.
    min_corr, max_corr : float, optional
        Inclusive bounds on the absolute correlation. The upper bound also
        removes the target itself (correlation 1.0) and near-duplicates.

    Returns
    -------
    list of dict
        One ``{'col': <target>, 'correlated_cols': [<names>]}`` entry per
        target, in the order of ``cols``. Columns whose name contains
        ``"_imp"`` or ``"det_"`` are never listed.
    """
    if cols is None:
        cols = numberical_null_value_cols

    # DataFrame.corr() raises on text columns in pandas >= 2.0, so select the
    # numeric/bool columns explicitly (works on older pandas as well).
    corr = df.select_dtypes(include=["number", "bool"]).corr()

    corr_col_arr = []
    for col in cols:
        correlated_cols = []
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for rel_col, rel_col_corr in corr[col].items():
            if "_imp" in rel_col or "det_" in rel_col:
                continue
            # NaN correlations fail this comparison and are skipped.
            if min_corr <= abs(rel_col_corr) <= max_corr:
                correlated_cols.append(rel_col)
        corr_col_arr.append({'col': col, 'correlated_cols': correlated_cols})
    return corr_col_arr
    

In [None]:
corr_col_arr = find_correlated_cols(df_imp)

In [None]:
for v in corr_col_arr:
    print("Column:", v['col'])
    print("Correlated Column:", v['correlated_cols'])
    print("\n")


In [None]:
def draw_scatter_plot(x, correlated_cols, df):
    """Scatter-plot column ``x`` against each of its correlated columns.

    One subplot per entry of ``correlated_cols`` is drawn side by side.
    Nothing is drawn when ``correlated_cols`` is empty.

    Parameters
    ----------
    x : str
        Column shown on the x-axis of every subplot.
    correlated_cols : iterable of str
        Columns shown on the y-axis, one per subplot.
    df : pandas.DataFrame
        Data holding all referenced columns.
    """
    correlated_cols = list(correlated_cols)
    if not correlated_cols:
        # plt.subplots(1, 0) is invalid; there is nothing to plot anyway.
        return

    # squeeze=False always yields a 2-D axes array, so a single correlated
    # column no longer returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(1, len(correlated_cols), figsize=(50, 8), squeeze=False)
    print("\n")
    fig.suptitle(x + ' vs Correlated Columns:' + ', '.join(correlated_cols))
    for ax, rel_col in zip(axes[0], correlated_cols):
        sns.scatterplot(x=x, y=rel_col, data=df, ax=ax)
    plt.show()

In [None]:
def draw_scatter_plot_between_correlated_feature(df, corr_col_arr):
    """Draw the scatter-plot row of every target column in ``corr_col_arr``.

    ``corr_col_arr`` is a list of ``{'col': ..., 'correlated_cols': ...}``
    dictionaries; each one is forwarded to ``draw_scatter_plot`` together
    with a private copy of ``df``.
    """
    plot_data = df.copy()
    for entry in corr_col_arr:
        target, related = entry['col'], entry['correlated_cols']
        draw_scatter_plot(target, related, plot_data)
        

In [None]:
draw_scatter_plot_between_correlated_feature(df_imp, corr_col_arr)

In [None]:
def appy_deterministic_imp(df):
    """Run deterministic regression imputation for every entry of ``corr_col_arr``.

    Works on a copy of ``df``. For each target column listed in the
    notebook-level ``corr_col_arr`` the column and its correlated columns are
    printed; targets with at least one correlated column are imputed via
    ``deterministic_regression_imputation`` and the remaining null count is
    printed, the others are reported and skipped. Returns the imputed copy.
    """
    result = df.copy()
    for entry in corr_col_arr:
        target = entry['col']
        predictors = entry['correlated_cols']
        print("Column:", target)
        print("Correlated Column:", predictors)
        if len(predictors) == 0:
            print("No Correlated Column for", target)
        else:
            result = deterministic_regression_imputation(result, target, predictors)
            print(result[target].isnull().sum())
        print("\n")
    return result

In [None]:
df_det = appy_deterministic_imp(df)

In [None]:
df_det.columns

## Data Preprocessing for Categorical Columns

### Filtering Categorical Null value columns

In [None]:
categorical_null_value_cols = null_cols_df.columns[null_cols_df.dtypes == object]

In [None]:
categorical_null_value_cols

In [None]:
for col in categorical_null_value_cols:
    print("Unique value of column:", col)
    print(df[col].unique())
    print("\n")

### Imputing Categorical values

We are using mode of a particular location to impute missing values.

In [None]:
def impute_categorical_variable(df, cat_features):
    """Fill missing categorical values with the mode of the row's Location.

    ``df`` is modified in place. For every column in ``cat_features`` the
    missing entries are replaced by the first mode of that column within the
    same ``Location`` group (NaN when the group has no mode), and the number
    of nulls left in the column is printed.
    """
    def _first_mode_or_nan(values):
        # First mode of the group, or NaN when the group has no mode at all.
        return next(iter(values.mode()), np.nan)

    for cat_feature in cat_features:
        print("Imputing Column:", cat_feature)
        location_mode = df.groupby('Location')[cat_feature].transform(_first_mode_or_nan)
        df[cat_feature] = df[cat_feature].fillna(location_mode)
        print(df[[cat_feature]].isnull().sum())
        print("\n")

In [None]:
impute_categorical_variable(df_det, categorical_null_value_cols)

It can be seen that there are still some missing values in WindGustDir. This means that some locations have no value at all, so we will use the mode of the complete dataset.

In [None]:
df_det['WindGustDir']=df_det['WindGustDir'].fillna(df_det['WindGustDir'].mode().max())

In [None]:
df_det['WindGustDir'].isnull().sum()

### Encode Categorical Value

First we will encode RainToday and RainTomorrow Column. "Yes" => 1 and "No" => 0

#### Encoding Column: RainToday 

In [None]:
df_det['RainToday'] = df_det['RainToday'].apply(lambda x: 1 if x == 'Yes' else 0)

In [None]:
df_det['RainToday'].value_counts()

#### Encoding Column: RainTomorrow

In [None]:
df_det['RainTomorrow'] = df_det['RainTomorrow'].apply(lambda x: 1 if x == 'Yes' else 0)

In [None]:
df_det['RainTomorrow'].value_counts()

### Encoding using get_dummies

In [None]:
df_det = pd.get_dummies(data=df_det, columns=['WindGustDir','WindDir9am','WindDir3pm'])

In [None]:
df_det.columns

In [None]:
df_det.isnull().sum()

### Scaling using Standard scaler

In [None]:

from sklearn.preprocessing import StandardScaler

def scale_dataset(df):
    """Standard-scale every feature column of ``df``.

    The ``Date``, ``Location``, ``RainToday`` and ``RainTomorrow`` columns
    are left out; all remaining columns are passed through
    ``StandardScaler``. ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The scaled columns, carrying the same column names and the same
        index as ``df`` so that the result can be joined back to ``df`` on
        the index without misaligning rows.
    """
    features = df.drop(columns=['Date', 'Location', 'RainToday', 'RainTomorrow'])
    scaler = StandardScaler()
    return pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,  # keep row labels; a fresh RangeIndex breaks index joins
    )

In [None]:
df_det_scaled = pd.merge(scale_dataset(df_det), df_det[['Date', 'Location', 'RainToday', 'RainTomorrow']],left_index=True, right_index=True )

In [None]:
df_det_scaled.columns

In [None]:
df_det_scaled.head()

# Exploratory Data Analysis

## Highest and Lowest MinTemp by Location

In [None]:
gp_min_temp = df_det.groupby('Location')['MinTemp'].agg(['min', 'max']).reset_index()

In [None]:
gp_min_temp.sort_values('min').head(3)

MountGinini has the lowest MinTemp.

In [None]:
gp_min_temp.sort_values('max', ascending=False).head(3)

Adelaide has the highest MinTemp 

### Highest and Lowest MaxTemp by Location

In [None]:
gp_max_temp = df_det.groupby('Location')['MaxTemp'].agg(['min', 'max']).reset_index()

In [None]:
gp_max_temp.sort_values('min').head(3)

MountGinini has the lowest MaxTemp

In [None]:
gp_max_temp.sort_values('max', ascending=False).head(3)

Woomera has the highest MaxTemp.

In [None]:
# df_det_scaled still contains the text columns Date and Location; restrict
# the correlation matrix to numeric/bool columns so that DataFrame.corr()
# does not fail on pandas >= 2.0.
corr = df_det_scaled.select_dtypes(include=["number", "bool"]).corr().round(2)

In [None]:
corr

In [None]:
plt.figure(figsize=(40,30))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True)

- **Temp9am and Temp3pm are highly correlated with MinTemp and MaxTemp, so we will drop those columns to avoid overfitting.**

In [None]:
def drop_column_to_avoid_overfitting(df):
    """Return a copy of ``df`` without the ``Temp9am`` and ``Temp3pm`` columns.

    ``df`` itself is left untouched; a ``KeyError`` is raised when one of the
    two columns is absent.
    """
    redundant_cols = ['Temp9am', 'Temp3pm']
    return df.drop(columns=redundant_cols)

In [None]:
df_det_scaled_new = drop_column_to_avoid_overfitting(df_det_scaled)

#### Column with Zero Correlation

In [None]:
zreo_corr_col =  corr[corr['RainTomorrow'] == 0].index

In [None]:
zreo_corr_col

#### Dropping column with Zero correlation

In [None]:
df_det_scaled_new.drop(zreo_corr_col, axis = 1, inplace = True)


In [None]:
df_det_scaled_new.columns

In [None]:
total_dp = len(df['RainTomorrow'])
df_det_scaled_new.groupby('RainTomorrow')['RainTomorrow'].count().apply( lambda x: (x/total_dp) * 100  )

The dataset is highly imbalanced: 78% of the rows are No and 22% are Yes.

# Feature Selection

We will use lasso regression for feature selection.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
dataset = df_det_scaled_new.copy()
dataset.drop(['Date', 'Location'], axis = 1, inplace =True)
Y_det = dataset['RainTomorrow']
X_det = dataset.drop('RainTomorrow', axis = 1)

In [None]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state = 0))
feature_sel_model.fit(X_det, Y_det)

In [None]:
feature_sel_model.get_support()

In [None]:
selected_feat = X_det.columns[(feature_sel_model.get_support())]

In [None]:
selected_feat

# Model Selection and Cross Validation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [None]:
def apply_stratified_k_fold_validation(model, x=df_det_scaled_new[selected_feat], y=Y_det):
    """Score ``model`` on every fold of the notebook-level ``skf`` splitter and print a summary.

    For each train/test split produced by ``skf.split(x, y)`` the model is
    fitted on the training rows and scored on the test rows. The list of
    fold scores is printed, followed by its maximum, minimum, mean and
    standard deviation as percentages. Nothing is returned.
    """
    def _report(*parts):
        # Every summary line is preceded by a blank separator.
        print("\n")
        print(*parts)

    fold_scores = []
    for train_rows, test_rows in skf.split(x, y):
        train_rows, test_rows = train_rows.tolist(), test_rows.tolist()
        model.fit(x.iloc[train_rows], y.iloc[train_rows])
        fold_scores.append(model.score(x.iloc[test_rows], y.iloc[test_rows]))

    _report('List of possible accuracy:', fold_scores)
    _report('Maximum Accuracy That can be obtained from this model is:', max(fold_scores)*100, '%')
    _report('Minimum Accuracy:', min(fold_scores)*100, '%')
    _report('Overall Accuracy:', np.mean(fold_scores)*100, '%')
    _report('Standard Deviation is:', np.std(fold_scores)*100, '%')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
apply_stratified_k_fold_validation(model)

## k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)    
apply_stratified_k_fold_validation(model)

## Decision Trees

In [None]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
apply_stratified_k_fold_validation(model)

## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=12,random_state = 42, use_label_encoder =False)
apply_stratified_k_fold_validation(model)

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
apply_stratified_k_fold_validation(model)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
apply_stratified_k_fold_validation(model)

## **Best Models are Logistic Regression, XGBoost and Random forest with Maximum Accuracy of 85%.**