## Table of Contents

1. [Overview of loaded data](#1)  
    1.1. shape, head, info, describe  
    1.2. column composition  
    1.3. Data types of dataset  
    1.4. Categorical column, numerical column  
    1.5. Normal distribution implied by the mean and standard deviation of each numerical column  
    1.6. Check NaNs
1. [Check the class imbalance](#2)
1. [Check the cardinality of categorical columns](#3)
1. [Process date data](#4)
1. [Check and manage outliers from numerical data](#5)
1. [Check correlation of features](#6)
1. [Compose dataset for machine learning model](#7)
1. [Make 5 folds stratified cross validation loop](#8)  
    8.1. Resample (oversampling) on train dataset  
    8.2. Manage null values  
    8.3. Impute missing categorical variables with most frequent value  
    8.4. One hot encoding  
    8.5. Scaling (min max scaling)  
    8.6. Train logistic regression model  
    8.7. Measure performance of prediction on validation dataset  
1. [Train validated model with full train dataset](#9)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Path of the weatherAUS CSV inside the Kaggle input directory.
data = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
# Read the CSV into the DataFrame that the following cells work on.
df = pd.read_csv(filepath_or_buffer=data)

## 1. Overview of loaded data <a class="anchor" id="1"></a>

In [None]:
# First the table dimensions, then the leading rows.
for overview_item in (df.shape, df.head()):
    display(overview_item)
# Column dtypes and non-null counts (info() prints its own report).
df.info()
# Kept as the final expression so the notebook renders the summary statistics.
df.describe()

In [None]:
# Keep the column labels as a plain list and report how many there are.
df_column_names = df.columns.tolist()
print(df_column_names, end='\n\n')
print(len(df_column_names), 'columns')

In [None]:
# Show each distinct dtype that occurs among the columns.
distinct_dtypes = list(df.dtypes.unique())
print('Data types of this dataset :', distinct_dtypes, sep='\n')

In [None]:
# Split the columns by dtype name: "object" columns are treated as categorical
# and "float" columns as numerical; a column of any other dtype lands in
# neither list.
column_dtype_names = [(name, str(dtype)) for name, dtype in df.dtypes.items()]
categorical_type_columns = [
    name for name, dtype_name in column_dtype_names if 'object' in dtype_name
]
numerical_type_columns = [
    name
    for name, dtype_name in column_dtype_names
    if 'object' not in dtype_name and 'float' in dtype_name
]

# Both lists, each followed by an empty line.
print(categorical_type_columns, numerical_type_columns, sep='\n\n', end='\n\n')

total_column_count = len(df.columns)
print('Categorical type columns : {} / {}'.format(len(categorical_type_columns), total_column_count))
print('Numerical type columns : {} / {}'.format(len(numerical_type_columns), total_column_count))

In [None]:
# For each numerical column, draw the normal density defined by that column's
# mean and standard deviation (4 x 4 grid, one panel per column).

# describe() only keeps the numerical columns, so both the statistics and the
# panel titles must be taken from it. Titling the panels from the full column
# list would label them with the wrong columns. It is computed once here
# instead of twice per panel.
summary_statistics=df.describe()
plotted_column_names=list(summary_statistics.columns[:16])  # the grid has 16 panels

fig=plt.figure(figsize=(10,10))
gs=fig.add_gridspec(4,4)
ax=[None for _ in range(16)]

for int_temp,one_column_name in enumerate(plotted_column_names):
    i,j=divmod(int_temp,4)  # row and column of the panel in the grid
    ax[int_temp]=fig.add_subplot(gs[i,j])
    mean=summary_statistics.loc['mean',one_column_name]
    std=summary_statistics.loc['std',one_column_name]
    # The random sample is only used to obtain bin edges as x positions;
    # the plotted curve is the analytic normal density at those positions.
    s=np.random.normal(mean,std,1000)
    count,bins=np.histogram(s,30,density=True)
    ax[int_temp].plot(bins, 1/(std * np.sqrt(2 * np.pi)) * np.exp( - (bins - mean)**2 / (2 * std**2) ),linewidth=2, color='r')
    ax[int_temp].set_title('{}'.format(one_column_name))

plt.tight_layout()
plt.show()

In [None]:
# Per-column count of missing values, with the same count as a percentage of
# all rows (rounded to one decimal).
number_of_rows = len(df)
number_of_nan_in_column = df.isnull().sum(axis=0)
nan_summary = pd.DataFrame({
    'Number of NaN': number_of_nan_in_column,
    'Number of NaN in %': (number_of_nan_in_column / number_of_rows * 100).round(1),
})
print(nan_summary)

In [None]:
# Keep only the rows whose target value "RainTomorrow" is present.
df = df[df['RainTomorrow'].notna()]

## 2. Check the class imbalance <a class="anchor" id="2"></a>

In [None]:
# Bar chart of how many rows fall into each "RainTomorrow" class.
df['RainTomorrow'].value_counts().plot.bar()

## 3. Check the cardinality of categorical columns <a class="anchor" id="3"></a>

In [None]:
print("Categorical column cardinality :")
for column_name in categorical_type_columns:
    # dropna=False counts NaN as a label of its own, as len(unique()) does.
    label_count = df[column_name].nunique(dropna=False)
    print('{} : {} labels'.format(column_name, label_count))

## 4. Process date data <a class="anchor" id="4"></a>

In [None]:
# Convert the "Date" strings to datetime values. Passing the whole column to
# pd.to_datetime parses it in one vectorised call instead of once per row.
df['Date']=pd.to_datetime(df['Date'])

In [None]:
# Add "Year", "Month" and "Day" columns taken from the matching attributes of
# the datetime accessor of "Date".
target_attributes = ["year", "month", "day"]
date_accessor = df['Date'].dt
for one_target_attribute in target_attributes:
    # "year" -> "Year", "month" -> "Month", "day" -> "Day"
    df[one_target_attribute.capitalize()] = getattr(date_accessor, one_target_attribute)

In [None]:
# "Date" has been split into Year/Month/Day above, so the column itself is removed.
df = df.drop('Date', axis=1)

## 6. Check correlation of features <a class="anchor" id="6"></a>

In [None]:
# Heatmap of the pairwise correlations between the numerical columns.
# df still holds text columns at this point, and DataFrame.corr() raises on
# them in recent pandas versions, so the frame is restricted to numerical
# columns explicitly before the correlation is computed.
numerical_correlation=df.select_dtypes(include=[np.number]).corr()

plt.subplots(figsize=(20, 15))
ax = sns.heatmap(numerical_correlation, square=True, annot=True, fmt='.2f', linecolor='white')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30)
plt.show()

In [None]:
# For every numerical column, print the other columns whose correlation with
# it is above 0.7.

# Compute the matrix once, on numerical columns only: DataFrame.corr() raises
# on the text columns in recent pandas versions, and recomputing it for every
# column is wasted work.
correlation_matrix=df.select_dtypes(include=[np.number]).corr()
correlation_df_columns=correlation_matrix.columns
for one_column_name in correlation_df_columns:
    one_column_data=correlation_matrix[one_column_name]
    high_positively_correlated=one_column_data[one_column_data>0.7]
    # Remove the column's correlation with itself; errors='ignore' covers the
    # case where that entry is NaN and was therefore already filtered out.
    high_positively_correlated=high_positively_correlated.drop(one_column_name,errors='ignore')
    if high_positively_correlated.empty:
        continue
    print(one_column_name)
    print(high_positively_correlated)
    print()


## 7. Compose dataset for machine learning model <a class="anchor" id="7"></a>

In [None]:
# Separate the target column from the feature columns.
target_column_name = 'RainTomorrow'
X = df.drop(target_column_name, axis=1)
y = df[target_column_name]

## 8. Make 5 folds stratified cross validation loop <a class="anchor" id="8"></a>

In [None]:
# Column lists used for preprocessing.
# Numerical: the float columns plus the date parts that were added to df.
numerical_type_columns_temp=list(numerical_type_columns)+["Year","Month","Day"]
# Categorical: every object column except "Date", "Location" and the target,
# so those three are not imputed or one-hot encoded below.
excluded_categorical_columns=("Date","Location","RainTomorrow")
categorical_type_columns_temp=[
    one_column for one_column in categorical_type_columns
    if one_column not in excluded_categorical_columns
]

# 5-fold stratified cross validation. All preprocessing statistics (median,
# most frequent value, encoder categories, scaler range) are learned on the
# training part of the fold and then applied to the validation part.
stratified_k_fold_object=StratifiedKFold(n_splits=5)
for fold_index,(train_indices,validation_indices) in enumerate(stratified_k_fold_object.split(X,y)):
    print("=========={} fold==========".format(fold_index+1))

    # ================================================================================
    X_train=X.iloc[train_indices]
    y_train=y.iloc[train_indices]
    # Explicit copy: the validation frame is modified by the imputation below.
    X_validation=X.iloc[validation_indices].copy()
    y_validation=y.iloc[validation_indices]

    X_columns=X_train.columns

    # ================================================================================
    # 8.1. Resample (oversampling) on train dataset

    merged_train=pd.concat([X_train,y_train],axis=1)

    no_df = merged_train[merged_train.RainTomorrow == "No"]
    yes_df = merged_train[merged_train.RainTomorrow == "Yes"]

    # Draw "Yes" rows with replacement until there are as many as "No" rows.
    yes_oversampled_df = resample(yes_df, replace=True, n_samples=len(no_df), random_state=123)

    # Concat oversampled one and existing one
    oversampled_train_df = pd.concat([no_df, yes_oversampled_df])

    y_train=oversampled_train_df['RainTomorrow']
    X_train=oversampled_train_df.drop(columns=['RainTomorrow'])

    # ================================================================================
    # 8.2. Manage null values

    for one_numerical_column_name in numerical_type_columns_temp:
        median_value=X_train[one_numerical_column_name].median()

        # Fill the train median into both parts. The result is assigned back
        # instead of calling fillna(inplace=True) on a column selection, which
        # does not reliably modify the parent frame.
        X_train[one_numerical_column_name]=X_train[one_numerical_column_name].fillna(median_value)
        X_validation[one_numerical_column_name]=X_validation[one_numerical_column_name].fillna(median_value)

    # ================================================================================
    # 8.3. Impute missing categorical variables with most frequent value

    for one_column in categorical_type_columns_temp:
        # The most frequent value is taken from the train part only and is
        # reused for the validation part, in the same way as the median above.
        most_frequent_value=X_train[one_column].mode()[0]
        X_train[one_column]=X_train[one_column].fillna(most_frequent_value)
        X_validation[one_column]=X_validation[one_column].fillna(most_frequent_value)

    # ================================================================================
    # 8.4. One hot encoding

    # Create model
    one_hot_encoder_object=OneHotEncoder()

    # Fit
    one_hot_encoder_object.fit(X_train[categorical_type_columns_temp])

    # Transform
    X_train_one_hot_part=one_hot_encoder_object.transform(X_train[categorical_type_columns_temp]).toarray()
    X_validation_one_hot_part=one_hot_encoder_object.transform(X_validation[categorical_type_columns_temp]).toarray()

    # ================================================================================
    # 8.5. Scaling (min max scaling)

    # NOTE(review): the scaler uses numerical_type_columns, so the
    # Year/Month/Day columns imputed above are not part of the model input.
    # Confirm whether numerical_type_columns_temp was intended here.

    # Create model
    minmax_scaler=MinMaxScaler()

    # Fit
    minmax_scaler.fit(X_train[numerical_type_columns])

    # Transform
    X_train_numerical_scaled=minmax_scaler.transform(X_train[numerical_type_columns])
    X_validation_numerical_scaled=minmax_scaler.transform(X_validation[numerical_type_columns])

    # Concatenate
    X_train_np=np.concatenate((X_train_one_hot_part, X_train_numerical_scaled), axis=1)
    X_validation_np=np.concatenate((X_validation_one_hot_part, X_validation_numerical_scaled), axis=1)

    # ================================================================================
    # 8.6. Train

    # Create model
    logreg = LogisticRegression(solver='liblinear', random_state=0)

    # Fit
    logreg.fit(X_train_np, y_train)

    # ================================================================================
    # 8.7. Measure performance of prediction on validation dataset

    y_pred_validation = logreg.predict(X_validation_np)
    print('Model accuracy score: {0:0.4f} with validation dataset'. format(accuracy_score(y_validation, y_pred_validation)))

    y_pred_train=logreg.predict(X_train_np)
    print('Model accuracy score: {0:0.4f} with train dataset'. format(accuracy_score(y_train, y_pred_train)))

    # Compute the confusion matrix once. Its rows are the true classes and its
    # columns the predicted classes, in sorted label order ("No", then "Yes").
    validation_confusion_matrix=confusion_matrix(y_validation,y_pred_validation)
    print('tn fp\nfn tp\n',validation_confusion_matrix)
    tn,fp,fn,tp=validation_confusion_matrix.ravel()
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    print('accuracy : ',(tp+tn)/(tp+fp+tn+fn))
    print('recall (sensitivity,true positive rate) : ',recall)
    print('specificity : ',tn/(tn+fp))
    print('precision : ',precision)
    # F1 is the harmonic mean of precision and recall: 2*P*R/(P+R).
    print('f1-score : ',2*precision*recall/(precision+recall))

    print()

## 9. Train model with full train dataset <a class="anchor" id="9"></a>

In [None]:
# ================================================================================
# Resample train dataset (oversampling)

merged_train=pd.concat([X,y],axis=1)

no_df = merged_train[merged_train.RainTomorrow == "No"]
yes_df = merged_train[merged_train.RainTomorrow == "Yes"]

# Draw "Yes" rows with replacement until there are as many as "No" rows.
yes_oversampled_df = resample(yes_df, replace=True, n_samples=len(no_df), random_state=123)

# Concat oversampled one and existing one
oversampled_train_df = pd.concat([no_df, yes_oversampled_df])

y_train=oversampled_train_df['RainTomorrow']
X_train=oversampled_train_df.drop(columns=['RainTomorrow'])

# ================================================================================
# Manage null values

for one_numerical_column_name in numerical_type_columns_temp:
    median_value=X_train[one_numerical_column_name].median()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which does not reliably modify the parent frame.
    X_train[one_numerical_column_name]=X_train[one_numerical_column_name].fillna(median_value)

# ================================================================================
# Impute missing categorical variables with most frequent value

for one_column in categorical_type_columns_temp:
    most_frequent_value=X_train[one_column].mode()[0]
    X_train[one_column]=X_train[one_column].fillna(most_frequent_value)

# ================================================================================
# One hot encoding

# Create model
one_hot_encoder_object=OneHotEncoder()

# Fit
one_hot_encoder_object.fit(X_train[categorical_type_columns_temp])

# Transform
X_train_one_hot_part=one_hot_encoder_object.transform(X_train[categorical_type_columns_temp]).toarray()

# ================================================================================
# Scaling (min max scaling)

# NOTE(review): the scaler uses numerical_type_columns, so the Year/Month/Day
# columns imputed above are not part of the model input. Confirm whether
# numerical_type_columns_temp was intended here.

# Create model
minmax_scaler=MinMaxScaler()

# Fit
minmax_scaler.fit(X_train[numerical_type_columns])

# Transform
X_train_numerical_scaled=minmax_scaler.transform(X_train[numerical_type_columns])

# Concatenate
X_train_np=np.concatenate((X_train_one_hot_part, X_train_numerical_scaled), axis=1)

# ================================================================================
# Train

# Create model
logreg = LogisticRegression(solver='liblinear', random_state=0)

# Fit
logreg.fit(X_train_np, y_train)

# ================================================================================
# Measure performance of prediction on the (oversampled) train dataset.
# These figures are computed on the same rows the model was fitted on.

y_pred_train=logreg.predict(X_train_np)
print('Model accuracy score: {0:0.4f} with train dataset'. format(accuracy_score(y_train, y_pred_train)))

# Compute the confusion matrix once. Its rows are the true classes and its
# columns the predicted classes, in sorted label order ("No", then "Yes").
train_confusion_matrix=confusion_matrix(y_train, y_pred_train)
print('tn fp\nfn tp\n',train_confusion_matrix)
tn,fp,fn,tp=train_confusion_matrix.ravel()
precision=tp/(tp+fp)
recall=tp/(tp+fn)
print('accuracy : ',(tp+tn)/(tp+fp+tn+fn))
print('recall (sensitivity,true positive rate) : ',recall)
print('specificity : ',tn/(tn+fp))
print('precision : ',precision)
# F1 is the harmonic mean of precision and recall: 2*P*R/(P+R).
print('f1-score : ',2*precision*recall/(precision+recall))