In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# libraries to read the data and perform mathematical operations
import pandas as pd
import numpy as np

# libraries to visualise the data
import matplotlib.pyplot as plt
import seaborn as sns


# **Importing the Data**

In [None]:
data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
data.head(10)

# Checking the data characteristics

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().T

In [None]:
data.describe(include = object).T

# Missing Value Analysis

In [None]:
data.isnull().sum().sort_values(ascending = False).plot(kind = 'bar')

In [None]:
# The target column, RainTomorrow, also has missing values.

In [None]:
data.isnull().sum()/len(data) * 100

In [None]:
# The share of missing values in these columns exceeds 20% (roughly 38-48%):
# Evaporation      43.166506
# Sunshine         48.009762
# Cloud9am         38.421559
# Cloud3pm         40.807095

# Hence, due to data insufficiency, these columns are dropped.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], errors = 'ignore')

# Feature Extraction

In [None]:
# Pull the year and month out of the 'YYYY-MM-DD' date strings; no time-series analysis
# is applied to this data, so the full date is not needed.
date_parts = data['Date'].str.split('-')
data['Year'] = date_parts.str[0].astype(int)
data['Month'] = date_parts.str[1].astype(int)

In [None]:
data.drop(columns = 'Date', inplace = True)

In [None]:
data['Year'].value_counts()

In [None]:
data['Month'].value_counts()

# Checking the target class

In [None]:
data['RainTomorrow'].value_counts().plot(kind = 'bar')

In [None]:
# The target classes are imbalanced.
# The imbalance would be labelled severe only if one class made up less than 10% of the data.

In [None]:
data['RainTomorrow'].value_counts()/len(data) *100

# Segregating the numerical and categorical columns

In [None]:
# segregating the numerical and categorical columns
num_data = data.select_dtypes(include = np.number)
cat_data = data.select_dtypes(include = object)

In [None]:
num_data.head(5)

In [None]:
cat_data.head(5)

# Missing Value Imputation

In [None]:
num_data.isnull().sum()

In [None]:
# DataFrame.hist opens its own figure, so the size has to be passed to it directly;
# a preceding plt.figure(...) only leaves an empty figure behind.
num_data.hist(bins = 100, figsize = (20,12))
plt.tight_layout()

In [None]:
# Most columns are close to normally distributed with very little skewness, so little transformation is required; the exception is the Rainfall column.

In [None]:
plt.figure(figsize = (10,5))
sns.histplot(num_data['Rainfall'], bins = 100)

In [None]:
# StandardScaler suits features whose distribution is close to normal; being a linear transform, it preserves the shape of the distribution.
# MinMaxScaler maps values onto a fixed range using the minimum and maximum, so outliers keep their full effect on the scaled values.
# StandardScaler is preferred for this case.
# To reduce the effect of outliers, RobustScaler (based on the median and IQR) would be needed instead.

In [None]:
sns.boxplot(data = num_data, orient = 'h')

In [None]:
for col in num_data.columns:
    plt.figure(figsize = (10,5))
    sns.boxplot(x = col, data = num_data)
    plt.show()
    


In [None]:
# Classification algorithms that rely on likelihood estimation are affected by outliers.
# Tree-based algorithms are not affected by outliers.
# Here the extreme values are kept rather than removed, on the view that they actually help the classification model.

In [None]:
for col in cat_data:
    plt.figure(figsize = (20,5))
    sns.countplot(x = col, data = cat_data, hue = 'RainTomorrow', palette='rainbow')
    plt.legend(loc = 'best')
    plt.xticks(rotation  = 90 )
    plt.show()

In [None]:
# When latitude-longitude values are given, they can be multiplied to form a new column.
# We know from domain expertise that location is necessary to predict rainfall,
# but the number of locations is too large to label encode these values manually.
# So we use dummy encoding, so that certain algorithms such as ensemble techniques will work well on the encoded locations.

In [None]:
data.groupby(['Location']).describe(include = object)

In [None]:
# The categorical columns in this dataset (e.g. Location) are nominal rather than ordinal, so dummy-variable encoding is a natural choice.
# It won't hurt the performance of a modern ML algorithm,
# because modern ML algorithms are, in general, not strongly affected by the difference between dummy encoding and label encoding.
# Which one to choose comes from domain expertise.

In [None]:
# segregating the target column
# The target is read from `data` (which still holds RainTomorrow, with the same rows as
# cat_data) and cat_data is rebound instead of being mutated in place, so re-running this
# cell no longer fails with KeyError once the column has been removed.
y = data['RainTomorrow'].values
cat_data = cat_data.drop(columns = 'RainTomorrow', errors = 'ignore')

In [None]:
# concatenating the numerical and categorical data to get the feature set
X = pd.concat([num_data, cat_data], axis = 1)

# splitting into training and test sets
# A fixed random_state makes the split, and every score derived from it, reproducible
# across "Restart & Run All".
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Segregating the training and test sets into numerical and categorical data so as to apply different operations on each
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)

X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object)

In [None]:
# Saving the names of the numerical and categorical columns to be added later
num_cols = X_train_num.columns
cat_cols = X_train_cat.columns

# Missing Value imputation

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# imputing the numerical missing values by the median
# The medians are learned from the training split only and then reused on the test split.
imputer = SimpleImputer(strategy='median')

# Wrapping the returned arrays in new DataFrames restores the column names and gives both
# frames a fresh 0..n-1 index (the shuffled index from the split is discarded).
X_train_num = pd.DataFrame(imputer.fit_transform(X_train_num),columns=num_cols)
X_test_num = pd.DataFrame(imputer.transform(X_test_num), columns = num_cols)

In [None]:
# imputing the categorical missing values by the mode
# `imputer` is rebound here; the most frequent values are learned from the training split only.
imputer = SimpleImputer(strategy='most_frequent')

# As with the numerical frames, rebuilding the DataFrames gives a fresh 0..n-1 index,
# which is what lets the numerical and categorical parts be concatenated row by row later.
X_train_cat = pd.DataFrame(imputer.fit_transform(X_train_cat),columns=cat_cols)
X_test_cat  = pd.DataFrame(imputer.transform(X_test_cat), columns = cat_cols)

# Scaling the numerical columns

In [None]:
# Scaling the numerical data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# The scaler is fitted on the training split only; the test split is transformed with the
# training mean and standard deviation so no test information leaks into preprocessing.
X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns = num_cols)
X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns = num_cols)

In [None]:
# dummy encoding the training categorical data
X_train_cat  =pd.get_dummies(X_train_cat, drop_first=True)

In [None]:
# checking the encoded training categorical data
X_train_cat.head(3)

In [None]:
# dummy encoding the test categorical data
# The test split is encoded without drop_first and then aligned to the training dummy
# columns: this drops the same baseline categories as in training, adds an all-zero column
# for any category absent from the test split, and discards categories unseen in training,
# so X_test always has exactly the columns the models were fitted on.
X_test_cat = pd.get_dummies(X_test_cat).reindex(columns = X_train_cat.columns, fill_value = 0)

# Concatenating the categorical and numerical data to get the final train and test sets

In [None]:
# Creating the training dataset
X_train = pd.concat([X_train_num, X_train_cat], axis = 1)

In [None]:
# Creating the test dataset
X_test = pd.concat([X_test_num, X_test_cat], axis = 1)

In [None]:
# Checking the shape of the newly created datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# How can we check that the train-test split divided the data evenly? Answer: compare the mean and std of the training and test sets.

In [None]:
X_train.describe().T[['mean', 'std']]

In [None]:
X_test.describe().T[['mean', 'std']]

In [None]:
y_train = y_train.reshape(-1)

In [None]:
y_train.shape

In [None]:
# Imputing the missing values in target variable by the mode
# NOTE(review): filling missing labels with the most frequent class invents targets for
# rows whose outcome is unknown, in both the training and the test labels; consider
# dropping rows with a missing RainTomorrow before the train/test split instead.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
# SimpleImputer works on 2-D input, hence the reshape to a single column;
# the results stay 2-D (n, 1) until they are flattened again before label encoding.
y_train = imputer.fit_transform(y_train.reshape(-1,1))
y_test = imputer.transform(y_test.reshape(-1,1))

In [None]:
y_train.shape

In [None]:
X_train.shape, y_train.shape

In [None]:
# Label encoding the target variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train.ravel())

y_test = encoder.transform(y_test.ravel())

In [None]:
y_train.shape

# Creating a scorecard to compare different models

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score

In [None]:
scorecard = pd.DataFrame(columns = ['Estimator', 'f1_score', 'Accuracy', 'Precision', 'ROC_AUC_Score'])

In [None]:
def update_score (estimator):
    """Score a fitted classifier on the test set and add the result to the scorecard.

    Reads the notebook-level `X_test` and `y_test`, computes F1, accuracy, precision and
    ROC AUC for `estimator`, and appends one row (labelled with the estimator's class
    name) to the notebook-level `scorecard` DataFrame.

    Parameters
    ----------
    estimator : fitted classifier exposing `predict` and `predict_proba`.

    Returns
    -------
    pandas.DataFrame
        The updated scorecard (also rebound to the global `scorecard`).
    """
    global scorecard
    name = estimator.__class__.__name__
    y_pred = estimator.predict(X_test)
    # ROC AUC is computed from the probability of the positive class (second column)
    y_score = estimator.predict_proba(X_test)[:, 1]

    new_row = pd.DataFrame([{'Estimator': name,
                             'f1_score': f1_score(y_test, y_pred),
                             'Accuracy': accuracy_score(y_test, y_pred),
                             'Precision': precision_score(y_test, y_pred),
                             'ROC_AUC_Score': roc_auc_score(y_test, y_score)}])

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # An empty scorecard is replaced outright so that concat never sees an empty frame
    # (which triggers a FutureWarning about dtype inference in recent pandas).
    if scorecard.empty:
        scorecard = new_row
    else:
        scorecard = pd.concat([scorecard, new_row], ignore_index=True)
    return scorecard

In [None]:
def plot_roc_curve(estimator):
    """Plot the ROC curve of a fitted classifier on the test set.

    Reads the notebook-level `X_test` and `y_test`, draws the chance diagonal and the
    estimator's ROC curve, and shows the estimator's class name and ROC AUC in the legend.

    Parameters
    ----------
    estimator : fitted classifier exposing `predict_proba`.
    """
    # roc_curve is not part of the metrics import earlier in the notebook, so import it here
    from sklearn.metrics import roc_curve

    # probability of the positive class (second column)
    y_score = estimator.predict_proba(X_test)[:, 1]
    # roc_curve returns (fpr, tpr, thresholds), in that order
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc = roc_auc_score(y_test, y_score)

    plt.figure(figsize = (12,7))

    plt.xlim([0,1])
    plt.ylim([0,1])
    # diagonal = performance of a random classifier
    plt.plot([0,1], [0,1], '--')

    # The original label applied % to a string without a format specifier, which raises
    # TypeError; build the legend text with an explicit format string instead.
    plt.plot(fpr, tpr, label = '%s (AUC = %.4f)' % (estimator.__class__.__name__, auc))
    plt.legend()
    plt.xlabel('False positive rate (1-Specificity)', fontsize = 12)
    plt.ylabel('True positive rate (Sensitivity)', fontsize = 12)

# creating the DecisionTree model

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtf = DecisionTreeClassifier()

dtf.fit(X_train, y_train)
scorecard = update_score(dtf)
plot_roc_curve(dtf)

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

In [None]:
X_train_f = X_train.values

In [None]:
# 5-fold cross-validation of the decision tree on the training data.
# Each fold's model is scored on that fold's own held-out part. Previously the held-out
# part was never used: every fold model was scored on the global test set through
# update_score, which also added near-duplicate rows to the model-comparison scorecard.
# A fresh classifier is fitted per fold so the already-scored `dtf` is left untouched.
cv_rows = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_f), start = 1):
    fold_model = DecisionTreeClassifier()
    fold_model.fit(X_train_f[train_idx], y_train[train_idx])

    X_val, y_val = X_train_f[val_idx], y_train[val_idx]
    y_val_pred = fold_model.predict(X_val)
    # probability of the positive class, needed for ROC AUC
    y_val_score = fold_model.predict_proba(X_val)[:, 1]

    cv_rows.append({'Fold': fold,
                    'f1_score': f1_score(y_val, y_val_pred),
                    'Accuracy': accuracy_score(y_val, y_val_pred),
                    'Precision': precision_score(y_val, y_val_pred),
                    'ROC_AUC_Score': roc_auc_score(y_val, y_val_score)})

# one row per fold; built once from the collected records rather than grown inside the loop
cv_scores = pd.DataFrame(cv_rows).set_index('Fold')
cv_scores

# Running different models

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

logreg.fit(X_train, y_train)
# y_pred_proba = logreg.predict_proba(X_test)
update_score(logreg)
plot_roc_curve(logreg)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
