# Importing Libraries and Datasets

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection  import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Read the weather CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

# Checking Statistics 

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of the frame.
df.describe()

# Visualising/Feature Handling

In [None]:
# Encode the target labels: "Yes" -> "1", "No" -> "0". The values are still
# strings here; the column is cast to int after rows with a missing target
# have been dropped.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selection: the in-place form mutates an
# intermediate object and leaves `df` unchanged under pandas copy-on-write.
df["RainTomorrow"] = df["RainTomorrow"].replace({"Yes": "1", "No": "0"})

In [None]:
# Discard every row whose dependent variable (RainTomorrow) is missing.
df = df.dropna(subset=["RainTomorrow"])

In [None]:
# Cast the target column to an integer dtype.
df = df.assign(RainTomorrow=df["RainTomorrow"].astype(int))

In [None]:
# Display the target column after encoding and casting.
df["RainTomorrow"]

### Visualizing the correlations between numerical features of the data.



In [None]:
# Annotated correlation heatmap of the numeric columns.
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(10, 8))
# numeric_only=True is required because the frame still contains string
# columns; pandas >= 2.0 raises on them instead of silently skipping them.
sns.heatmap(
    df.corr(numeric_only=True),
    robust=True,
    fmt='.2f',
    linewidths=1.3,
    linecolor='gold',
    annot=True,
    ax=ax,  # draw on the axes created above so figsize is honoured explicitly
);

In [None]:
# Remove the 9am / 3pm temperature readings from the frame.
col = ['Temp9am', 'Temp3pm']
df = df.drop(columns=col)

### Getting list of numeric and categorical column

In [None]:
# Group column names by dtype: float64 columns are treated as numeric,
# object columns as categorical.
float_frame = df.select_dtypes(include=['float64'])
object_frame = df.select_dtypes(include=['object'])
numeric_col, categorical_col = float_frame.columns, object_frame.columns

In [None]:
# Show the names of the float64 columns.
numeric_col

In [None]:
# Show the names of the object-dtype columns.
categorical_col

### Visualizing the countplot of Categorical Variable

In [None]:
# One count plot per categorical weather feature, stacked vertically.
count_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
fig, ax = plt.subplots(len(count_cols), 1, figsize=(12, 16))

for axis, name in zip(ax, count_cols):
    sns.countplot(x=df[name], ax=axis)

# plt.show() rather than fig.show(): Figure.show() is meant for GUI backends
# and emits a "non-GUI backend" warning under the notebook's inline backend.
plt.show()

In [None]:
def plot(df, x):
    """Show histograms of column `x`, one facet per RainTomorrow value.

    Bar heights use seaborn's ``stat='probability'`` normalisation.
    """
    grid = sns.FacetGrid(df, col="RainTomorrow")
    grid.map(sns.histplot, x, stat='probability')
    plt.show()


# Draw the faceted histogram for every numeric feature.
for feature in numeric_col:
    plot(df, feature)

### Checking outliers through box-plot

In [None]:
# Summary statistics of the numeric variables, rounded to 2 decimal places.
# The precision has to be round()'s second argument; handed to print() instead,
# round() falls back to 0 decimals and a literal "2" is printed after the table.
print(round(df[numeric_col].describe(), 2))

On closer inspection, we can see that the Rainfall, Evaporation, WindSpeed9am and WindSpeed3pm columns may contain outliers.

We can draw boxplots to visualise outliers in the above variables.

In [None]:
def plot(df, col):
    """Show a box plot of column `col`, grouped by the RainTomorrow value."""
    axes = sns.boxplot(data=df, x='RainTomorrow', y=col)
    axes.set_title("Box plot for {}".format(col))
    plt.show()


# One box plot per numeric feature.
for feature in numeric_col:
    plot(df, feature)

We can use the interquartile range (IQR) to find outliers in the features we think contain outliers.

In [None]:
# Outlier fences for Rainfall: 3 * IQR below the first quartile and above the third.
q1, q3 = df.Rainfall.quantile(0.25), df.Rainfall.quantile(0.75)
IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print(
    'Rainfall outliers are values < {lowerboundary} or > {upperboundary}'.format(
        lowerboundary=Lower_fence, upperboundary=Upper_fence
    )
)

In [None]:
# Outlier fences for Evaporation: 3 * IQR below the first quartile and above the third.
q1, q3 = df.Evaporation.quantile(0.25), df.Evaporation.quantile(0.75)
IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print(
    'Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(
        lowerboundary=Lower_fence, upperboundary=Upper_fence
    )
)

In [None]:
# Outlier fences for WindSpeed9am: 3 * IQR below the first quartile and above the third.
q1, q3 = df.WindSpeed9am.quantile(0.25), df.WindSpeed9am.quantile(0.75)
IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print(
    'WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(
        lowerboundary=Lower_fence, upperboundary=Upper_fence
    )
)

In [None]:
# Outlier fences for WindSpeed3pm: 3 * IQR below the first quartile and above the third.
q1, q3 = df.WindSpeed3pm.quantile(0.25), df.WindSpeed3pm.quantile(0.75)
IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print(
    'WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(
        lowerboundary=Lower_fence, upperboundary=Upper_fence
    )
)

As we can see, there are outliers in many of the features, but we will not remove them, since logistic regression is comparatively robust to outliers.

### Check the distribution of variables

In [None]:
def plot(df, col):
    """Plot a 10-bin histogram of column `col` of `df` on its own figure.

    Parameters
    ----------
    df : DataFrame holding the column.
    col : name of the column to plot.
    """
    plt.figure(figsize=(8, 4))
    axes = df[col].hist(bins=10)
    axes.set_xlabel(col)
    # Bar heights are the number of rows per bin, not RainTomorrow values.
    axes.set_ylabel('Frequency')
    # Render each figure as it is produced, like the other plot helpers in this notebook.
    plt.show()


# One histogram per numeric feature.
for feature in numeric_col:
    plot(df, feature)

### Feature Engineering of Date Variable

In [None]:
# Convert the Date column from strings to datetime values.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
# Split the timestamp into year, month and day columns (appended in that order).
df = df.assign(
    Year=df['Date'].dt.year,
    Month=df['Date'].dt.month,
    Day=df['Date'].dt.day,
)

In [None]:
# The original Date column is no longer needed once its parts are extracted.
df = df.drop(columns='Date')

In [None]:
# Preview the frame after the date feature engineering.
df.head()

### Handling Missing Values

#### Numerical Columns

In [None]:
# Count the missing entries in each numerical column.
numeric_frame = df[numeric_col]
numeric_frame.isna().sum()


We can use linear interpolation to handle the missing values in the numerical variables. Linear interpolation is an imputation technique that assumes a linear relationship between data points and uses the non-missing values from adjacent data points to compute a value for each missing data point.

In [None]:
# Linearly interpolate every numeric column in one vectorised call.
# With its default forward direction, interpolate() leaves NaNs that precede a
# column's first valid observation untouched; bfill() closes those leading gaps
# with the first observed value so no numeric NaN reaches the model.
# NOTE(review): interpolation follows row order across the whole frame; if
# adjacent rows belong to different locations, values bleed between them --
# confirm this is acceptable.
df[numeric_col] = df[numeric_col].interpolate(method='linear').bfill()

In [None]:
# Interpolate these three columns, then forward-/back-fill whatever is still
# missing at either end of the series.
for edge_col in ('Evaporation', 'Sunshine', 'Cloud3pm'):
    df[edge_col] = df[edge_col].interpolate(method='linear', axis=0).ffill().bfill()

#### Categorical Columns

In [None]:
# 'Date' was captured in categorical_col before the column was parsed and
# dropped from the frame, so take it out of the list. Filtering by name (rather
# than slicing off position 0) keeps this cell safe to re-run and independent
# of column order.
categorical_col = categorical_col[categorical_col != 'Date']

In [None]:
# Count the missing entries in each categorical column.
categorical_frame = df[categorical_col]
categorical_frame.isna().sum()

In [None]:
# Report the cardinality of each categorical column; a missing value counts
# as one distinct entry.
for feature in categorical_col:
    n_unique = df[feature].nunique(dropna=False)
    print("Number of unique values for {} : {}".format(feature, n_unique))

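Applying one-hot encoding with dummy variables also takes care of the missing values in the categorical columns: `pd.get_dummies` ignores NaN by default, so a row with a missing category simply gets 0 in every dummy column of that feature.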

In [None]:
# Columns that will be expanded into one-hot (dummy) columns.
one_hot_col = categorical_col

In [None]:
# Replace each column listed in one_hot_col with its dummy (indicator) columns.
df = pd.get_dummies(df, columns = one_hot_col)

In [None]:
# Inspect the column names after one-hot encoding.
df.columns

# Model Building

Building the dataset

In [None]:
# Separate the target from the predictors.
y = df['RainTomorrow']
X = df.drop(columns='RainTomorrow')

In [None]:
# Standardise the feature matrix with a StandardScaler fitted on all of X.
# NOTE(review): the scaler is fitted before the train/validation/test split
# that follows, so held-out rows contribute to the scaling statistics --
# consider fitting it on the training split only.
X = StandardScaler().fit_transform(X)

## Splitting dataset

In [None]:
# First hold out 30% as the test set, then hold out 30% of the remainder as the
# validation (cv) set; both splits are stratified on the target.
# A fixed random_state makes the partitions -- and with them the selected C and
# every reported metric -- reproducible across runs.
X_1, X_test, y_1, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_1, y_1, test_size=0.3, stratify=y_1, random_state=42
)

In [None]:
# Candidate inverse regularisation strengths (C = 1/lambda).
C = [10**-3, 10**-2, 10**0, 10**2, 10**3, 10**4]
auc_train = []
auc_cv = []
for c in C:
    lr = LogisticRegression(penalty='l2', C=c)
    lr.fit(X_train, y_train)
    # AUC is computed from the predicted probability of the positive class.
    probcv = lr.predict_proba(X_cv)[:, 1]
    auc_cv.append(roc_auc_score(y_cv, probcv))
    probtr = lr.predict_proba(X_train)[:, 1]
    auc_train.append(roc_auc_score(y_train, probtr))

# Keep the C with the highest validation AUC (the first one wins on ties).
optimal_c = C[auc_cv.index(max(auc_cv))]

# Plot against ln(C) using a separate list, so C keeps the real candidate
# values and the cell gives the same result when re-run.
log_C = [math.log(x) for x in C]
fig = plt.figure()
ax = plt.subplot(111)
ax.plot(log_C, auc_train, label='AUC train')
ax.plot(log_C, auc_cv, label='AUC CV')
plt.title('AUC vs hyperparameter')
plt.xlabel('log(C), C = 1/lambda')
plt.ylabel('AUC')
ax.legend()
plt.show()
# True division: floor division (1 // C) reports 0 for any C above 1 and
# truncates the fractional part otherwise.
print('optimal lambda for which auc is maximum : ', 1 / optimal_c)

In [None]:
# Refit with the selected C, then compare ROC curves on the test and training sets.
lr = LogisticRegression(C=optimal_c, penalty='l2')
lr.fit(X_train, y_train)

# Positive-class probabilities for both sets.
predi = lr.predict_proba(X_test)[:, 1]
pred = lr.predict_proba(X_train)[:, 1]

fpr1, tpr1, thresholds1 = roc_curve(y_test, predi)
fpr2, tpr2, thresholds2 = roc_curve(y_train, pred)

test_label = 'Test ROC ,auc=' + str(roc_auc_score(y_test, predi))
train_label = 'Train ROC ,auc=' + str(roc_auc_score(y_train, pred))

fig, ax = plt.subplots()
ax.plot(fpr1, tpr1, label=test_label)
ax.plot(fpr2, tpr2, label=train_label)
ax.set_title('ROC')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend()
plt.show()

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Fit the final model and predict hard class labels for the test set.
lr = LogisticRegression(penalty='l2', C=optimal_c)
lr.fit(X_train, y_train)
predic = lr.predict(X_test)

# Rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, predic)
class_label = ["negative", "positive"]
# Stored under its own name: rebinding `df` here would overwrite the dataset
# frame and break any earlier cell that is re-run afterwards.
conf_df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)
sns.heatmap(conf_df, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

## Classification Report

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's place and the support
# column counts predictions instead of true labels.
cr = classification_report(y_test, predic)
print(cr)