In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read weatherAUS.csv from the Kaggle input directory into `df` and preview the first rows.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df.head()

In [None]:
df.info()

In [None]:
### Checking the missing values

df['RainTomorrow'].isnull().sum()

In [None]:
# Rows without a 'RainTomorrow' label cannot be used for supervised training, so drop them.
df.dropna(subset= ['RainTomorrow'], inplace=True)

# The Yes/No columns are deliberately left as strings here; 'RainTomorrow' is
# encoded to 0/1 later in the notebook, once the data has been split.

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Bar chart of the target classes, each bar annotated with its share of all rows.
class_counts = df['RainTomorrow'].value_counts()
ax = class_counts.plot(kind='bar')
n_rows = len(df)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = np.round((bar_height / n_rows) * 100, 1)
    ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar_height / 2,
        s=f"{share}%",
        ha='center',
        size=20,
        color='white',
    )

> Since the minority class accounts for more than 10% of the samples, we do not treat this as a class-imbalance problem.

In [None]:
## Treating the Date column
# Convert 'Date' with pd.to_datetime so that the `.dt` accessor (day, month) can be used on it.

df['Date'] = pd.to_datetime(df['Date'])

In [None]:
df.head()

In [None]:
df['Date'].dt.month.describe()

In [None]:
def encode_cyclic(df, col, period=None):
    """Add sine/cosine encodings of a cyclic column to ``df``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written to ``df`` in
    place so that the first and last values of the cycle end up next to each
    other on the unit circle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col``; it is modified in place.
    col : str
        Name of the cyclic column (e.g. a month or day number).
    period : float, optional
        Length of the cycle (12 for months, 31 for days of the month, ...).
        When omitted, the largest value present in ``df[col]`` is used, which
        is only correct if the data actually contains the last value of the
        cycle.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    if period is None:
        # Backward-compatible default: infer the cycle length from the data.
        period = df[col].max()
    angle = 2 * np.pi * df[col] / period
    df[col + '_sin'] = np.sin(angle)
    df[col + '_cos'] = np.cos(angle)
    return df

In [None]:
### Encoding Day and Month features
# For each calendar component: extract it from 'Date', then add its sin/cos encoding.
for feature, dt_attr in (('Day', 'day'), ('Month', 'month')):
    df[feature] = getattr(df['Date'].dt, dt_attr)
    df = encode_cyclic(df, feature)

In [None]:
df.Month_cos.describe()

In [None]:
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6;
# pick whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

fig, ax = plt.subplots(1, 3, figsize=(12, 5), constrained_layout=True)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
p1 = sns.lineplot(x=df['Month'], y=df['Day'], estimator=None, ax=ax[0])
p2 = sns.scatterplot(x=df['Month_sin'], y=df['Month_cos'], ax=ax[1])
p3 = sns.scatterplot(x=df['Day_sin'], y=df['Day_cos'], ax=ax[2])

# Each title names the columns actually drawn on that panel
# (the Month and Day titles were previously swapped).
p1.set_title('Original Distribution of Day and Month')
p2.set_title('Distribution of Month_sin and Month_cos')
p3.set_title('Distribution of Day_sin and Day_cos')

plt.suptitle('Distribution of Day and Month', size=20, ha='center')

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the target so both parts keep the same class ratio.
df_train, df_test = train_test_split(
    df,
    train_size=0.75,
    stratify=df['RainTomorrow'],
    random_state=21,
)
for banner, part in (
    ("-------------------------- ###### Train Data ###### -------------------------------------", df_train),
    ("-------------------------- ###### Test Data ###### -------------------------------------", df_test),
):
    print(banner)
    display(part.head())

In [None]:
# Class counts of the target in each split, shown one after the other.
train_counts, test_counts = (
    part['RainTomorrow'].value_counts() for part in (df_train, df_test)
)
display(train_counts)
print('####################################################')
display(test_counts)

In [None]:
# Ratio of 'No' to 'Yes' days in each split. The counts are indexed by class
# label: integer keys on a string-labelled Series only work through a
# deprecated positional fallback in pandas.
print(f"Train Counts Ratio:  {train_counts['No']/train_counts['Yes']}")
print(f"Test Counts Ratio:  {test_counts['No']/test_counts['Yes']}")

The ratios are almost identical because the split was stratified on the *RainTomorrow* column.

In [None]:
## Extracting the Numerical and the Categorical Features
# float64 columns are treated as numerical, object columns as categorical.
numerical = list(df.select_dtypes(include='float64').columns)
categorical = list(df.select_dtypes(include='object').columns)

(categorical, numerical)

### Handling Categorical Columns

In [None]:
def mod_remove(lst, rem_lst):
    """Return a new list with the items of ``lst`` that are not in ``rem_lst``.

    The order of ``lst`` is preserved and neither argument is modified.
    """
    kept = []
    for item in lst:
        if item in rem_lst:
            continue
        kept.append(item)
    return kept

# Remove the target from the categorical predictors, printing the list before and after.
# NOTE(review): 'Date' was converted with pd.to_datetime earlier, so it is presumably no
# longer an object column and listing it here is a harmless no-op — confirm.
print(categorical)
categorical = mod_remove(categorical,['RainTomorrow','Date'])
print(categorical)

In [None]:
df[categorical].head()

In [None]:
### Checking the number of missing values in categorical columns
# Fraction of missing entries per categorical column in the training split.
missing_share = df_train[categorical].isnull().mean()
print(missing_share * 100)
ax = missing_share.sort_values(ascending=False).plot(kind='bar')
ax.set_title('Countplot of missing values', size=18)

Since fewer than 10% of the values are missing, we will impute them with the mode.

In [None]:
df_train['WindDir9am'].mode()[0]

In [None]:
# Fill missing categorical values in both splits with the most frequent
# category of the TRAINING split, so no information from the test split is used.
for col in categorical:
    train_mode = df_train[col].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection: the in-place form acts on a temporary object and does not
    # update the frame under pandas copy-on-write.
    df_train[col] = df_train[col].fillna(train_mode)
    df_test[col] = df_test[col].fillna(train_mode)

In [None]:
df_train[categorical].isnull().sum()

### Handling Missing Values of Numerical Features

In [None]:
numerical

In [None]:
# Share of missing values per column of the training split, highest first.
missing_sorted = df_train.isnull().mean().sort_values(ascending=False)
print(missing_sorted)

plot = missing_sorted.plot(kind='bar')

plot.set_title("Value Count of Missing Values for Numerical Features", size=18)

In [None]:
### Features with highest Missing values
# Rank the numerical columns by their fraction of missing values and keep the first four names.
missing_by_feature = df_train[numerical].isnull().mean().sort_values(ascending=False)
top_four = missing_by_feature.index[:4].to_list()
top_four

In [None]:
# 'seaborn-dark' was renamed to 'seaborn-v0_8-dark' in matplotlib 3.6;
# pick whichever name this installation provides.
plt.style.use('seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark')
# One row per feature: KDE on the left, histogram on the right.
# squeeze=False keeps `ax` two-dimensional even when only one feature is plotted.
plot, ax = plt.subplots(len(top_four), 2, figsize=(12,10), constrained_layout=True, squeeze=False)

for i, col in enumerate(top_four):
    sns.kdeplot(df_train[col], ax=ax[i][0], fill=True, alpha=0.6, linewidth=1.5)
    ax[i][0].set_ylabel(col)
    ax[i][0].set_xlabel(None)
    ax[i][0].grid(False)

    sns.histplot(df_train[col], ax=ax[i][1], fill=True, alpha=0.6, linewidth=1.5)
    ax[i][1].set_ylabel(None)
    ax[i][1].set_xlabel(None)
    ax[i][1].grid(False)

# These columns were selected by their share of missing values, not by outliers.
plot.suptitle("Distribution of Features with more than 30% missing values", size=20)

> We can see that every column except ***Evaporation*** is well distributed, so we can impute ***Evaporation*** with the mean and the others with the median.

In [None]:
# Turn off pandas' chained-assignment check so that writing to columns of the
# df_train / df_test splits does not emit SettingWithCopyWarning.
pd.options.mode.chained_assignment = None

In [None]:
# Fill values are computed once, from the TRAINING split only: the median for
# the sparsest features and the mean for 'Evaporation'.
fill_values = {col: df_train[col].median() for col in top_four if col != 'Evaporation'}
fill_values['Evaporation'] = df_train['Evaporation'].mean()

for dataf in [df_train, df_test]:
    for col, value in fill_values.items():
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under pandas copy-on-write.
        dataf[col] = dataf[col].fillna(value)

In [None]:
df_train[numerical].isnull().mean().sort_values(ascending=False)

In [None]:
# Impute the remaining numeric gaps in BOTH splits with the training-split mean.
# Using each split's own mean would apply different fill values to train and
# test and would let test-set statistics leak into preprocessing.
train_means = df_train[numerical].mean()

for dataf in [df_train, df_test]:
    for col in numerical:
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under pandas copy-on-write.
        dataf[col] = dataf[col].fillna(train_means[col])

In [None]:
df_train[numerical].isnull().mean().sort_values(ascending=False)

All the missing values have now been handled.

### Handling Multicollinearity

In [None]:
### Plotting the Heatmap of the numerical features
# Pairwise correlations of the numerical training columns, annotated as percentages.
corr_matrix = df_train[numerical].corr()
plt.figure(figsize=(16,12))

sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=2,
    cmap='Blues',
    fmt='.0%',
    square=True,
)

The five most correlated pairs of features:

* `Temp9am` and `MinTemp`
* `Temp9am` and `MaxTemp`
* `Temp3pm` and `MaxTemp`
* `Temp3pm` and `Temp9am`
* `Pressure3pm` and `Pressure9am`

In [None]:
numerical

In [None]:
# Remove one member of each highly correlated pair from both splits.
for dataf in (df_train, df_test):
    dataf.drop(columns=['Temp3pm', 'Temp9am', 'Pressure3pm'], inplace=True)

In [None]:
df_train.columns

In [None]:
# Columns to inspect for outliers: `numerical` minus the three columns dropped from the
# splits above and minus the four sin/cos date encodings.
numeric = mod_remove(numerical, ['Temp3pm','Temp9am','Pressure3pm','Day_sin','Day_cos','Month_sin','Month_cos'])
numeric

### Checking The Outliers of Numerical Features

In [None]:
# 'seaborn-dark' was renamed to 'seaborn-v0_8-dark' in matplotlib 3.6;
# pick whichever name this installation provides.
plt.style.use('seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark')

# One row per numeric feature (box plot left, KDE right). The row count follows
# len(numeric) instead of a hard-coded 13, so the grid always matches the list;
# squeeze=False keeps `ax` two-dimensional for any number of rows.
plot, ax = plt.subplots(len(numeric), 2, figsize=(12, 24), constrained_layout=True, squeeze=False)

for i, num in enumerate(numeric):
    sns.boxplot(y=df_train[num], color='orange', ax=ax[i][0])

    sns.kdeplot(x=df_train[num], color='orange', fill=True, ax=ax[i][1])

    ax[i][0].set_ylabel(f'{num}', fontsize=10)
    ax[i][1].set_ylabel(None)
    ax[i][0].set_xlabel(None)
    ax[i][1].set_xlabel(None)
    

In [None]:
# Training rows whose MinTemp exceeds 12, selected in a single .loc lookup.
df_train.loc[df_train['MinTemp'] > 12, 'MinTemp']

In [None]:
def handle_outliers(df, col):
    """Cap the values of ``df[col]`` at the Tukey fences, in place.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from
    the values of ``df[col]`` itself. Values outside the fences are replaced
    by the nearest fence; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col``; the column is overwritten.
    col : str
        Name of the numeric column to cap.
    """
    Q1, Q3 = np.percentile(df[col], [25, 75])
    IQR = Q3 - Q1
    mini = Q1 - 1.5 * IQR
    maxi = Q3 + 1.5 * IQR
    # Write the capped column back with one assignment. The previous
    # `df[col][mask] = value` form is chained indexing, which modifies a
    # temporary object (and leaves `df` untouched) under pandas copy-on-write.
    df[col] = df[col].clip(lower=mini, upper=maxi)

In [None]:
# Cap the outliers of every inspected numeric column of the training split.
# Only df_train is passed here; df_test is not modified by this cell.
for col in numeric:
    handle_outliers(df_train, col)

In [None]:
# 'seaborn-dark' was renamed to 'seaborn-v0_8-dark' in matplotlib 3.6;
# pick whichever name this installation provides.
plt.style.use('seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark')

# Same grid as before the capping step: one row per numeric feature (box plot
# left, KDE right). The row count follows len(numeric) instead of a hard-coded
# 13; squeeze=False keeps `ax` two-dimensional for any number of rows.
plot, ax = plt.subplots(len(numeric), 2, figsize=(12, 24), constrained_layout=True, squeeze=False)

for i, num in enumerate(numeric):
    sns.boxplot(y=df_train[num], color='orange', ax=ax[i][0])

    sns.kdeplot(x=df_train[num], color='orange', fill=True, ax=ax[i][1])

    ax[i][0].set_ylabel(f'{num}', fontsize=10)
    ax[i][1].set_ylabel(None)
    ax[i][0].set_xlabel(None)
    ax[i][1].set_xlabel(None)

### Transforming the features

In [None]:
df_train.head()

In [None]:
# The raw date parts are no longer needed once their sin/cos encodings exist.
for dataf in (df_train, df_test):
    dataf.drop(columns=['Date', 'Day', 'Month'], inplace=True)

In [None]:
df_train.head()

In [None]:
# Encode the target as 0/1 in both splits.
# The mapped column is assigned back: calling replace(inplace=True) on a column
# selection does not update the frame under pandas copy-on-write.
# astype(int) fails loudly if any label other than 'No'/'Yes' is present
# (for example when this cell is run a second time).
for dataf in [df_train, df_test]:
    dataf['RainTomorrow'] = dataf['RainTomorrow'].map({'No': 0, 'Yes': 1}).astype(int)

In [None]:
df_train.head()

In [None]:
# Separate the predictors (X) from the 'RainTomorrow' target (y) for each split.
X_train, y_train = df_train.drop(columns=['RainTomorrow']), df_train['RainTomorrow']
X_test, y_test = df_test.drop(columns=['RainTomorrow']), df_test['RainTomorrow']

In [None]:
# One-hot encode the categorical predictors of each split.
# dtype is pinned to uint8 (the historical pandas default) so the dummy columns
# stay numeric on pandas >= 2.0, where the default became bool.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=np.uint8).reset_index(drop=True)
X_test = pd.get_dummies(X_test, drop_first=True, dtype=np.uint8).reset_index(drop=True)

# The two splits are encoded independently, so a category missing from one of
# them would produce different columns. Align the test matrix to the training
# layout: absent dummies become 0 and columns unseen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_train.head()

In [None]:
X_train.columns.to_list()

In [None]:
# Continuous columns that will be standardised; every other column of X_train
# is treated as a one-hot category column.
numeric = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
    'Humidity3pm', 'Pressure9am', 'Cloud9am', 'Cloud3pm',
    'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos',
]

category = [column for column in X_train.columns if column not in numeric]
category

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training split only, then apply them to both splits.
scalar = StandardScaler()
scalar.fit(X_train[numeric])

X_train_fit = pd.DataFrame(scalar.transform(X_train[numeric]), columns=numeric)
X_test_fit = pd.DataFrame(scalar.transform(X_test[numeric]), columns=numeric)

In [None]:
# Put the scaled numeric columns next to the untouched category columns, for each split.
X_train_final, X_test_final = (
    pd.concat([scaled, encoded[category]], axis=1)
    for scaled, encoded in ((X_train_fit, X_train), (X_test_fit, X_test))
)

In [None]:
X_train_final.head()

In [None]:
X_train_final.dtypes.values

### Training the Model

In [None]:
# Test accuracy of each trained model, keyed by model name.
acc_store = {}

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (liblinear solver) on the prepared training features.
model = LogisticRegression(solver='liblinear')

model.fit(X_train_final,y_train)

In [None]:
y_pred = model.predict(X_test_final)

In [None]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred))

In [None]:
acc_store["Logistic Regression"] = accuracy_score(y_test, y_pred)
acc_store

In [None]:
# Take every Keras name from tensorflow.keras. Mixing `from tensorflow import keras`
# with the standalone `keras` package can load two different Keras
# implementations whose layers and metrics are not guaranteed to be compatible.
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout

# Creating the ANN
model = Sequential()

# layers: two wide hidden layers, a narrow one, and a sigmoid output unit for
# the binary target; dropout follows the second and third hidden layers.
model.add(Dense(units = 1024, kernel_initializer = 'uniform', activation = 'relu', input_dim = X_train_final.shape[1]))
model.add(Dense(units = 512, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(units = 32, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.4))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy', keras.metrics.AUC()])

In [None]:
# Train the network for 20 epochs with mini-batches of 32 samples;
# validation_split=0.3 asks Keras to hold out 30% of the training data for validation.
epochs = 20
batch_size = 32

history = model.fit(X_train_final, y_train, batch_size = batch_size, epochs = epochs,
                    validation_split = 0.3)

In [None]:
# `Sequential.predict_classes` was removed from Keras (TensorFlow 2.6). For a
# single sigmoid output unit the equivalent is to threshold the predicted
# probability at 0.5, which yields the same (n_samples, 1) int32 array.
y_pred = (model.predict(X_test_final) > 0.5).astype("int32")
y_pred

In [None]:
# Record the ANN's test accuracy, passing (y_true, y_pred) in the order the
# scikit-learn signature documents.
acc_store['ANN'] = accuracy_score(y_test, y_pred)
acc_store

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest of 20 trees; random_state=0 makes the run repeatable.
model = RandomForestClassifier(n_estimators=20,random_state=0)

model.fit(X_train_final,y_train)

In [None]:
y_pred = model.predict(X_test_final)
# Store the score under its own key. Assigning the float to `acc_store` itself
# would discard the accuracies already recorded for the other models.
acc_store['Random Forest'] = accuracy_score(y_test, y_pred)
acc_store

> So Random forest gave the best accuracy.