### Importing Modules

Please UPVOTE if you find this Notebook insightful.

Thanks in advance.

In [None]:
# importing modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings(action="ignore")
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,roc_curve, roc_auc_score

### Importing Dataset

In [None]:
# Read the weatherAUS CSV from the Kaggle input directory into `df`
# and preview the first rows.
df = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")
df.head()

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Shape of the data: df.shape is (n_rows, n_columns), so index 0 is the
# row count and index 1 is the column count.
n_rows, n_cols = df.shape
print(f'Number of rows: {n_rows} and Number of columns: {n_cols}')

In [None]:
# Count of missing (NaN) values in each column
df.isna().sum()

In [None]:
# Descriptive statistics of the dataset, transposed so that each
# column of `df` is shown as one row
df.describe().T

In [None]:
# Partition the column names by dtype: object-typed columns are collected
# as categorical, every other column as continuous.
category, contin = [], []

for col_name, col_dtype in df.dtypes.items():
    bucket = category if col_dtype == "object" else contin
    bucket.append(col_name)

print("Categorical:",category)
print("Continuous:", contin)

#### Visualizing **Missing values** in dataset

In [None]:
# missingno matrix plot of `df` (part of the missing-values visualisation section)
msno.matrix(df)

In [None]:
# missingno heatmap of `df` (part of the missing-values visualisation section)
msno.heatmap(df)

In [None]:
# Preview the first rows before the Yes/No columns are encoded
df.head()

**Encoding RainToday and RainTomorrow Columns** as 1/0 using a Yes/No mapping

In [None]:
# Replace the 'Yes'/'No' strings of both rain flags with 1/0.
# Values that are neither 'Yes' nor 'No' (including missing ones) become NaN,
# so re-running this cell on already-encoded data would blank the columns.
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_col in ('RainTomorrow', 'RainToday'):
    df[flag_col] = df[flag_col].map(yes_no_codes)

In [None]:
# Show the distinct values of both encoded columns. A bare expression that is
# not the last line of a cell is evaluated but never displayed, so each
# result is printed explicitly.
for flag_col in ("RainToday", "RainTomorrow"):
    print(flag_col, df[flag_col].unique())

In [None]:
# Create one LabelEncoder instance; it is reused further down to encode
# the remaining string columns.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Display the two rain-flag columns after the Yes/No -> 1/0 mapping
df[["RainToday","RainTomorrow"]]

Percentage of **Null values in dataset**

In [None]:
# Missing values per column as a percentage of the total number of rows
(df.isnull().sum()/len(df))*100

In [None]:
# First rows, transposed so that each column of `df` is shown as one row
df.head().T

In [None]:
# List the column names (used by the imputation cells below)
df.columns

### Handling Null values

In [None]:
# Impute each numeric weather measurement with the mean of its own column.
# The columns are processed in the listed order; each fill is independent.
mean_imputed_cols = [
    "MinTemp", "MaxTemp", "Evaporation", "Sunshine",
    "WindGustSpeed", "Rainfall",
    "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm",
    "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm",
    "Temp9am", "Temp3pm",
]
for num_col in mean_imputed_cols:
    column_mean = df[num_col].mean()
    df[num_col] = df[num_col].fillna(column_mean)

In [None]:
# Impute the binary rain flags and the wind-direction columns with their
# most frequent value (mode). Note that this also fills missing values of
# the RainTomorrow column.
mode_imputed_cols = ['RainToday', 'RainTomorrow', 'WindDir9am', 'WindGustDir', 'WindDir3pm']
for cat_col in mode_imputed_cols:
    most_frequent = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_frequent)

In [None]:
# Preview the first rows after imputation
df.head()

In [None]:
# Re-check the percentage of missing values per column after the fills above
(df.isnull().sum()/len(df))*100

### **Countplot** for RainToday and RainTomorrow:

In [None]:
# Side-by-side class counts for today's and tomorrow's rain flag.
# The figure size is passed to plt.subplots directly: calling plt.figure()
# afterwards would open a second, empty figure and leave this one unsized.
fig, ax = plt.subplots(1, 2, figsize=(8, 5))
# Pass the Series via the `x` keyword; newer seaborn versions interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=df["RainToday"], ax=ax[0])
sns.countplot(x=df["RainTomorrow"], ax=ax[1])
ax[0].set_title("RainToday")
ax[1].set_title("RainTomorrow")
fig.tight_layout()
plt.show()

### Heatmap showing **Correlation** among attributes of data

In [None]:
# Correlation heatmap of the numeric attributes.
# At this point `df` still contains string columns that are only encoded or
# dropped in later cells; recent pandas versions raise when DataFrame.corr()
# meets non-numeric data, so the correlation is restricted to numeric columns.
numeric_corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(18,12))
sns.heatmap(numeric_corr, annot=True)
plt.xticks(rotation=90)
plt.show()

**Inferences from Heatmap**:
* MinTemp and Temp9am highly correlated.
* MinTemp and Temp3pm highly correlated.
* MaxTemp and Temp9am highly correlated.
* MaxTemp and Temp3pm highly correlated.
* Temp3pm and Temp9am highly correlated.
* Humidity9am and Humidity3pm highly correlated.

In [None]:
# Label-encode the remaining string columns. The shared encoder `le` is
# re-fitted on every column in turn, so after this cell it holds the
# classes of the last column processed.
for text_col in ("Location", "WindDir9am", "WindDir3pm", "WindGustDir"):
    df[text_col] = le.fit_transform(df[text_col])

In [None]:
# Preview the first rows after label encoding
df.head()

In [None]:
# Remove the columns flagged as highly correlated in the heatmap inferences,
# and the Date column along with them.
columns_to_drop = ['Temp3pm', 'Temp9am', 'Humidity9am', "Date"]
df = df.drop(columns=columns_to_drop)
df.columns

In [None]:
from collections import Counter
os = SMOTE()
x, y = os.fit_resample(df.iloc[:,:-1], df.iloc[:,-1])
count = Counter(y)
print(count)

### Splitting data into Training and Testing Set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Selection

### XGBoost Model
* XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way.

In [None]:
# Build an XGBoost classifier with the binary logistic objective and fit it
# on the training split.
xgbc = XGBClassifier(objective='binary:logistic')
xgbc.fit(X_train,y_train)

In [None]:
# Evaluate the XGBoost model on the test split: classification report,
# accuracy and confusion matrix.
y_predxgb = xgbc.predict(X_test)
report = classification_report(y_test, y_predxgb)
print(report)
print("Accuracy of the XGBoost Model is:",accuracy_score(y_test,y_predxgb)*100,"%")
cm = confusion_matrix(y_test, y_predxgb)
# fmt="d" prints the integer counts in full; the default ".2g" format
# abbreviates large counts to scientific notation.
sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix for XGBoost Model")
plt.show()

### LightGBM Model
* LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

1. Faster training speed and higher efficiency.
2. Lower memory usage.
3. Better accuracy.
4. Support of parallel and GPU learning.
5. Capable of handling large-scale data.

In [None]:
# Build a LightGBM classifier with 500 estimators and a maximum tree depth of 6,
# then fit it on the training split.
lightgbmc = LGBMClassifier(n_estimators=500,max_depth=6)
lightgbmc.fit(X_train, y_train)

In [None]:
# Evaluate the LightGBM model on the test split: classification report,
# accuracy and confusion matrix.
y_pred = lightgbmc.predict(X_test)
report2 = classification_report(y_test, y_pred)
print(report2)
print("Accuracy of the LightGBM Model is:",accuracy_score(y_test,y_pred)*100,"%")
cm2 = confusion_matrix(y_test, y_pred)
# fmt="d" prints the integer counts in full; the default ".2g" format
# abbreviates large counts to scientific notation.
sns.heatmap(cm2, annot=True, fmt="d", cmap="Blues")
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix for LightGBM Model")
plt.show()

### Random Forest Classifier Model

In [None]:
# Random forest with 300 trees (more trees than the default, aiming for
# better accuracy). random_state pins the bootstrap sampling and feature
# selection so the scores below are reproducible across runs; n_jobs=-1
# trains the trees on all available cores without changing the result.
classifier = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
classifier.fit(X_train, y_train)

In [None]:
# Evaluate the Random Forest model on the test split: classification report,
# accuracy and confusion matrix.
y_pred2 = classifier.predict(X_test)
report3 = classification_report(y_test, y_pred2)
print(report3)
print("Accuracy of the Random Forest Model is:",accuracy_score(y_test,y_pred2)*100,"%")
cm3 = confusion_matrix(y_test, y_pred2)
# fmt="d" prints the integer counts in full; the default ".2g" format
# abbreviates large counts to scientific notation.
sns.heatmap(cm3, annot=True, fmt="d")
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix for Random Forest Classifier Model")
plt.show()

## ROC_AUC Score and Curve
* The receiver operating characteristic (ROC) curve is a plot of the pairs of true positive rates (y-axis) and false positive rates (x-axis) that result from lowering the threshold down from 1, all the way to 0.

In [None]:
# Class probabilities predicted by the Random Forest for the test split.
y_pred_proba = classifier.predict_proba(X_test)
# Keep the second probability column.
# NOTE(review): presumably this is the probability of class 1 (rain tomorrow) - confirm against classifier.classes_.
pos_proba = y_pred_proba[:,1]

In [None]:
# ROC curve of the Random Forest classifier, computed from the probabilities
# in `pos_proba`, plotted against the diagonal of a random classifier.
fpr, tpr, thresholds = roc_curve(y_test, pos_proba)
plt.plot(fpr, tpr, '*-')
plt.plot([0, 1], [0, 1], 'r--')
# The curve belongs to the Random Forest model, not to a logistic regression.
plt.legend(['Random Forest', 'Random chance'])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve for Random Forest Classifier')
plt.show()

In [None]:
# Area under the ROC curve for the Random Forest probabilities in `pos_proba`;
# a single-number summary of how well the classifier ranks the test rows
roc_auc_score(y_test, pos_proba)

* A roc_auc_score closer to 1 indicates that the classifier separates the two classes well, while a score around 0.5 is no better than random guessing.

Author: Purvit Vashishtha