# Xgboost Classifier

## Importing the necessary libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib

## Loading the Processed Dataset

In [18]:
data = pd.read_csv("weather_preprocessed.csv")

In [19]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month,Day
0,27,13.4,22.9,0.6,6.4,0.4,4.0,44.0,5.0,3.0,...,1007.7,1007.1,8.0,7.0,16.9,21.8,False,False,12,1
1,27,7.4,25.1,0.0,7.6,11.2,2.0,44.0,0.0,6.0,...,1010.6,1007.8,7.0,7.0,17.2,24.3,False,False,12,2
2,27,12.9,25.7,0.0,8.0,0.7,5.0,46.0,5.0,6.0,...,1007.6,1008.7,7.0,2.0,21.0,23.2,False,False,12,3
3,27,9.2,28.0,0.0,2.6,7.3,11.0,24.0,13.0,14.0,...,1017.6,1012.8,7.0,7.0,18.1,26.5,False,False,12,4
4,27,17.5,32.3,1.0,2.8,0.1,4.0,41.0,12.0,0.0,...,1010.8,1006.0,7.0,8.0,17.8,29.7,False,False,12,5


In [20]:
data.shape

(145460, 24)

In [21]:
df = data.sample(n = 12000)

In [22]:
df.shape

(12000, 24)

## Dividing the dataset into Independent and Dependent features

In [23]:
X = df.drop(["RainTomorrow"], axis=1)
y = df["RainTomorrow"]

### Train test split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, stratify = y, random_state = 0)

In [25]:
y_train

72075     False
116887    False
108521    False
142638     True
9554      False
          ...  
129657     True
9588      False
122880    False
17531     False
144092    False
Name: RainTomorrow, Length: 9600, dtype: bool

## Handling imbalanced dataset

In [26]:
sns.countplot(df["RainTomorrow"])

KeyError: 0

In [27]:
sm=SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_res)))

The number of classes before fit Counter({False: 7578, True: 2022})
The number of classes after fit Counter({False: 7578, True: 7578})


### Xgboost Classifier

In [28]:
xgb = XGBClassifier()
xgb.fit(X_train_res, y_train_res)

In [29]:
y_pred = xgb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1778  116]
 [ 242  264]]
0.8508333333333333
              precision    recall  f1-score   support

       False       0.88      0.94      0.91      1894
        True       0.69      0.52      0.60       506

    accuracy                           0.85      2400
   macro avg       0.79      0.73      0.75      2400
weighted avg       0.84      0.85      0.84      2400



In [30]:
metrics.plot_roc_curve(xgb, X_test, y_test)
metrics.roc_auc_score(y_test, y_pred, average=None)

AttributeError: module 'sklearn.metrics' has no attribute 'plot_roc_curve'

## Hyperparameter Optimization

In [31]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Various learning rate parameters
learning_rate = ['0.05','0.1', '0.2','0.3','0.5','0.6']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
#Subssample parameter values
subsample=[0.7,0.6,0.8]
# Minimum child weight parameters
min_child_weight=[3,4,5,6,7]

In [32]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_weight': min_child_weight}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': ['0.05', '0.1', '0.2', '0.3', '0.5', '0.6'], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_weight': [3, 4, 5, 6, 7]}


In [33]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
classifier = XGBClassifier()

In [34]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
xg_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 25, cv = 3, verbose=2, random_state=100, n_jobs = -1)

In [35]:
xg_random.fit(X_train_res, y_train_res)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [None]:
xg_random.best_params_

{'subsample': 0.6,
 'n_estimators': 400,
 'min_child_weight': 5,
 'max_depth': 25,
 'learning_rate': '0.05'}

In [None]:
xg_random.best_score_

0.8415398985855352

In [36]:
from sklearn.metrics import accuracy_score
y_predict = xg_random.predict(X_test)
print(confusion_matrix(y_test,y_predict))
print('Accuracy score {}'.format(accuracy_score(y_test,y_predict)))
print('Classification report {}'.format(classification_report(y_test,y_predict)))

[[1774  120]
 [ 225  281]]
Accuracy score 0.85625
Classification report               precision    recall  f1-score   support

       False       0.89      0.94      0.91      1894
        True       0.70      0.56      0.62       506

    accuracy                           0.86      2400
   macro avg       0.79      0.75      0.77      2400
weighted avg       0.85      0.86      0.85      2400



## Saving the model to reuse it again

In [38]:
joblib.dump(xgb, "xgboostmodel.pkl")

['xgboostmodel.pkl']