# Catboost Classifier

## Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib

## Loading the Processed Dataset

In [2]:
data = pd.read_csv("weather_preprocessed.csv")

In [3]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month,Day
0,27,13.4,22.9,0.6,6.4,0.4,4.0,44.0,5.0,3.0,...,1007.7,1007.1,8.0,7.0,16.9,21.8,False,False,12,1
1,27,7.4,25.1,0.0,7.6,11.2,2.0,44.0,0.0,6.0,...,1010.6,1007.8,7.0,7.0,17.2,24.3,False,False,12,2
2,27,12.9,25.7,0.0,8.0,0.7,5.0,46.0,5.0,6.0,...,1007.6,1008.7,7.0,2.0,21.0,23.2,False,False,12,3
3,27,9.2,28.0,0.0,2.6,7.3,11.0,24.0,13.0,14.0,...,1017.6,1012.8,7.0,7.0,18.1,26.5,False,False,12,4
4,27,17.5,32.3,1.0,2.8,0.1,4.0,41.0,12.0,0.0,...,1010.8,1006.0,7.0,8.0,17.8,29.7,False,False,12,5


In [4]:
data.shape

(145460, 24)

In [5]:
df = data.sample(n = 12000)

In [6]:
df.shape

(12000, 24)

## Dividing the dataset into Independent and Dependent features

In [7]:
X = df.drop(["RainTomorrow"], axis=1)
y = df["RainTomorrow"]

### Train test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, stratify = y, random_state = 0)

In [9]:
y_train

9216      False
81106     False
96434     False
90579      True
30905     False
          ...  
67104     False
121953    False
24163     False
137081    False
23522     False
Name: RainTomorrow, Length: 9600, dtype: bool

## Handling imbalanced dataset

In [10]:
sns.countplot(df["RainTomorrow"])

KeyError: 0

In [11]:
sm=SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_res)))

The number of classes before fit Counter({False: 7510, True: 2090})
The number of classes after fit Counter({False: 7510, True: 7510})


### Catboost Classifier

In [12]:
cat = CatBoostClassifier(iterations=25, eval_metric = "AUC")
cat.fit(X_train_res, y_train_res)

Learning rate set to 0.5
0:	total: 152ms	remaining: 3.66s
1:	total: 163ms	remaining: 1.87s
2:	total: 171ms	remaining: 1.25s
3:	total: 181ms	remaining: 949ms
4:	total: 189ms	remaining: 757ms
5:	total: 200ms	remaining: 634ms
6:	total: 210ms	remaining: 539ms
7:	total: 220ms	remaining: 468ms
8:	total: 233ms	remaining: 414ms
9:	total: 244ms	remaining: 366ms
10:	total: 254ms	remaining: 323ms
11:	total: 264ms	remaining: 285ms
12:	total: 273ms	remaining: 252ms
13:	total: 283ms	remaining: 222ms
14:	total: 292ms	remaining: 195ms
15:	total: 301ms	remaining: 169ms
16:	total: 311ms	remaining: 146ms
17:	total: 320ms	remaining: 124ms
18:	total: 329ms	remaining: 104ms
19:	total: 339ms	remaining: 84.7ms
20:	total: 348ms	remaining: 66.2ms
21:	total: 357ms	remaining: 48.6ms
22:	total: 366ms	remaining: 31.8ms
23:	total: 375ms	remaining: 15.6ms
24:	total: 384ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x201253d85d0>

In [13]:
y_pred = cat.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1680  197]
 [ 225  298]]
0.8241666666666667
              precision    recall  f1-score   support

       False       0.88      0.90      0.89      1877
        True       0.60      0.57      0.59       523

    accuracy                           0.82      2400
   macro avg       0.74      0.73      0.74      2400
weighted avg       0.82      0.82      0.82      2400



In [14]:
metrics.plot_roc_curve(cat, X_test, y_test)
metrics.roc_auc_score(y_test, y_pred, average=None)

AttributeError: module 'sklearn.metrics' has no attribute 'plot_roc_curve'

## Hyperparameter Optimization

In [15]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Various learning rate parameters
learning_rate = [0.05,0.1, 0.2,0.3,0.5,0.6]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
#Subssample parameter values
subsample=[0.7,0.6,0.8]
# Minimum child samples parameters
min_child_samples=[3,4,5,6,7]

In [16]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_samples': min_child_samples}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5, 0.6], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_samples': [3, 4, 5, 6, 7]}


In [17]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
cat = CatBoostClassifier()

In [18]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
cat_random = RandomizedSearchCV(estimator = cat, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=100, n_jobs = -1)

In [19]:
cat_random.fit(X_train_res, y_train_res)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


9 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\verma\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\verma\anaconda3\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\verma\anaconda3\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^


0:	learn: 0.4756649	total: 69.6ms	remaining: 1m 2s
1:	learn: 0.4057465	total: 138ms	remaining: 1m 1s
2:	learn: 0.3560498	total: 200ms	remaining: 59.8s
3:	learn: 0.3292489	total: 279ms	remaining: 1m 2s
4:	learn: 0.3115500	total: 353ms	remaining: 1m 3s
5:	learn: 0.2940391	total: 421ms	remaining: 1m 2s
6:	learn: 0.2777543	total: 488ms	remaining: 1m 2s
7:	learn: 0.2648689	total: 560ms	remaining: 1m 2s
8:	learn: 0.2536866	total: 632ms	remaining: 1m 2s
9:	learn: 0.2429996	total: 704ms	remaining: 1m 2s
10:	learn: 0.2311676	total: 773ms	remaining: 1m 2s
11:	learn: 0.2239431	total: 843ms	remaining: 1m 2s
12:	learn: 0.2118942	total: 911ms	remaining: 1m 2s
13:	learn: 0.2021163	total: 982ms	remaining: 1m 2s
14:	learn: 0.1963927	total: 1.05s	remaining: 1m 2s
15:	learn: 0.1853173	total: 1.13s	remaining: 1m 2s
16:	learn: 0.1747046	total: 1.19s	remaining: 1m 1s
17:	learn: 0.1676771	total: 1.26s	remaining: 1m 1s
18:	learn: 0.1619920	total: 1.33s	remaining: 1m 1s
19:	learn: 0.1553925	total: 1.4s	remaini

In [20]:
cat_random.best_params_

{'subsample': 0.7,
 'n_estimators': 900,
 'min_child_samples': 4,
 'max_depth': 10,
 'learning_rate': 0.6}

In [21]:
best_random_grid=cat_random.best_estimator_

In [22]:
from sklearn.metrics import accuracy_score
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy score {}'.format(accuracy_score(y_test,y_pred)))
print('Classification report {}'.format(classification_report(y_test,y_pred)))

[[1704  173]
 [ 241  282]]
Accuracy score 0.8275
Classification report               precision    recall  f1-score   support

       False       0.88      0.91      0.89      1877
        True       0.62      0.54      0.58       523

    accuracy                           0.83      2400
   macro avg       0.75      0.72      0.73      2400
weighted avg       0.82      0.83      0.82      2400



## Saving the model to reuse it again

In [23]:
joblib.dump(cat_random, "catboost_model.pkl")

['catboost_model.pkl']