# Decision Tree Classifier

## Importing the necessary libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib
from scipy.stats import randint

## Loading the Processed Dataset

In [11]:
data = pd.read_csv("weather_preprocessed.csv")

In [12]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month,Day
0,27,13.4,22.9,0.6,6.4,0.4,4.0,44.0,5.0,3.0,...,1007.7,1007.1,8.0,7.0,16.9,21.8,False,False,12,1
1,27,7.4,25.1,0.0,7.6,11.2,2.0,44.0,0.0,6.0,...,1010.6,1007.8,7.0,7.0,17.2,24.3,False,False,12,2
2,27,12.9,25.7,0.0,8.0,0.7,5.0,46.0,5.0,6.0,...,1007.6,1008.7,7.0,2.0,21.0,23.2,False,False,12,3
3,27,9.2,28.0,0.0,2.6,7.3,11.0,24.0,13.0,14.0,...,1017.6,1012.8,7.0,7.0,18.1,26.5,False,False,12,4
4,27,17.5,32.3,1.0,2.8,0.1,4.0,41.0,12.0,0.0,...,1010.8,1006.0,7.0,8.0,17.8,29.7,False,False,12,5


In [13]:
data.shape

(145460, 24)

In [14]:
df=data.sample(n=12000)

In [15]:
df.shape

(12000, 24)

## Dividing the dataset into Independent and Dependent features

In [16]:
X = df.drop(["RainTomorrow"], axis=1)
y = df["RainTomorrow"]

### Train test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, stratify = y, random_state = 0)

In [18]:
y_train

9889     False
37712    False
56537    False
71979     True
5030     False
         ...  
65427     True
19889    False
33713    False
37600    False
89574    False
Name: RainTomorrow, Length: 9600, dtype: bool

## Handling imbalanced dataset

In [19]:
sns.countplot(df["RainTomorrow"])

KeyError: 0

In [20]:
sm=SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_res)))

The number of classes before fit Counter({False: 7538, True: 2062})
The number of classes after fit Counter({False: 7538, True: 7538})


## Decision Tree Classifier

In [21]:
model_dt =  DecisionTreeClassifier(criterion='gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [22]:
model_dt.fit(X_train_res, y_train_res)

In [23]:
y_pred = model_dt.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1499  386]
 [ 193  322]]
0.75875
              precision    recall  f1-score   support

       False       0.89      0.80      0.84      1885
        True       0.45      0.63      0.53       515

    accuracy                           0.76      2400
   macro avg       0.67      0.71      0.68      2400
weighted avg       0.79      0.76      0.77      2400



In [24]:
metrics.plot_roc_curve(model_dt, X_test, y_test)
metrics.roc_auc_score(y_test, y_pred, average=None)

AttributeError: module 'sklearn.metrics' has no attribute 'plot_roc_curve'

## Hyperparameter Optimization

In [25]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(X_train_res,y_train_res)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 3}
Best score is 0.8136169506138626


In [26]:
from sklearn.metrics import accuracy_score
y_pred_tree = tree_cv.predict(X_test)
print(confusion_matrix(y_test,y_pred_tree))
print('Accuracy score {}'.format(accuracy_score(y_test,y_pred_tree)))
print('Classification report {}'.format(classification_report(y_test,y_pred_tree)))

[[1519  366]
 [ 230  285]]
Accuracy score 0.7516666666666667
Classification report               precision    recall  f1-score   support

       False       0.87      0.81      0.84      1885
        True       0.44      0.55      0.49       515

    accuracy                           0.75      2400
   macro avg       0.65      0.68      0.66      2400
weighted avg       0.78      0.75      0.76      2400



## Saving the model to reuse it again

In [27]:
joblib.dump(tree_cv, "decision_tree_model.pkl")

['decision_tree_model.pkl']