# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import accuracy_score, confusion_matrix
import pyarrow

## Configurations

In [None]:
# pandas display limits: never hide columns, print at most 150 rows.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)

# Apply the seaborn theme first, then override individual matplotlib defaults.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

# Import Dataset

In [None]:
# Read the dataset CSV into a DataFrame (path is relative to the notebook's working directory).
raw_df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')
# Bare expression: shown as the cell output.
raw_df

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
raw_df.info()

Remove rows for which target column is empty

In [None]:
# Keep only the rows whose target column 'RainTomorrow' is present.
raw_df = raw_df.dropna(subset=['RainTomorrow'])

# Train, Validation, Test Split

In [None]:
# Year of each row, taken from the Date column, then one bar per year.
row_years = pd.to_datetime(raw_df['Date']).dt.year
sns.countplot(x=row_years)
plt.title('No. of Rows Per Year');

While working with chronological data, it's often a good idea to separate the training, validation and test sets with time, so that the model is trained on data from the past and evaluated on data from the future.

We'll use the data till 2014 for the training set, data from 2015 for the validation set, and the data from 2016 & 2017 for the test set.  

In [None]:
# Chronological split on the year of the Date column:
#   before 2015 -> training, 2015 -> validation, after 2015 -> test.
year = pd.to_datetime(raw_df['Date']).dt.year

train_df = raw_df.loc[year.lt(2015)]
val_df = raw_df.loc[year.eq(2015)]
test_df = raw_df.loc[year.gt(2015)]

In [None]:
# (rows, columns) of the training, validation and test frames, in that order.
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)

# Identify Inputs & Targets Columns

In [None]:
# Inputs: every column except the first and the last one.
# Target: the last column.
all_columns = train_df.columns
input_cols = all_columns[1:-1].tolist()
target_cols = all_columns[-1]

In [None]:
# Display the selected input column names and the target column name.
input_cols,target_cols

# Identify Xs & Ys

In [None]:
# Independent copies of the inputs (X_*) and the target (Y_*) for each split,
# so later preprocessing does not write into train_df / val_df / test_df.
X_train, Y_train = train_df[input_cols].copy(), train_df[target_cols].copy()
X_val, Y_val = val_df[input_cols].copy(), val_df[target_cols].copy()
X_test, Y_test = test_df[input_cols].copy(), test_df[target_cols].copy()

# Identify Numerical & Categorical Columns

In [None]:
# Partition the input columns by dtype: numbers vs. object (string) columns.
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

In [None]:
# Display both column lists.
numeric_cols, categorical_cols

# Impute Missing Values

In [None]:
# Missing-value count per numeric training column, largest first.
(
    X_train[numeric_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Mean imputation for the numeric columns.
# The means are learned from the training split only: fitting on raw_df would
# let validation/test rows influence the preprocessing (data leakage).
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train[numeric_cols])

In [None]:
# Fill the missing numeric values of every split using the fitted imputer.
for split_features in (X_train, X_val, X_test):
    split_features[numeric_cols] = imputer.transform(split_features[numeric_cols])

In [None]:
# Re-check the numeric training columns for missing values after imputation.
(
    X_train[numeric_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

# Scaling Numeric Features

In [None]:
# Min-max scaling for the numeric columns.
# The per-column min/max are learned from the training split only, so no
# information from the validation/test rows leaks into preprocessing.
# Consequence: scaled validation/test values may fall outside the [0, 1] range.
scaler = MinMaxScaler()
scaler.fit(X_train[numeric_cols])

In [None]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_features in (X_train, X_val, X_test):
    split_features[numeric_cols] = scaler.transform(split_features[numeric_cols])

# Encoding Categorical Columns

In [None]:
# Display the categorical column names that will be one-hot encoded.
categorical_cols

***Note :***<br>
Fill Nans with 'Unknown' value in categorical columns

In [None]:
# Missing-value count per categorical training column, largest first.
(
    X_train[categorical_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Replace missing categorical values with an explicit 'Unknown' category.
# Every split is filled from its own rows. The test split used to be filled
# from X_val, which holds rows of a different year, so the test set's
# categorical columns never received their own values.
X_train[categorical_cols] = X_train[categorical_cols].fillna('Unknown')
X_val[categorical_cols] = X_val[categorical_cols].fillna('Unknown')
X_test[categorical_cols] = X_test[categorical_cols].fillna('Unknown')

In [None]:
# Re-check the categorical training columns for missing values after filling.
(
    X_train[categorical_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Dense one-hot encoder fitted on the training split's categorical columns.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
try:
    # Newer scikit-learn releases name the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_cols])

In [None]:
# Names of the one-hot columns produced by the encoder.
# `get_feature_names` no longer exists in recent scikit-learn releases, so use
# `get_feature_names_out` when available and fall back to the old name otherwise.
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(feature_namer(categorical_cols))

In [None]:
# Display the generated one-hot column names.
encoded_cols

In [None]:
# Append the one-hot columns to every split.
for split_features in (X_train, X_val, X_test):
    split_features[encoded_cols] = encoder.transform(split_features[categorical_cols])

In [None]:
# Keep only the columns the model consumes: scaled numerics + one-hot columns
# (the original categorical columns are dropped by this selection).
model_features = numeric_cols + encoded_cols
X_train, X_val, X_test = X_train[model_features], X_val[model_features], X_test[model_features]

# Training & Visualizing Decision Trees
A decision tree in general parlance represents a hierarchical series of binary decisions:

A decision tree in machine learning works in exactly the same way, except that we let the computer figure out the optimal structure & hierarchy of decisions, instead of coming up with criteria manually.

## Training

In [None]:
# Decision tree with no size limits set; random_state is fixed so reruns give the same tree.
model = DecisionTreeClassifier(random_state = 42)

In [None]:
%%time
# Train on the preprocessed training split; %%time reports how long the cell takes.
model.fit(X_train, Y_train)

## Evaluation

In [None]:
# Predicted target values for the training inputs (despite the X_ prefix, these are predictions).
X_train_pred = model.predict(X_train)
# Bare expression: shown as the cell output.
X_train_pred

In [None]:
# How often each class was predicted on the training set.
# The top-level pd.value_counts is deprecated in recent pandas; the Series
# method returns the same counts.
pd.Series(X_train_pred).value_counts()

It seems the predictions contain more "No" values than "Yes" values.<br>
This is because the training set is also skewed.

In [None]:
# Per-class probabilities predicted for each training row.
train_probs = model.predict_proba(X_train)
# Bare expression: shown as the cell output.
train_probs

In [None]:
# Share of training rows predicted correctly, as a percentage.
# Arguments follow the (y_true, y_pred) convention; accuracy is symmetric in them.
print('Training Accuracy :', 100 * accuracy_score(Y_train, X_train_pred))

The training set accuracy is close to 100%! But we can't rely solely on the training set accuracy, we must evaluate the model on the validation set too. 

We can make predictions and compute accuracy in one step using `model.score`

In [None]:
# model.score predicts on X_val and compares with Y_val in one step; shown as a percentage.
# (Fixes the misspelled "Acuracy" in the printed label.)
print('Validation Accuracy :',model.score(X_val,Y_val)*100)

In [None]:
# Fraction of validation rows belonging to each target class.
Y_val.value_counts().div(len(Y_val))

Although the training accuracy is 100%, the accuracy on the validation set is just about 79%, which is only marginally better than always predicting "No", i.e., always predicting "No" also gives around 78.8% accuracy.<br>

This is because of overfitting.<br>
<b>Note :</b><br>
Decision trees tend to overfit.

## Visualization
We'll visualize the decision tree learned from training data.

In [None]:
# Draw only the top levels (max_depth=2) of the fitted tree on a large canvas.
plt.figure(figsize=(80,50))
# feature_names is passed as a plain list (as in the export_text cell): some
# scikit-learn releases reject a pandas Index for this parameter.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True);

In [None]:
# Depth actually reached by the fitted tree.
model.tree_.max_depth

In [None]:
# Text rendering of the fitted tree's rules, limited to the first 10 levels.
tree_text = export_text(
    model,
    feature_names=X_train.columns.tolist(),
    max_depth=10,
)
print(tree_text)

# Feature Importance
Decision trees can determine the importance of features by themselves.<br>
Below are the importances of the 119 features (the total number of features in the training dataset).

In [None]:
# One importance value per input column, in the order of X_train.columns.
model.feature_importances_

In [None]:
# Pair each feature name with its importance and rank from most to least important.
feature_importance_df = pd.DataFrame(
    dict(Feature=X_train.columns, Importance=model.feature_importances_)
).sort_values('Importance', ascending=False)
feature_importance_df

In [None]:
# Horizontal bars for the 20 highest-ranked features.
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
plt.title('Feature Importance');

# Hyperparameter Tuning To Reduce Overfitting

The `DecisionTreeClassifier` accepts several arguments, some of which can be modified to reduce overfitting.<br>

- `max_depth`
- `max_leaf_nodes`

## max_depth
Reducing the maximum depth of the tree can reduce overfitting.<br>
The maximum depth reached by default is 48; it is reduced to 3 below to reduce overfitting.

In [None]:
# Depth of the tree fitted without a depth limit.
model.tree_.max_depth

In [None]:
# Same classifier as before, but limited to three levels.
model = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
# Train the depth-limited tree on the training split.
model.fit(X_train, Y_train)

In [None]:
# Accuracy of the depth-limited tree on the training split, in percent.
print('Accuracy in Training Dataset :', 100 * model.score(X_train, Y_train))

In [None]:
# Accuracy of the depth-limited tree on the validation split, in percent.
print('Accuracy in Validation Dataset :', 100 * model.score(X_val, Y_val))

#### Visualisation

In [None]:
# Draw the whole depth-limited tree with class labels on the nodes.
plt.figure(figsize=(80,50))
# feature_names / class_names are passed as plain lists: some scikit-learn
# releases reject a pandas Index or a NumPy array for these parameters.
plot_tree(
    model,
    feature_names=list(X_train.columns),
    class_names=list(model.classes_),
    filled=True,
    rounded=True,
);

In [None]:
# Text rendering of the depth-limited tree's rules.
tree_text = export_text(
    model,
    feature_names=X_train.columns.tolist(),
    max_depth=10,
)
print(tree_text)

### max_depth Tuning

Without any manual constraint, the tree (which overfitted) reached a max_depth of 48.<br>
The max_depth value obviously can't be 0 (or less).<br>
So let's find the best value of max_depth by trial and error, and use the max_depth for<br>
which the errors on the training and validation datasets are optimal.

In [None]:
def max_depth_accuracy1(max_depth_val):
    """Fit a depth-limited decision tree and report its accuracies.

    A new ``DecisionTreeClassifier`` (``random_state=42``) limited to
    ``max_depth_val`` levels is trained on the notebook-level
    ``X_train``/``Y_train`` and scored on the training and validation splits.

    Returns a dict with the keys ``'Max_Depth'``, ``'Training_Accuracy'`` and
    ``'Validation_Accuracy'``; both accuracies are percentages.
    """
    depth_limited_tree = DecisionTreeClassifier(max_depth=max_depth_val, random_state=42)
    depth_limited_tree.fit(X_train, Y_train)
    return {
        'Max_Depth': max_depth_val,
        'Training_Accuracy': depth_limited_tree.score(X_train, Y_train) * 100,
        'Validation_Accuracy': depth_limited_tree.score(X_val, Y_val) * 100,
    }

In [None]:
%%time
accuracies_df1 = pd.DataFrame([max_depth_accuracy1(i) for i in range(1,48)])
accuracies_df1

#### Save accuracies_df1 dataframe

In [None]:
# Persist the sweep results to a Parquet file in the working directory.
accuracies_df1.to_parquet('Accuracies_max_depth_tuning1.parquet')

#### Load saved accuracies_df1

In [None]:
# Reload the saved sweep results from the Parquet file written above.
accuracies_df1 = pd.read_parquet('Accuracies_max_depth_tuning1.parquet')

In [None]:
# Display the training/validation accuracy for every max_depth tried.
accuracies_df1

From the dataframe, it can be seen that the training accuracy increases with increase in max_depth.<br>
It is also to be noted that validation accuracy first increases and then decreases.<br>

##### Plotting Tuning Graph
Let us visualise the training accuracy and validation accuracy for different max_depth values.<br>

In [None]:
# Training vs. validation accuracy (both in percent) for each max_depth tried.
plt.title('Training Accuracy Vs Validation Accuracy');
plt.plot(accuracies_df1['Max_Depth'], accuracies_df1['Training_Accuracy']);
plt.plot(accuracies_df1['Max_Depth'], accuracies_df1['Validation_Accuracy']);
plt.legend(['Training Accuracy', 'Validation Accuracy']);
plt.xticks(range(0,48, 2));
plt.xlabel('Max Depth');
# The plotted curves are accuracies, not errors, so the axis is labelled as such.
plt.ylabel('Accuracy (%)');

From the graph it can also be seen that training accuracy increases with increase in max_depth<br>
while validation accuracy first increases (till max_depth = 7) and then decreases.<br>
Therefore, optimal max_depth is 7.

#### Build Decision Tree with max_depth = 7

In [None]:
# Refit with the depth chosen from the tuning curve and report both accuracies (in percent).
model = DecisionTreeClassifier(max_depth=7, random_state=42).fit(X_train, Y_train)
print('Training Accuracy :', 100 * model.score(X_train, Y_train))
print('Validation Accuracy :', 100 * model.score(X_val, Y_val))

## max_leaf_nodes
Another way to control the size or complexity of a decision tree is to limit the number of leaf nodes. This allows branches of the tree to have varying depths.

In [None]:
# Limit the tree by number of leaves instead of depth and report both accuracies (in percent).
model = DecisionTreeClassifier(random_state=42, max_leaf_nodes=128).fit(X_train, Y_train)
print('Training Accuracy :', 100 * model.score(X_train, Y_train))
print('Validation Accuracy :', 100 * model.score(X_val, Y_val))

In [None]:
# Depth reached by the tree fitted with max_leaf_nodes=128.
model.tree_.max_depth

Let's see the accuracies when max_depth was set to 12 while tuning the max_depth parameter.<br>
They are not the same because the number of nodes in that case might be different.

In [None]:
# Row of the depth sweep whose Max_Depth equals the depth of the current tree.
accuracies_df1[accuracies_df1['Max_Depth'].eq(model.tree_.max_depth)]

In [None]:
# Text rendering of the rules learned by the leaf-limited tree.
model_text = export_text(
    model,
    feature_names=X_train.columns.tolist(),
)
print(model_text)

In [None]:
# Classifier limited both in depth and in leaf count; it is only constructed here, not fitted.
model = DecisionTreeClassifier(max_depth=6, max_leaf_nodes=128, random_state=42)

In [None]:
def max_depth_accuracy2(max_depth_val):
    """Fit a tree limited to ``max_depth_val`` levels and 128 leaves; report accuracies.

    A new ``DecisionTreeClassifier`` (``random_state=42``,
    ``max_leaf_nodes=128``) is trained on the notebook-level
    ``X_train``/``Y_train`` and scored on the training and validation splits.

    Returns a dict with the keys ``'Max_Depth'``, ``'Training_Accuracy'`` and
    ``'Validation_Accuracy'``; both accuracies are percentages.
    """
    constrained_tree = DecisionTreeClassifier(
        max_depth=max_depth_val,
        max_leaf_nodes=128,
        random_state=42,
    )
    constrained_tree.fit(X_train, Y_train)
    return {
        'Max_Depth': max_depth_val,
        'Training_Accuracy': constrained_tree.score(X_train, Y_train) * 100,
        'Validation_Accuracy': constrained_tree.score(X_val, Y_val) * 100,
    }

In [None]:
%%time
accuracies_df2 = pd.DataFrame([max_depth_accuracy2(i) for i in range(1,14)])
accuracies_df2

#### Save accuracies_df2 dataframe

In [None]:
# Persist the second sweep's results to a Parquet file in the working directory.
accuracies_df2.to_parquet('Accuracies_max_depth_tuning2.parquet')

#### Load saved accuracies_df2

In [None]:
# Reload the second sweep's results from the Parquet file written above.
accuracies_df2 = pd.read_parquet('Accuracies_max_depth_tuning2.parquet')

##### Plotting Tuning Graph
Let us visualise the training accuracy and validation accuracy for different max_depth values with max_leaf_nodes = 128.<br>

In [None]:
# Training vs. validation accuracy (both in percent) for each max_depth tried with 128 leaves.
plt.title('Training Accuracy Vs Validation Accuracy');
plt.plot(accuracies_df2['Max_Depth'], accuracies_df2['Training_Accuracy']);
plt.plot(accuracies_df2['Max_Depth'], accuracies_df2['Validation_Accuracy']);
plt.legend(['Training Accuracy', 'Validation Accuracy']);
plt.xticks(range(0,16, 2));
plt.xlabel('Max Depth');
# The plotted curves are accuracies, not errors, so the axis is labelled as such.
plt.ylabel('Accuracy (%)');

It seems max_depth = 9 and max_leaf_nodes = 128 are the optimal hyperparameters.

In [None]:
# Fit the tree with the chosen depth/leaf limits and report both accuracies (in percent).
model = DecisionTreeClassifier(random_state=42, max_leaf_nodes=128, max_depth=9).fit(X_train, Y_train)
print('Training Accuracy :', 100 * model.score(X_train, Y_train))
print('Validation Accuracy :', 100 * model.score(X_val, Y_val))