# Gradient Boosting Classification (rain / no-rain detection)

## Importing the libraries

In [56]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score

## Importing the dataset

In [2]:
# Importing dataset
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
dataset = pd.read_csv(r"C:\Users\55149\Documents\Part 3 - Classification\data_all.csv", index_col=0)

# Binary target: rain event (1) when total rain is greater than 0.2 mm, otherwise no rain (0).
# The flag is assigned straight from the boolean mask rather than by looking the
# matching rows up again through their index labels, so it stays correct even if
# the index read from the CSV contains duplicate labels.
mask = dataset.total_rain > 0.2
dataset['rain'] = mask.astype('int64')

# Selecting columns of interest: class label, binary flag and the columns named '0'..'50'
dataset = dataset[['rain_class', 'rain'] + [str(i) for i in range(51)]]

# Seed NumPy's global RNG for any later cell that draws from it
np.random.seed(42)

# Rows where the 'rain' flag is zero
zero_rain_rows = dataset[dataset['rain'] == 0]

# Randomly keep 766 of the no-rain rows; the seed is passed explicitly so the
# draw does not depend on whatever state the global RNG happens to be in.
selected_rows = zero_rain_rows.sample(n=766, random_state=42)

# Combine the sampled no-rain rows with every row where the 'rain' flag is one
final_dataset = pd.concat([selected_rows, dataset[dataset['rain'] == 1]])

# Print the value counts of 'rain_class' in the final dataset
print(final_dataset['rain_class'].value_counts())

# Later cells read the balanced data through the name `dataset`
dataset = final_dataset

no rain     766
moderate    317
light       301
heavy       142
violent      10
Name: rain_class, dtype: int64


## Hyperparameter tuning with GridSearchCV

In [53]:
# Selecting X and y variables
# X corresponds to power spectrum density values (every column after 'rain_class' and 'rain')
# y corresponds to the binary rain flag (1 = rain event, 0 = no rain)
X = dataset.iloc[:, 2:].values
y = dataset.rain.values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

# Define the parameter grid to search
# The first learning rate was previously written as `0,75`, which Python parses
# as the two separate values 0 and 75; the intended value is 0.75.
param_grid = {
    'n_estimators': [20, 50, 100, 300],
    'learning_rate': [0.75, 0.25, 0.1, 0.01],
    'max_depth': [3, 5, 7, 9],
    'max_features': [3, 5, 7, 10]
}

# Create the gradient boosting classifier
# max_features is smaller than the number of features, so each split considers a
# random feature subset; fixing random_state makes the search repeatable.
gb_clf = GradientBoostingClassifier(random_state=42)

# Create GridSearchCV object (5-fold cross-validation, scored on accuracy)
grid_search = GridSearchCV(gb_clf, param_grid, cv=5, scoring='accuracy')

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'learning_rate': 0.25, 'max_depth': 7, 'max_features': 3, 'n_estimators': 20}
Best Score:  0.9748277809147374


## Training and evaluating the tuned model over repeated splits

In [57]:
# Repeated stratified hold-out evaluation of the gradient boosting model.
# Each iteration draws a new 75/25 split, refits the classifier and records
# accuracy, precision and the false-positive / false-negative counts.
accuracy_values = []
false_positive_values = []
false_negative_values = []
precision_values = []

# Define the number of iterations
num_iterations = 100

# Hyperparameters copied from the grid-search output recorded in this notebook;
# defined once here because they do not change between iterations.
tuned_params = {
    'learning_rate': 0.25,
    'max_depth': 7,
    'max_features': 3,
    'n_estimators': 20,
}

# The iteration number doubles as the seed: every repetition uses a different
# split and fit, yet re-running the cell reproduces the same averages.
for seed in range(num_iterations):

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=seed
    )

    # Applying Gradient Boosting Classifier
    classifier = GradientBoostingClassifier(random_state=seed, **tuned_params)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Pin the label order so the matrix is always [[TN, FP], [FN, TP]]
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    _, false_positive, false_negative, _ = cm.ravel()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)  # Precision for the positive (rain) class

    accuracy_values.append(accuracy)
    false_positive_values.append(false_positive)
    false_negative_values.append(false_negative)
    precision_values.append(precision)

# Calculate the average accuracy
average_accuracy = np.mean(accuracy_values)

# Calculate the average precision
average_precision = np.mean(precision_values)

# Calculate the average false positives and false negatives
average_false_positive = np.mean(false_positive_values)
average_false_negative = np.mean(false_negative_values)

# Print the average accuracy, false positives, false negatives, and precision
print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average False Positives:", average_false_positive)
print("Average False Negatives:", average_false_negative)

Average Accuracy: 0.9679947916666667
Average Precision: 0.9580392300425308
Average False Positives: 8.29
Average False Negatives: 4.0


In [60]:
# Put the test features side by side with the predicted and actual labels.
# X_test, y_test and y_pred are whatever the kernel currently holds; after the
# evaluation loop that is the split and predictions of its last iteration.
feature_columns = dataset.columns[2:]
df_results = pd.DataFrame(X_test, columns=feature_columns).assign(
    predicted=y_pred,
    actual=y_test,
)

# Keep only the rows where the prediction disagrees with the true label
is_wrong = df_results['predicted'].ne(df_results['actual'])
misclassified_df = df_results.loc[is_wrong]

In [61]:
# Display the misclassified test rows (feature columns plus 'predicted' and 'actual')
misclassified_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,predicted,actual
1,26.803503,35.32747,1525.4327,5899.3135,2234.1924,184.72545,81.17536,115.50692,228.42488,264.08987,...,170.60463,137.61133,94.24423,72.64426,62.04622,50.863914,44.866432,38.228294,1,0
31,4.137597,3.8387,48.78822,372.848,332.7042,33.392017,6.906498,10.658554,50.05287,74.56753,...,12.929351,11.506581,10.448172,9.834924,8.857118,7.740404,6.730504,6.04856,0,1
58,4.097923,6.857848,194.160355,826.210815,344.221527,31.922941,13.449718,22.968195,48.732937,77.582993,...,24.291454,23.280214,23.481419,23.368423,21.861965,18.67425,16.542467,15.177099,0,1
139,20.892109,19.467464,489.26837,2760.0364,1528.8889,110.87947,66.37268,158.35374,291.60468,294.30164,...,26.927492,22.884241,19.942509,16.439856,14.472115,12.850518,11.862374,11.204401,0,1
151,37.78154,23.449396,454.02924,3542.6216,2723.3616,235.77727,52.631847,81.71099,245.34659,487.82007,...,51.51901,43.70553,34.92937,30.71181,28.382023,24.757416,21.94479,19.580286,1,0
203,66.781685,55.289986,1781.905396,10387.28125,5497.972656,338.695404,120.944382,180.021057,194.078644,269.786926,...,17.027248,31.904074,52.096306,66.782982,82.563599,102.435799,111.506859,103.013588,1,0
237,119.22978,166.80327,3286.8242,7332.133,8201.739,1944.7749,226.63605,159.18546,444.04333,1217.0308,...,119.32306,179.6063,179.15337,168.15047,161.03532,144.57346,131.77579,141.61896,1,0
249,64.338554,83.33626,2249.9905,9949.998,4365.5083,332.24268,131.78073,193.26733,468.58615,961.34576,...,61.802944,61.367565,44.413944,35.66549,31.328625,28.474276,27.724882,26.920805,1,0
274,11.117081,9.675589,338.8332,1864.327,943.6875,271.35904,127.38571,401.83072,674.18445,309.454,...,30.072775,51.35771,59.95202,58.175816,60.77736,62.773376,61.302666,54.113377,1,0
339,35.406567,32.099865,956.1218,5322.316,2719.7559,438.22968,127.91238,65.98404,117.80872,206.65865,...,155.32143,160.39914,164.96443,169.7373,148.87672,109.93174,85.98491,73.87575,1,0
