In [78]:
! pip install -U scikit-learn
! pip install matplotlib
! pip install pandas
! pip install numpy
! pip install scikit-plot
! pip install scipy==1.11.4



In [79]:
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
import scipy
import sklearn
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

## Load in Data

### TF-IDF

In [80]:
# Directory holding the pickled feature bundles. It defaults to the original
# author's local checkout; set the CS178_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'CS178_DATA_DIR',
    '/Users/shreyanakum/Documents/CS178/CS-178-Project/DATA',
))

# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only point DATA_DIR at files you produced or otherwise trust.
with open(DATA_DIR / 'tfidf_data.pkl', 'rb') as f:
    tfidf_data = pickle.load(f)

# Unpack the bundle into flat names; the dict keys are exactly the ones read here.
X_train_tfidf = tfidf_data['X_train']
X_test_tfidf = tfidf_data['X_test']
y_train_tfidf = tfidf_data['y_train']
y_test_tfidf = tfidf_data['y_test']
vectorizer_tfidf = tfidf_data['vectorizer']

### Count Vectorizer

In [81]:
# Directory holding the pickled feature bundles. It is resolved in this cell so
# the cell can run on its own: it defaults to the original author's local
# checkout and can be redirected with the CS178_DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get(
    'CS178_DATA_DIR',
    '/Users/shreyanakum/Documents/CS178/CS-178-Project/DATA',
))

# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only point DATA_DIR at files you produced or otherwise trust.
with open(DATA_DIR / 'count_data.pkl', 'rb') as f:
    count_data = pickle.load(f)

# Unpack the bundle into flat names; the dict keys are exactly the ones read here.
X_train_count = count_data['X_train']
X_test_count = count_data['X_test']
y_train_count = count_data['y_train']
y_test_count = count_data['y_test']
vectorizer_count = count_data['vectorizer']

In [82]:
# Six label names, kept in this exact order.
# Presumably the dataset's target columns -- TODO confirm; no other cell
# visible in this notebook reads this list.
classes = [
    'toxic',
    'severe_toxic',
    'insult',
    'threat',
    'obscene',
    'identity_hate',
]

### TF-IDF only, because the perceptron revealed that it is the best one

In [83]:
# Search space for the decision-tree grid search: 5 * 3 * 3 = 45 combinations.
# The split criterion ('gini' / 'entropy' / 'log_loss') is deliberately not
# searched here, so the estimator keeps its default criterion.
param_grid = dict(
    max_depth=[10, 15, 20, 25, None],
    min_samples_leaf=[2, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Unfitted base tree that is passed to GridSearchCV as the estimator.
# The seed is fixed so repeated runs build the same trees.
RANDOM_STATE = 42
dt_base = DecisionTreeClassifier(random_state=RANDOM_STATE)

In [None]:
# Exhaustive 5-fold grid search over `param_grid` on the TF-IDF features,
# ranking every candidate by weighted F1.
grid_search = GridSearchCV(
    estimator=dt_base,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search (every candidate is fitted once per fold, so this is slow)
grid_search.fit(tfidf_data['X_train'], tfidf_data['y_train'])

# Print best parameters and best score
print("Best hyperparameters from Grid Search:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_}")

# Best configuration; GridSearchCV refits it on the full training set by default
best_dt = grid_search.best_estimator_

# Per-fold scores of the winning candidate.
# The search already evaluated this exact configuration on the same five
# deterministic folds, so the stored scores are read back from `cv_results_`
# instead of paying for five more fits with cross_val_score.
print('\n')
print("Cross validation scores (5-fold)")
best_idx = grid_search.best_index_
cv_scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][best_idx]
    for fold in range(grid_search.n_splits_)
])
print(f"CV F1 Scores: {cv_scores}")
print(f"Mean CV F1 Score: {cv_scores.mean()} (+/- {cv_scores.std() * 2})")

# Make predictions on the held-out test set
y_pred = best_dt.predict(tfidf_data['X_test'])

# Weighted F1 on the test set, using the same averaging as the search metric
test_f1 = f1_score(tfidf_data['y_test'], y_pred, average='weighted')
print("Test set performance")
print(f"Test Set Weighted F1 Score: {test_f1:.4f}")

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=   9.8s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=  10.1s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=  13.1s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=  13.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=  13.4s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=  13.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=  14.0s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=  14.0s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=  10.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=  10.8s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10; total time=  11.6s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10; total time=  11.6s
[C

In [None]:
# Evaluation of the TF-IDF model on its test split.
# This cell reads `y_pred` and `test_f1` from the TF-IDF grid-search cell. The
# count-vectorizer cells further down reuse those names, so run this cell
# directly after the TF-IDF search or it will report on the wrong predictions.

# Classification report
print('\nClassification report:')
print(classification_report(tfidf_data['y_test'], y_pred))

# Create confusion matrix
print('\nConfusion matrix:')
cm = confusion_matrix(tfidf_data['y_test'], y_pred)
print(cm)

# Plot confusion matrix. The title names the feature set so this figure can be
# told apart from the count-vectorizer one, which otherwise has the same title.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax, cmap='Blues', values_format='d')
ax.set_title('Confusion Matrix - Decision Tree (Best Model, TF-IDF)', fontsize=14, pad=20)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()
plt.show()

# Keep the TF-IDF score and predictions under dedicated names.
# NOTE: despite its name, `acc_tfidfs` holds the weighted F1 score, not accuracy.
acc_tfidfs = [test_f1]
preds_tfidfs = {'best_model': y_pred}

In [None]:
# Same 5-fold grid search as for TF-IDF, this time on the count-vectorizer
# features. NOTE: this rebinds `grid_search`, `best_dt`, `cv_scores`, `y_pred`
# and `test_f1`, replacing the TF-IDF results held under those names.
grid_search = GridSearchCV(
    estimator=dt_base,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2
)

# fit the grid search -> takes very very long
grid_search.fit(count_data['X_train'], count_data['y_train'])

# best parameters and best score
print("Best hyperparameters from Grid Search:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_}")

# best configuration; GridSearchCV refits it on the full training set by default
best_dt = grid_search.best_estimator_

# Per-fold scores of the winning candidate.
# The search already evaluated this exact configuration on the same five
# deterministic folds, so the stored scores are read back from `cv_results_`
# instead of paying for five more fits with cross_val_score.
print('\n')
print("Cross validation scores (5-fold)")
best_idx = grid_search.best_index_
cv_scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][best_idx]
    for fold in range(grid_search.n_splits_)
])
print(f"CV F1 Scores: {cv_scores}")
print(f"Mean CV F1 Score: {cv_scores.mean()} (+/- {cv_scores.std() * 2})")

# make predictions on the held-out test set
y_pred = best_dt.predict(count_data['X_test'])

# weighted F1 on the test set, using the same averaging as the search metric
test_f1 = f1_score(count_data['y_test'], y_pred, average='weighted')
print("Test set performance")
print(f"Test Set Weighted F1 Score: {test_f1:.4f}")

In [None]:
# Evaluation of the count-vectorizer model on its test split.
# Reads `y_pred` from the count-vectorizer grid-search cell, so run that cell first.
print('\nClassification report:')
print(classification_report(count_data['y_test'], y_pred))

print('\nConfusion matrix:')
cm = confusion_matrix(count_data['y_test'], y_pred)
print(cm)

# Plot the confusion matrix. The title names the feature set so this figure can
# be told apart from the TF-IDF one, and the font sizes/padding match that figure.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax, cmap='Oranges', values_format='d')
ax.set_title('Confusion Matrix - Decision Tree (Best Model, Count Vectorizer)', fontsize=14, pad=20)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()
plt.show()