In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_text
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.feature_extraction.text import CountVectorizer

# Read the dataset from disk into a DataFrame
# (Update file_path with the correct file path for your environment.)
file_path = 'cleaned_balanced_dataset_FINAL.csv'
data = pd.read_csv(file_path)

# Confirm the load and preview the first few rows in a single print call;
# sep="\n" puts the preview on its own line right after the message.
print("Data loaded successfully. Here are the first few rows:", data.head(), sep="\n")


Data loaded successfully. Here are the first few rows:
   label                                            comment
0      1                                               need
1      0                               might well milk last
2      1                                       ask locktrap
3      1  im glad community doesnt make console player f...
4      0                                    joke put stitch


In [2]:
# 'label' is treated as the target column; every remaining column stays in the
# feature frame X (only 'comment' is actually used below).
y = data['label']

# Replace missing comments with empty strings by building a new frame rather
# than calling fillna(inplace=True) on the selected column. Under pandas
# Copy-on-Write the in-place call only changes a temporary Series, so NaN would
# remain in X and CountVectorizer would reject it as an invalid document.
# Deriving X from `data` each time also keeps this cell idempotent on re-run.
X = data.drop('label', axis=1)
X = X.assign(comment=X['comment'].fillna(''))

# Convert each comment into a sparse bag-of-words count vector.
# NOTE(review): the vocabulary is learned from all rows, including those that
# land in the test split below - consider fitting on the training rows only.
vectorizer = CountVectorizer()
X_vect = vectorizer.fit_transform(X['comment'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.3, random_state=42)

print("Data preprocessed successfully. Shape of training data:", X_train.shape)


Data preprocessed successfully. Shape of training data: (91021, 53030)


In [None]:
# Hyperparameter search space: 2 criteria x 6 depths x 3 split sizes x 3 leaf
# sizes = 108 candidate settings.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the fixed seed keeps tree construction repeatable
clf = DecisionTreeClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search over the grid, using all available
# cores and printing per-fit progress. fit() returns the search object itself,
# so construction and fitting are chained.
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2).fit(X_train, y_train)

# Report the winning hyperparameter combination
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Build a fresh decision tree from the selected hyperparameters (same seed as
# the search) and fit it on the training split; fit() returns the estimator.
clf_best = DecisionTreeClassifier(random_state=42, **best_params).fit(X_train, y_train)
print("Model trained successfully with best parameters.")


In [None]:
# Predict labels for the held-out test rows
y_pred = clf_best.predict(X_test)

# Overall accuracy on the test split, printed to two decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Heading followed by the classification report on the next line
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

# Heading followed by the confusion matrix on the next line
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')


In [None]:
# Vocabulary terms in feature-column order, computed once and shared by the
# plot and the text export (as a plain list, which both functions accept).
feature_names = list(vectorizer.get_feature_names_out())

# Class labels taken from the fitted model instead of being hard-coded.
class_names = [str(label) for label in clf_best.classes_]

# Visualize the Decision Tree.
# Only the top levels are drawn: without a depth cap plot_tree renders every
# node of the fitted tree into one figure, which is unreadable and very slow
# for a tree grown on a large bag-of-words matrix. Raise the cap (or set it to
# None) to draw more of the tree.
PLOT_MAX_DEPTH = 3
plt.figure(figsize=(20,10))
plot_tree(
    clf_best,
    max_depth=PLOT_MAX_DEPTH,
    filled=True,
    feature_names=feature_names,
    class_names=class_names,
    rounded=True,
)
plt.title(f'Decision tree (top {PLOT_MAX_DEPTH} levels)')
plt.show()

# Display the tree as text
tree_rules = export_text(clf_best, feature_names=feature_names)
print("Decision Tree Rules:")
print(tree_rules)
