In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

In [2]:
# Locations of the train and test CSV files (relative to the notebook's working directory).
train_file_path, test_file_path = 'train_data.csv', 'test_data.csv'

In [3]:
# Read both CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Reduce dataset size for testing: keep a reproducible 10% random subset of each frame.
# `DataFrame.sample` already draws rows at random, so no separate shuffle is needed;
# the previous unseeded shuffle made the selected rows differ on every run even
# though `sample` itself was seeded.
# NOTE: this cell overwrites its own inputs, so re-running it without re-running
# the loading cell samples 10% of the already-reduced frames.
train_data = train_data.sample(frac=0.1, random_state=42)
test_data = test_data.sample(frac=0.1, random_state=42)

In [5]:
# Split each frame into its text column ('comment') and its target column ('label').
X_train, y_train = train_data['comment'], train_data['label']
X_test, y_test = test_data['comment'], test_data['label']

In [6]:
# Turn the raw comments into TF-IDF features.
# The vectorizer is fitted on the training text only and then reused, unchanged,
# to transform the test text; max_features=5000 limits the vocabulary size.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [7]:
# Standardise the TF-IDF features.
# with_mean=False skips centering because the TF-IDF matrices are sparse.
# The scaler is fitted on the training matrix only and then applied to the test matrix.
scaler = StandardScaler(with_mean=False)
X_train_tfidf_scaled, X_test_tfidf_scaled = (
    scaler.fit_transform(X_train_tfidf),
    scaler.transform(X_test_tfidf),
)

In [8]:
# Initialize models with faster options.
# Every estimator that uses randomness is given a fixed random_state so that the
# reported metrics are reproducible across runs (42 matches the seed used for
# sub-sampling the data).
models = {
    # Small forest, trained on all available cores.
    "Random Forest": RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
    # max_iter=1000 caps the solver's iterations to bound training time; the
    # solver may therefore stop before it has converged.
    "SVM": SVC(kernel='linear', max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    # The 'saga' solver shuffles the data internally, hence the seed.
    "Logistic Regression": LogisticRegression(
        max_iter=1000, solver='saga', n_jobs=-1, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
}

In [9]:
# Fit each model, predict on the test set and keep its classification report.
#   results    -> model name -> report dict from classification_report
#   accuracies -> accuracy of each model, in the iteration order of `models`
results = {}
accuracies = []

for model_name, model in models.items():
    # Only the SVM is trained and scored on the standardised features;
    # every other model uses the plain TF-IDF matrices.
    use_scaled = model_name == "SVM"
    train_features = X_train_tfidf_scaled if use_scaled else X_train_tfidf
    test_features = X_test_tfidf_scaled if use_scaled else X_test_tfidf

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    report = classification_report(y_test, y_pred, output_dict=True)
    results[model_name] = report
    accuracies.append(report['accuracy'])



In [10]:
# Print a short summary for every evaluated model: overall accuracy plus the
# support-weighted precision, recall, F1 score and support from its report.
for model_name in results:
    metrics = results[model_name]
    # One print call with a newline separator emits the same text as printing
    # each line individually; the trailing "\n" item leaves two blank lines
    # between consecutive models.
    print(
        f"Results for {model_name}:",
        f"Accuracy: {metrics['accuracy']}",
        f"Precision: {metrics['weighted avg']['precision']}",
        f"Recall: {metrics['weighted avg']['recall']}",
        f"F1 Score: {metrics['weighted avg']['f1-score']}",
        f"Support: {metrics['weighted avg']['support']}",
        "\n",
        sep="\n",
    )

Results for Random Forest:
Accuracy: 0.5412621359223301
Precision: 0.5518601138003334
Recall: 0.5412621359223301
F1 Score: 0.5193598249523693
Support: 412.0


Results for SVM:
Accuracy: 0.5436893203883495
Precision: 0.544963745409094
Recall: 0.5436893203883495
F1 Score: 0.5414207303618711
Support: 412.0


Results for Naive Bayes:
Accuracy: 0.5461165048543689
Precision: 0.5762918177564628
Recall: 0.5461165048543689
F1 Score: 0.49936431670471254
Support: 412.0


Results for Logistic Regression:
Accuracy: 0.558252427184466
Precision: 0.5647437271873499
Recall: 0.558252427184466
F1 Score: 0.5483616037368917
Support: 412.0


Results for Gradient Boosting:
Accuracy: 0.5533980582524272
Precision: 0.5773012513520179
Recall: 0.5533980582524272
F1 Score: 0.5184957725813576
Support: 412.0




In [None]:
# Plot the accuracy of each model.
# Both the bar labels and the bar heights are read from `results`, so every bar
# is labelled with the model that produced it, even if the evaluation loop
# stopped early or `models` was edited after the models were evaluated.
model_names = list(results)
model_accuracies = [results[name]['accuracy'] for name in model_names]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(model_names, model_accuracies, color=['blue', 'green', 'red', 'purple', 'orange'])
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
plt.show()