In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the train/test splits from CSV files in the working directory
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Map original labels to binary values
label_mapping = {'__label__0': 0, '__label__1': 1}


def _encode_labels(labels, source):
    """Translate raw label strings through ``label_mapping``.

    ``Series.map`` silently produces NaN for every value that is not a key of
    the mapping, so the result is checked and a ``ValueError`` naming the
    offending values is raised instead of letting NaN labels flow downstream.

    Parameters
    ----------
    labels : pandas.Series of raw label values.
    source : str used in the error message to identify where the labels came from.

    Returns
    -------
    pandas.Series with the mapped values.
    """
    encoded = labels.map(label_mapping)
    unknown = labels[encoded.isna()]
    if not unknown.empty:
        raise ValueError(
            f"{source}: found labels outside {sorted(label_mapping)}: "
            f"{unknown.unique().tolist()[:5]}"
        )
    return encoded


train_data['label'] = _encode_labels(train_data['label'], 'train.csv')
test_data['label'] = _encode_labels(test_data['label'], 'test.csv')

# Ensure text column is string type and handle missing values
train_data['text'] = train_data['text'].fillna('').astype(str)
test_data['text'] = test_data['text'].fillna('').astype(str)


In [3]:
# Text normalisation applied to each document before vectorisation
def preprocess_text(text: str) -> str:
    """Return ``text`` converted to lowercase.

    Lowercasing is the only transformation performed.
    """
    lowered = text.lower()
    return lowered

# Run preprocess_text over the text column of both splits; both results are
# computed first and then written back to their respective frames.
train_data['text'], test_data['text'] = [
    frame['text'].apply(preprocess_text) for frame in (train_data, test_data)
]


In [4]:
# TF-IDF features, vocabulary capped at 2000 terms
vectorizer = TfidfVectorizer(max_features=2000)

# The vocabulary is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(train_data['text'])
X_test_vec = vectorizer.transform(test_data['text'])

# Targets for both splits
y_train, y_test = train_data['label'], test_data['label']


In [5]:
# Oversample the training split with SMOTE (seeded for repeatability)
smote = SMOTE(random_state=42)
resampled_features, y_train_smote = smote.fit_resample(X_train_vec, y_train)

# Store the resampled features as a CSR matrix
X_train_vec_smote = csr_matrix(resampled_features)


In [6]:
# Helper that trains a classifier and reports its metrics on held-out data
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report its test-set performance.

    Prints the accuracy (four decimal places) followed by the text returned by
    ``classification_report``, both computed from predictions on ``X_test``.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train : features and targets passed to ``model.fit``.
    X_test, y_test : features passed to ``model.predict`` and the targets the
        predictions are compared against.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print(f"Accuracy: {score:.4f}")

    print("Classification Report:")
    print(classification_report(y_test, predictions))

    return score

In [7]:
# Random Forest hyperparameter grid (plain estimator parameter names)
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Tune with SMOTE *inside* the cross-validation loop. Searching over data that
# was resampled up front lets synthetic points interpolated from validation-fold
# samples leak into the training folds, which inflates the CV scores and biases
# the choice of hyperparameters. With an imblearn Pipeline each fold oversamples
# only its own training portion and is validated on untouched original samples.
rf_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Pipeline parameters are addressed as "<step>__<param>"
rf_pipeline_grid = {f"rf__{name}": values for name, values in rf_param_grid.items()}

# Perform Grid Search on the original (un-resampled) training data
rf_grid_search = GridSearchCV(rf_pipeline, rf_pipeline_grid, cv=3, n_jobs=-1)
rf_grid_search.fit(X_train_vec, y_train)

# best_estimator_ is the refitted pipeline; extract its Random Forest step so
# later cells still receive a plain RandomForestClassifier.
best_rf = rf_grid_search.best_estimator_.named_steps['rf']

# Strip the "rf__" step prefix so the report shows plain parameter names
best_rf_params = {
    name.split('__', 1)[1]: value
    for name, value in rf_grid_search.best_params_.items()
}
print(f"Best Random Forest parameters: {best_rf_params}")

Best Random Forest parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [8]:
# Fit the selected Random Forest on the SMOTE-resampled training data and
# score it on the test split
benchmark_score = evaluate_model(
    model=best_rf,
    X_train=X_train_vec_smote,
    y_train=y_train_smote,
    X_test=X_test_vec,
    y_test=y_test,
)

print(f"Benchmark score (Random Forest): {benchmark_score:.4f}")


Accuracy: 0.8809
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3298
           1       0.84      0.65      0.73      1103

    accuracy                           0.88      4401
   macro avg       0.87      0.80      0.83      4401
weighted avg       0.88      0.88      0.88      4401

Benchmark score (Random Forest): 0.8809
