In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix


In [2]:
# Load the train/test splits from CSV files in the working directory
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Map original labels to binary values
label_mapping = {'__label__0': 0, '__label__1': 1}

for split_name, split_frame in (('train', train_data), ('test', test_data)):
    mapped_labels = split_frame['label'].map(label_mapping)

    # Series.map turns every value that is missing from the mapping into NaN.
    # Fail here, naming the offending values, instead of letting NaN targets
    # reach resampling and model fitting.
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unexpected = split_frame.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            f"Unexpected label values in {split_name}.csv: {unexpected}; "
            f"expected one of {list(label_mapping)}"
        )
    split_frame['label'] = mapped_labels

    # Ensure text column is string type and handle missing values
    split_frame['text'] = split_frame['text'].fillna('').astype(str)


In [3]:
# Define a simple text preprocessing function
def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to normalise; its ``lower()`` method is called.

    Returns:
        The lowercased version of ``text``.
    """
    lowered = text.lower()
    return lowered

# Run the preprocessing function over the text column of both splits
test_data['text'] = test_data['text'].map(preprocess_text)
train_data['text'] = train_data['text'].map(preprocess_text)


In [4]:
# Targets for both splits
y_train, y_test = train_data['label'], test_data['label']

# TF-IDF features capped at the 2000 highest-ranked terms.
# The vocabulary is learned from the training text only; the test text is
# projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=2000)
X_train_vec = vectorizer.fit_transform(train_data['text'])
X_test_vec = vectorizer.transform(test_data['text'])


In [5]:
# Oversample the training set with SMOTE (seeded for reproducibility);
# the test set is left as it is.
smote = SMOTE(random_state=42)
resampled_features, y_train_smote = smote.fit_resample(X_train_vec, y_train)

# Store the resampled features as a CSR matrix
X_train_vec_smote = csr_matrix(resampled_features)


In [6]:
# Candidate classifiers keyed by display name (insertion order is the
# evaluation order). Estimators that accept a seed use random_state=42.
models = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Support Vector Machine", SVC(kernel='linear', random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Naive Bayes", MultinomialNB()),
    ("SGD Classifier", SGDClassifier(max_iter=1000, random_state=42)),
])

# Accuracy per model name, filled in by evaluate_model
results = dict()

In [7]:
# Function to evaluate models
def evaluate_model(model_name, model, X_train_vec, y_train, X_test_vec, y_test):
    """Fit ``model``, score it on the test split and print a report.

    On success the accuracy is stored in the module-level ``results`` dict
    under ``model_name`` and is also returned, so callers do not have to read
    the shared dict to get the outcome of this particular call.

    Args:
        model_name: Display name; used in the printed output and as the key
            in ``results``.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place.
        X_train_vec: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test_vec: Test features passed to ``model.predict``.
        y_test: True test labels used for scoring.

    Returns:
        The test accuracy, or ``None`` if fitting, predicting or scoring
        raised. In that case ``results`` is left untouched.
    """
    print(f"\n--- {model_name} ---")
    try:
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)
        # Build the report before recording anything, so a failure here
        # cannot leave a half-recorded result behind.
        report = classification_report(y_test, y_pred)
    except Exception as e:
        # Best-effort on purpose: one failing model must not stop the others
        # from being evaluated.
        print(f"Error evaluating {model_name}: {e}")
        return None

    results[model_name] = accuracy
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    return accuracy

In [8]:
# Fit each baseline model on the SMOTE-resampled training data and score it
# on the test split
for model_name, model in models.items():
    evaluate_model(
        model_name,
        model,
        X_train_vec_smote,
        y_train_smote,
        X_test_vec,
        y_test,
    )

# Report the model with the highest recorded accuracy, if there is one
if not results:
    print("No models were evaluated. Please check the data and preprocessing steps.")
else:
    best_model, best_accuracy = max(results.items(), key=lambda item: item[1])
    print(f"\nThe best model is: {best_model} with an accuracy of {best_accuracy:.4f}")



--- Random Forest ---
Accuracy: 0.8773
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3298
           1       0.83      0.64      0.72      1103

    accuracy                           0.88      4401
   macro avg       0.86      0.80      0.82      4401
weighted avg       0.87      0.88      0.87      4401


--- Logistic Regression ---
Accuracy: 0.8160
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3298
           1       0.62      0.71      0.66      1103

    accuracy                           0.82      4401
   macro avg       0.76      0.78      0.77      4401
weighted avg       0.83      0.82      0.82      4401


--- Support Vector Machine ---
Accuracy: 0.8025
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      3298
           1       0.59      0.72

### Adding Hyperparameter Tuning

In [9]:
from sklearn.model_selection import GridSearchCV


In [10]:
# Hyperparameter search space per model, keyed by the same display names as
# `models`. Each value is passed to GridSearchCV as its `param_grid`, which
# accepts either one dict or a list of dicts (one sub-grid per dict).
param_grid = {
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        # liblinear supports both the l1 and the l2 penalty
        'solver': ['liblinear']
    },
    "Support Vector Machine": [
        # gamma is a coefficient of the rbf/poly/sigmoid kernels only; the
        # linear kernel ignores it, so it is searched for rbf alone rather
        # than refitting identical linear models once per gamma value.
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10, 100]
        },
        {
            'kernel': ['rbf'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto']
        }
    ],
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7]
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "Naive Bayes": {
        'alpha': [0.01, 0.1, 1, 10, 100]
    },
    "SGD Classifier": {
        # 'log_loss' is the current name of the logistic loss; the old
        # spelling 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
        'loss': ['hinge', 'log_loss', 'perceptron'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'max_iter': [1000, 2000]
    }
}


In [11]:
def tune_hyperparameters(model_name, model, param_grid, X_train_vec, y_train):
    """Grid-search ``model`` over ``param_grid[model_name]`` with 5-fold CV.

    Candidates are ranked by cross-validated accuracy and the search runs on
    all available cores (``n_jobs=-1``).

    Args:
        model_name: Display name; also the key looked up in ``param_grid``.
        model: Estimator to tune.
        param_grid: Mapping from model name to a GridSearchCV parameter grid.
        X_train_vec: Training features used for the search.
        y_train: Training labels used for the search.

    Returns:
        ``grid_search.best_estimator_`` when the search succeeds. The
        original ``model`` is returned unchanged when no grid is registered
        for ``model_name`` or when the search raises.
    """
    print(f"\n--- Tuning {model_name} ---")

    # Handle a missing grid explicitly rather than as a caught KeyError, whose
    # message would be nothing more than the quoted model name.
    if model_name not in param_grid:
        print(f"No parameter grid defined for {model_name}; keeping the untuned model.")
        return model

    try:
        # NOTE(review): the notebook calls this with SMOTE-resampled data, so
        # validation folds may contain synthetic samples derived from training
        # folds and CV accuracy is presumably optimistic. Confirm, and consider
        # resampling inside each fold via an imblearn Pipeline.
        grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train_vec, y_train)
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best cross-validated accuracy for {model_name}: {grid_search.best_score_:.4f}")
        return grid_search.best_estimator_
    except Exception as e:
        # Best-effort on purpose: fall back to the untuned model so the
        # remaining models can still be tuned.
        print(f"Error tuning {model_name}: {e}")
        return model


In [12]:
# Test accuracy of each tuned model, keyed by model name. A model appears
# here only if evaluating its tuned version succeeded.
tuned_results = {}

for model_name, model in models.items():
    tuned_model = tune_hyperparameters(model_name, model, param_grid, X_train_vec_smote, y_train_smote)

    # evaluate_model reports through the shared `results` dict and writes to it
    # only after a successful fit and predict. Take any earlier entry out
    # first, so a failed evaluation cannot be mistaken for a tuned score.
    previous_accuracy = results.pop(model_name, None)
    evaluate_model(model_name, tuned_model, X_train_vec_smote, y_train_smote, X_test_vec, y_test)

    if model_name in results:
        tuned_results[model_name] = results[model_name]
    else:
        print(f"Skipping {model_name}: the tuned model could not be evaluated.")
        # Put the earlier entry back so `results` is unchanged by the failure.
        if previous_accuracy is not None:
            results[model_name] = previous_accuracy


--- Tuning Random Forest ---
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

--- Random Forest ---
Accuracy: 0.8809
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3298
           1       0.84      0.65      0.73      1103

    accuracy                           0.88      4401
   macro avg       0.87      0.80      0.83      4401
weighted avg       0.88      0.88      0.88      4401


--- Tuning Logistic Regression ---
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

--- Logistic Regression ---
Accuracy: 0.8355
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      3298
           1       0.65      0.73      0.69      1103

    accuracy                           0.84      4401
   macro avg       0.78      0.80      0.79      4401
weighted avg 

In [13]:
# Report the tuned model with the highest test accuracy, if there is one
if not tuned_results:
    print("No models were evaluated after tuning. Please check the data and preprocessing steps.")
else:
    best_tuned_model, best_tuned_accuracy = max(tuned_results.items(), key=lambda item: item[1])
    print(f"\nThe best tuned model is: {best_tuned_model} with an accuracy of {best_tuned_accuracy:.4f}")



The best tuned model is: Random Forest with an accuracy of 0.8809
