In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load and preprocess data
file_path = 'new_processed_dataset.csv'
data = pd.read_csv(file_path)
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

X = data['tweet']  # Use the 'tweet' column for text data
y = data['class']  # Use the 'class' column for labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Define the pipeline with LogisticRegression
pipeline_tfidf = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define the parameter grid for hyperparameter tuning for LogisticRegression
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline_tfidf, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters for Logistic Regression:", best_params)
print("Best Cross-Validation Score for Logistic Regression:", best_score)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_tfidf = best_model.predict(X_test_tfidf)

print("TF-IDF Encoding with Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))

# Extracting metrics separately
accuracy = accuracy_score(y_test, y_pred_tfidf)
classification_report_dict = classification_report(y_test, y_pred_tfidf, output_dict=True)
precision = classification_report_dict['weighted avg']['precision']
recall = classification_report_dict['weighted avg']['recall']
f1_score = classification_report_dict['weighted avg']['f1-score']

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for Logistic Regression: {'classifier__C': 100, 'classifier__solver': 'newton-cg'}
Best Cross-Validation Score for Logistic Regression: 0.8828056288478452
TF-IDF Encoding with Logistic Regression
Accuracy: 0.8136112814224402
Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.49      0.30       281
           1       0.94      0.84      0.89      3790
           2       0.75      0.81      0.78       822

    accuracy                           0.81      4893
   macro avg       0.64      0.71      0.66      4893
weighted avg       0.87      0.81      0.84      4893

Accuracy: 0.8136112814224402
Precision: 0.867598200057725
Recall: 0.8136112814224402
F1 Score: 0.8352000801127667


In [2]:
from sklearn.ensemble import RandomForestClassifier

# Define the pipeline with RandomForest
pipeline_tfidf_rf = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', RandomForestClassifier())
])

# Define the parameter grid for hyperparameter tuning for RandomForest
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform GridSearchCV
grid_search_rf = GridSearchCV(pipeline_tfidf_rf, param_grid_rf, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_rf.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and best score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print("Best Parameters for Random Forest:", best_params_rf)
print("Best Cross-Validation Score for Random Forest:", best_score_rf)

# Evaluate the best model on the test set
best_model_rf = grid_search_rf.best_estimator_
y_pred_tfidf_rf = best_model_rf.predict(X_test_tfidf)

print("TF-IDF Encoding with Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf_rf))

# Extracting metrics separately
accuracy_rf = accuracy_score(y_test, y_pred_tfidf_rf)
classification_report_dict_rf = classification_report(y_test, y_pred_tfidf_rf, output_dict=True)
precision_rf = classification_report_dict_rf['weighted avg']['precision']
recall_rf = classification_report_dict_rf['weighted avg']['recall']
f1_score_rf = classification_report_dict_rf['weighted avg']['f1-score']

print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_score_rf}")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best Cross-Validation Score for Random Forest: 0.9545954265611257
TF-IDF Encoding with Random Forest
Accuracy: 0.8726752503576538
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.33      0.35       281
           1       0.93      0.92      0.93      3790
           2       0.77      0.83      0.80       822

    accuracy                           0.87      4893
   macro avg       0.69      0.69      0.69      4893
weighted avg       0.87      0.87      0.87      4893

Accuracy: 0.8726752503576538
Precision: 0.8707649682768325
Recall: 0.8726752503576538
F1 Score: 0.8714268701658704
