#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#### Load Cleaned Datasets

In [2]:
# Load the cleaned datasets
train_df = pd.read_csv('E:\\Vocational\\Lighthouse Labs\\Flex Course\\Projects\\P05_Large Language Models\\llm_project\\data\\cleaned_train.csv.gz', compression='gzip')
test_df = pd.read_csv('E:\\Vocational\\Lighthouse Labs\\Flex Course\\Projects\\P05_Large Language Models\\llm_project\\data\\cleaned_test.csv.gz', compression='gzip')

# Extract labels
y_train = train_df['label']
y_test = test_df['label']

#### Generate TF-IDF and BoW Representations

In [3]:
# Initialize vectorizers
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
bow_vectorizer = CountVectorizer(max_features=5000)

# Fit and transform the train datasets, and transform the test datasets
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['clean_text'])

X_train_bow = bow_vectorizer.fit_transform(train_df['clean_text'])
X_test_bow = bow_vectorizer.transform(test_df['clean_text'])

#### Define a Function to Train and Evaluate Models

In [4]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print('Confusion Matrix:')
    print(cm)
    print('Classification Report:')
    print(report)
    
    return accuracy, precision, recall, f1, cm, report

#### Train and Evaluate Models for TF-IDF

In [5]:
print("TF-IDF Representation")

# Logistic Regression
print("Logistic Regression")
lr_model = LogisticRegression(max_iter=1000)
train_and_evaluate_model(lr_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Random Forest
print("Random Forest")
rf_model = RandomForestClassifier(n_estimators=100)
train_and_evaluate_model(rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Gradient Boosting
print("Gradient Boosting")
gb_model = GradientBoostingClassifier(n_estimators=100)
train_and_evaluate_model(gb_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

# SVM
print("SVM")
svm_model = SVC(kernel='linear', probability=True)
train_and_evaluate_model(svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

TF-IDF Representation
Logistic Regression
Accuracy: 0.8771420507237612
Precision: 0.8745513996331445
Recall: 0.8815112540192926
F1 Score: 0.8780175347291725
Confusion Matrix:
[[10788  1573]
 [ 1474 10966]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88     12361
           1       0.87      0.88      0.88     12440

    accuracy                           0.88     24801
   macro avg       0.88      0.88      0.88     24801
weighted avg       0.88      0.88      0.88     24801

Random Forest
Accuracy: 0.8425869924599815
Precision: 0.8534282875124213
Recall: 0.8284565916398714
F1 Score: 0.8407570566160875
Confusion Matrix:
[[10591  1770]
 [ 2134 10306]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84     12361
           1       0.85      0.83      0.84     12440

    accuracy                           0.84     24801
   macro avg       0.84  

(0.8695213902665215,
 0.8721494420184377,
 0.8669614147909968,
 0.8695476900749819,
 array([[10780,  1581],
        [ 1655, 10785]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.87      0.87      0.87     12361\n           1       0.87      0.87      0.87     12440\n\n    accuracy                           0.87     24801\n   macro avg       0.87      0.87      0.87     24801\nweighted avg       0.87      0.87      0.87     24801\n')

#### Conclusion:

Logistic Regression and SVM are the top performers with very similar results. If the goal is to minimize false negatives and ensure most positive cases are correctly identified, Logistic Regression would be preferred due to its slightly higher recall. However, if the goal is to have a balanced performance with a slight edge in precision, SVM would be a strong choice.

#### Train and Evaluate Models for BoW

In [6]:
print("BoW Representation")

# Logistic Regression
print("Logistic Regression")
lr_model = LogisticRegression(max_iter=1000)
train_and_evaluate_model(lr_model, X_train_bow, y_train, X_test_bow, y_test)

# Random Forest
print("Random Forest")
rf_model = RandomForestClassifier(n_estimators=100)
train_and_evaluate_model(rf_model, X_train_bow, y_train, X_test_bow, y_test)

# Gradient Boosting
print("Gradient Boosting")
gb_model = GradientBoostingClassifier(n_estimators=100)
train_and_evaluate_model(gb_model, X_train_bow, y_train, X_test_bow, y_test)

# SVM
print("SVM")
svm_model = SVC(kernel='linear', probability=True)
train_and_evaluate_model(svm_model, X_train_bow, y_train, X_test_bow, y_test)

BoW Representation
Logistic Regression
Accuracy: 0.8439982258779888
Precision: 0.8510690587367904
Recall: 0.8351286173633441
F1 Score: 0.8430234917028442
Confusion Matrix:
[[10543  1818]
 [ 2051 10389]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84     12361
           1       0.85      0.84      0.84     12440

    accuracy                           0.84     24801
   macro avg       0.84      0.84      0.84     24801
weighted avg       0.84      0.84      0.84     24801

Random Forest
Accuracy: 0.8395629208499658
Precision: 0.8466202376075379
Recall: 0.8306270096463022
F1 Score: 0.8385473726922297
Confusion Matrix:
[[10489  1872]
 [ 2107 10333]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12361
           1       0.85      0.83      0.84     12440

    accuracy                           0.84     24801
   macro avg       0.84     

(0.8283536954155074,
 0.837721832439125,
 0.8158360128617363,
 0.8266340867440439,
 array([[10395,  1966],
        [ 2291, 10149]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.82      0.84      0.83     12361\n           1       0.84      0.82      0.83     12440\n\n    accuracy                           0.83     24801\n   macro avg       0.83      0.83      0.83     24801\nweighted avg       0.83      0.83      0.83     24801\n')

#### Overall Comparison:
- **Best Performance**: Logistic Regression with TF-IDF achieved the highest scores in all evaluation metrics.
- **Consistency**: Gradient Boosting showed consistent results with both TF-IDF and BoW, but with slightly better performance using TF-IDF.
- **Model Preference**: Based on the results, TF-IDF representation generally outperformed BoW representation across all models. Logistic Regression and SVM models particularly benefitted from the TF-IDF representation.

### Final Recommendation

For deployment and further optimization, **Logistic Regression with TF-IDF representation** is recommended due to its superior performance across all metrics. SVM with TF-IDF is also a strong contender and can be considered if a slight trade-off in precision is acceptable.