In [271]:
import re
import warnings

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [272]:
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

# Data loading + preprocessing + vectorization

In [273]:
# Read the train and test CSV files into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    ("./dataset/train_health_content.csv", "./dataset/test_health_content.csv"),
)

In [274]:
# Build the model input by joining title and body text with a single space.
# Missing values are replaced with "" first: `NaN + " " + text` evaluates to
# NaN, which would discard whichever field was actually present.
train_df['input_text'] = train_df['title'].fillna("") + " " + train_df['text'].fillna("")
test_df['input_text'] = test_df['title'].fillna("") + " " + test_df['text'].fillna("")

In [275]:
def clean_text(text):
    """Normalize raw text before vectorization.

    Steps, in order: lower-case, replace every non-word character with a
    space, remove runs of digits, collapse whitespace runs to a single space,
    and strip leading/trailing whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Missing values would otherwise be stringified into the spurious
    # tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)   # Replace special characters with a space
    text = re.sub(r'\d+', '', text)   # Remove digits
    # Collapse whitespace last: the two substitutions above introduce new
    # runs of spaces, so doing this first leaves the text un-normalized.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [276]:
# Run clean_text over the combined text column of both splits.
for split_df in (train_df, test_df):
    split_df['input_text'] = split_df['input_text'].apply(clean_text)

In [277]:
# Fit the TF-IDF vectorizer (at most 10,000 features) on the training text
# only, then reuse the fitted vectorizer to transform the test text.
train_corpus = train_df['input_text']
test_corpus = test_df['input_text']

vectorizer = TfidfVectorizer(max_features=10_000)
X_train_tfidf = vectorizer.fit_transform(train_corpus)
X_test_tfidf = vectorizer.transform(test_corpus)

In [278]:
# Targets: the `label` column of each split.
y_train, y_test = train_df['label'], test_df['label']

In [279]:
# Fixed seed for the estimators that use randomness, so that re-running the
# notebook reproduces the same scores.
RANDOM_STATE = 42

# List of models to train (all on the same TF-IDF features)
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB(),
}

# model name -> dict of test-set metrics, kept so the models can be compared
# after the loop instead of only being printed.
results = {}

# Loop through models, train, record and print evaluation metrics
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # precision/recall/f1 use scikit-learn's default binary averaging;
    # zero_division=0 matches the setting used for the printed report below.
    results[model_name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
    }

    print(f'Model: {model_name}')
    print(classification_report(y_test, y_pred, zero_division=0))
    print('\n' + '='*60 + '\n')

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.68      0.98      0.80       299
           1       0.61      0.07      0.13       149

    accuracy                           0.68       448
   macro avg       0.65      0.53      0.47       448
weighted avg       0.66      0.68      0.58       448



Model: Support Vector Machine
              precision    recall  f1-score   support

           0       0.68      0.98      0.80       299
           1       0.64      0.06      0.11       149

    accuracy                           0.68       448
   macro avg       0.66      0.52      0.46       448
weighted avg       0.67      0.68      0.57       448



Model: K-Nearest Neighbors
              precision    recall  f1-score   support

           0       0.70      0.90      0.79       299
           1       0.52      0.21      0.30       149

    accuracy                           0.67       448
   macro avg       0.61      0.56      0