In [1]:
import os
import re
import string
import wandb
import evaluate
import pandas as pd
from bs4 import BeautifulSoup
from transformers import TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set before any tokenizer is created. NOTE(review): presumably this is meant to
# switch off the Hugging Face tokenizers' internal parallelism (and the related
# fork warning) -- confirm against the tokenizers documentation.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [2]:
# Load the labelled headlines from the CSV next to the notebook. The preview
# below shows two columns: `headline` (text) and `clickbait` (0/1 label).
df = pd.read_csv(filepath_or_buffer="data.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))

                                            headline  clickbait
0  !Sdrawkcab: Missy Elliott, the Beatles and the...          0
1  "Apprentice" contestant sues Trump for defamation          0
2  "Big morale boost": George H.W. Bush tweets im...          0
3  "Bring it on": Students sue Trump administrati...          0
4  "God made me bulletproof," oft-shot rapper Yun...          0


In [3]:
def clean_text(text):
    """Normalise a raw headline into lowercase text without digits or punctuation.

    Steps, in order:
      1. Strip HTML markup. This must run first: once punctuation has been
         deleted the angle brackets and ``&`` are gone, so the parser could no
         longer recognise tags or entities and their names would leak into
         the text.
      2. Lowercase.
      3. Remove every run of digits.
      4. Delete ASCII punctuation (characters are dropped, not replaced, so
         "oft-shot" becomes "oftshot").
      5. Replace each remaining non-word character with a single space.
         Runs of whitespace are not collapsed.

    Parameters
    ----------
    text : str
        Raw headline. Missing values (None / NaN / pd.NA) are treated as an
        empty string; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned headline.
    """
    if not isinstance(text, str):
        # read_csv yields NaN (a float) for empty cells; .lower() would raise on it.
        text = "" if pd.isna(text) else str(text)

    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags / decode entities
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remaining non-word characters -> space
    return text
# Overwrite the raw headlines with their cleaned form, one element at a time.
df['headline'] = df['headline'].map(clean_text)
# Show the frame so the effect of the cleaning can be inspected.
df

Unnamed: 0,headline,clickbait
0,sdrawkcab missy elliott the beatles and the jo...,0
1,apprentice contestant sues trump for defamation,0
2,big morale boost george hw bush tweets image w...,0
3,bring it on students sue trump administration ...,0
4,god made me bulletproof oftshot rapper yung ma...,0
...,...,...
53024,flip or flop to end in following hosts split,0
53025,groundhog day broadway musical giving away t...,0
53026,scientific racism is on the rise on the righ...,0
53027,the walking dead star to play the punisher i...,0


In [5]:
# Hold out 20% of the headlines for evaluation. Stratifying on the label keeps
# the clickbait / non-clickbait ratio the same in both splits, and the fixed
# random_state makes the split reproducible.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'],
    df['clickbait'],
    test_size=0.2,
    random_state=42,
    stratify=df['clickbait'],
)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Baseline classifier: TF-IDF features over unigrams and bigrams (vocabulary
# capped at 5000 terms) feeding a logistic regression. Wrapping both steps in
# a Pipeline means the vectorizer is fitted on the training split only.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', LogisticRegression(max_iter=200)),
])

# fit() returns the fitted pipeline, so prediction can be chained directly.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6548
           1       0.89      0.81      0.85      4058

    accuracy                           0.89     10606
   macro avg       0.89      0.87      0.88     10606
weighted avg       0.89      0.89      0.89     10606



In [7]:
# Overall accuracy of the baseline on the held-out split, to four decimals.
# (accuracy_score is already imported in the notebook's first cell.)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8874
