In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

In [2]:
# Column names for the LIAR TSV files; they are applied positionally when the
# files are read with `names=cols`.
cols = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state",
    "party",
    # presumably per-speaker counts of earlier ratings — confirm against the dataset README
    "barely_true",
    "false",
    "half_true",
    "mostly_true",
    "pants_on_fire",
    "context",
]

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built only once.

    The set is memoised on the function object, so calling this once per row
    (as happens under ``Series.apply``) does not rebuild it each time.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, split with ``word_tokenize``, and tokens that are
    punctuation or English stop words are dropped.

    Args:
        text: The raw string to clean. Must support ``.lower()``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    # `in string.punctuation` is a substring test against the punctuation
    # string, so multi-character tokens such as "..." are kept. This matches
    # the original filtering exactly.
    kept = [
        token
        for token in tokens
        if token not in string.punctuation and token not in stop_words
    ]
    return " ".join(kept)

# Read the three LIAR splits. header=None means the first line of each file is
# treated as data, and the column names are taken from `cols`.
liar_read_options = dict(sep='\t', header=None, names=cols)
train_df = pd.read_csv("data/liar/train.tsv", **liar_read_options)
valid_df = pd.read_csv("data/liar/valid.tsv", **liar_read_options)
test_df = pd.read_csv("data/liar/test.tsv", **liar_read_options)

# Add a cleaned copy of each statement next to the raw text.
train_df = train_df.assign(clean_statement=train_df['statement'].apply(preprocess_text))
valid_df = valid_df.assign(clean_statement=valid_df['statement'].apply(preprocess_text))
test_df = test_df.assign(clean_statement=test_df['statement'].apply(preprocess_text))

In [None]:
def to_binary(label):
    """Map a truthfulness label to 1 (reliable) or 0 (everything else).

    The comparison is case-insensitive; 'true', 'mostly-true' and 'half-true'
    count as reliable.
    """
    normalized = label.lower()
    if normalized in ('true', 'mostly-true', 'half-true'):
        return 1
    return 0

# Attach the 0/1 target column to every split.
for df in (train_df, valid_df, test_df):
    df['binary_label'] = df['label'].apply(to_binary)

# TF-IDF features: the vocabulary (at most 5000 terms) is fitted on the
# training split only and then reused for validation and test.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['clean_statement'])
X_valid, X_test = (
    tfidf.transform(split['clean_statement']) for split in (valid_df, test_df)
)

sia = SentimentIntensityAnalyzer()


def get_score(text):
    """Return the VADER 'compound' polarity score for ``text``."""
    return sia.polarity_scores(text)['compound']


# One sentiment value per row, shaped as a column so it can be stacked next to
# the TF-IDF matrix.
# NOTE(review): the score is computed on the cleaned text (lower-cased, with
# punctuation and stop words removed). Presumably VADER would get more signal
# from the raw statement — confirm before changing, since that would alter the
# reported metrics.
train_sent, valid_sent, test_sent = (
    split['clean_statement'].apply(get_score).values[:, None]
    for split in (train_df, valid_df, test_df)
)

# Append the sentiment column as one extra feature.
X_train = hstack((X_train, train_sent))
X_valid = hstack((X_valid, valid_sent))
X_test = hstack((X_test, test_sent))

y_train, y_valid, y_test = (
    split['binary_label'] for split in (train_df, valid_df, test_df)
)

# Fit the classifier on the training features; fit() returns the estimator.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

val_preds, test_preds = model.predict(X_valid), model.predict(X_test)

# Print the same report for each held-out split: validation first, then test.
for heading, y_true, y_pred in (
    ("Validation Results:", y_valid, val_preds),
    ("\nTest Results:", y_test, test_preds),
):
    print(heading)
    print(classification_report(y_true, y_pred, target_names=["Misleading", "Reliable"]))
    print("Accuracy:", accuracy_score(y_true, y_pred))


Validation Results:
              precision    recall  f1-score   support

  Misleading       0.63      0.48      0.54       616
    Reliable       0.61      0.73      0.66       668

    accuracy                           0.61      1284
   macro avg       0.62      0.61      0.60      1284
weighted avg       0.62      0.61      0.61      1284

Accuracy: 0.6129283489096573

Test Results:
              precision    recall  f1-score   support

  Misleading       0.56      0.44      0.49       553
    Reliable       0.63      0.73      0.68       714

    accuracy                           0.61      1267
   macro avg       0.60      0.59      0.59      1267
weighted avg       0.60      0.61      0.60      1267

Accuracy: 0.606156274664562


In [None]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built only once.

    The set is memoised on the function object, so the word list is not
    rebuilt for every token or every row.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, split with ``word_tokenize``, and tokens that are
    punctuation or English stop words are dropped.

    Args:
        text: The raw string to clean. Must support ``.lower()``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Previously the stop-word list was rebuilt and searched linearly for
    # every token; a cached set gives the same result with O(1) lookups.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    # `in string.punctuation` is a substring test against the punctuation
    # string, so multi-character tokens such as "..." are kept. This matches
    # the original filtering exactly.
    kept = [
        t
        for t in tokens
        if t not in string.punctuation and t not in stop_words
    ]
    return " ".join(kept)

# Load the clickbait headlines, then add a cleaned headline and an integer
# copy of the 'clickbait' column.
data = pd.read_csv("data/clickbait/clickbait_data.csv")
data = data.assign(
    clean_headline=data['headline'].apply(preprocess_text),
    binary_label=data['clickbait'].astype(int),
)

# 70% train; the remaining 30% is halved into validation and test.
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# TF-IDF features: the vocabulary (at most 5000 terms) is fitted on the
# training split only and then reused for validation and test.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['clean_headline'])
X_valid, X_test = (
    tfidf.transform(split['clean_headline']) for split in (valid_df, test_df)
)

sia = SentimentIntensityAnalyzer()


def get_score(text):
    """Return the VADER 'compound' polarity score for ``text``."""
    return sia.polarity_scores(text)['compound']


# One sentiment value per row, shaped as a column so it can be stacked next to
# the TF-IDF matrix.
# NOTE(review): the score is computed on the cleaned headline (lower-cased,
# with punctuation and stop words removed). Presumably VADER would get more
# signal from the raw headline — confirm before changing, since that would
# alter the reported metrics.
train_sent, valid_sent, test_sent = (
    split['clean_headline'].apply(get_score).values[:, None]
    for split in (train_df, valid_df, test_df)
)

# Append the sentiment column as one extra feature.
X_train = hstack((X_train, train_sent))
X_valid = hstack((X_valid, valid_sent))
X_test = hstack((X_test, test_sent))

y_train, y_valid, y_test = (
    split['binary_label'] for split in (train_df, valid_df, test_df)
)

# Fit the classifier on the training features; fit() returns the estimator.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

val_preds, test_preds = model.predict(X_valid), model.predict(X_test)

# Print the same report for each held-out split: validation first, then test.
for heading, y_true, y_pred in (
    ("Validation Results:", y_valid, val_preds),
    ("\nTest Results:", y_test, test_preds),
):
    print(heading)
    print(classification_report(y_true, y_pred, target_names=["Non-Clickbait", "Clickbait"]))
    print("Accuracy:", accuracy_score(y_true, y_pred))


Validation Results:
               precision    recall  f1-score   support

Non-Clickbait       0.93      0.97      0.95      2345
    Clickbait       0.97      0.93      0.95      2455

     accuracy                           0.95      4800
    macro avg       0.95      0.95      0.95      4800
 weighted avg       0.95      0.95      0.95      4800

Accuracy: 0.9485416666666666

Test Results:
               precision    recall  f1-score   support

Non-Clickbait       0.93      0.97      0.95      2396
    Clickbait       0.97      0.93      0.95      2404

     accuracy                           0.95      4800
    macro avg       0.95      0.95      0.95      4800
 weighted avg       0.95      0.95      0.95      4800

Accuracy: 0.9491666666666667


In [None]:
# Leakage check: count cleaned headlines that occur in both the training and
# the test split.
train_statements = set(train_df['clean_headline'])
test_statements = set(test_df['clean_headline'])
duplicates = train_statements & test_statements
print(f"Number of duplicate statements between train and test: {len(duplicates)}")

Number of duplicate statements between train and test: 3


In [12]:
# Count how many rows carry each value of the 'clickbait' column, to check
# the class balance of the full dataset before splitting.
label_counts = data['clickbait'].value_counts()
print(label_counts)

clickbait
0    16001
1    15999
Name: count, dtype: int64


In [13]:
# Count the distinct cleaned headlines shared by the training and test splits.
train_headlines = set(train_df['clean_headline'])
test_headlines = set(test_df['clean_headline'])
overlap = train_headlines & test_headlines
print(f"Overlap in headlines between train and test: {len(overlap)}")

Overlap in headlines between train and test: 3
