In [1]:
# CELL 1: Imports & Setup
# STEP 1: Import libraries
# ============================================
import pandas as pd
import json
import re
import nltk
import joblib

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# STEP 2: Download stopwords
# ============================================
# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download("stopwords")

# Stored as a set because clean_text() does one membership test per word.
STOP_WORDS = {*stopwords.words("english")}


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# CELL 2: Text Preprocessing
# ============================================
# STEP 3: Text cleaning function
# ============================================
def clean_text(text):
    """Normalize a headline before it is vectorized.

    The text is lowercased, tokens starting with ``http`` are dropped, every
    character other than ``a``-``z`` and whitespace is deleted, and words
    found in the module-level ``STOP_WORDS`` set are filtered out.

    Args:
        text: Headline to clean. ``None`` and float NaN (the pandas
            missing-value marker) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The surviving words joined by single spaces, or ``""`` when nothing
        is left.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    # Characters are deleted, not replaced by a space: "son's" becomes "sons".
    text = re.sub(r"[^a-z\s]", "", text)
    return " ".join(word for word in text.split() if word not in STOP_WORDS)


In [3]:
# CELL 3: Load Dataset
# ============================================
# STEP 4: Load sarcasm dataset
# ============================================
dataset_path = "/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json"

# The file is read as JSON Lines: each non-empty line is parsed as its own
# JSON document. The encoding is given explicitly so decoding does not depend
# on the platform's locale default, and blank lines (e.g. a trailing newline)
# are skipped instead of crashing json.loads.
with open(dataset_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df.head()


Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# CELL 4: Label Mapping
# ============================================
# STEP 5: Map labels
# 1 → FAKE / Misleading
# 0 → REAL
# ============================================
# Translate the binary is_sarcastic flag into the string class names used
# as training targets.
label_for_flag = {0: "REAL", 1: "FAKE"}
df["label"] = df["is_sarcastic"].map(label_for_flag)

# Show how many headlines fall into each class.
df["label"].value_counts()


label
REAL    14985
FAKE    11724
Name: count, dtype: int64

In [5]:
# CELL 5: Apply Preprocessing
# ============================================
# STEP 6: Clean headlines
# ============================================
# Run clean_text over every headline and keep the result in a new column,
# leaving the original "headline" column untouched.
df["clean_headline"] = df["headline"].map(clean_text)


In [6]:
# CELL 6: Feature Engineering (KEY PART)
# STEP 7: FeatureUnion (Word + Char TF-IDF)
# ============================================
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=8000,
    min_df=3,
    sublinear_tf=True
)

char_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=3,
    sublinear_tf=True
)

vectorizer = FeatureUnion([
    ("word", word_vectorizer),
    ("char", char_vectorizer)
])

X = vectorizer.fit_transform(df["clean_headline"])
y = df["label"]


In [7]:
# CELL 7: Train–Test Split
# ============================================
# STEP 8: Train-test split
# ============================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
# CELL 8: Train Model
# ============================================
# STEP 9: Train Linear SVM
# ============================================
# Linear SVM trained on the TF-IDF features.
# class_weight="balanced" asks scikit-learn to weight classes inversely to
# their frequency in y_train.
# random_state is pinned (same seed as the train/test split) so repeated runs
# do not depend on the solver's unseeded random number generator.
model = LinearSVC(
    C=1,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)


In [9]:
# CELL 9: Evaluate Model
# ============================================
# STEP 10: Evaluate model
# ============================================
# Predict on the held-out split, then compute both metrics before printing.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(per_class_report)


Accuracy: 0.8030700112317484

Classification Report:

              precision    recall  f1-score   support

        FAKE       0.78      0.77      0.77      2346
        REAL       0.82      0.83      0.83      2996

    accuracy                           0.80      5342
   macro avg       0.80      0.80      0.80      5342
weighted avg       0.80      0.80      0.80      5342



In [10]:
# CELL 10: EXPORT MODEL & VECTORIZER
# ============================================
# STEP 11: Save model & vectorizer
# ============================================
# Persist the classifier and the fitted feature extractor side by side; both
# files are needed together to score new headlines.
for artifact, filename in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [12]:
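# Raw (uncleaned) headlines and their labels: the input for the end-to-end pipeline trained in the next cell.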
X_train = df["headline"].astype(str)
y_train = df["label"]

In [13]:
# Step 2: Correct Training Code (FINAL)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=8000,
        ngram_range=(1, 2),
        stop_words="english"
    )),
    ("clf", LogisticRegression(max_iter=1000))
])

# X_train must be raw text (list / Series of strings)
pipeline.fit(X_train, y_train)

joblib.dump(pipeline, "fake_headline_pipeline.pkl")


['fake_headline_pipeline.pkl']