In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [39]:
df = pd.read_csv("dataset/rfnews.csv")
df.head()

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


In [40]:
df = df.sample(frac = 1)
df.head()

Unnamed: 0,idd,title,text,label
2509,1Vjab!9qS*,"Poverty Rose in 96% of U.S. House Districts, D...","Posted on November 5, 2016 by Eric Zuesse. Eri...",FAKE
4187,8cJi$bceJV,Insider Leaks Bill’s 2-Word Nickname For Hilla...,Email \n\nNo wonder Bill went elsewhere to ful...,FAKE
628,SwiZPfrN60,The power of Trump’s call to free America from...,Donald Trump voters praise him for 'telling it...,REAL
619,VrVe7cx941,"Pressure Is On Trump, Sanders In Crucial Conte...","Pressure Is On Trump, Sanders In Crucial Conte...",REAL
4482,nEzcgRfFrF,GOP senators warn negotiators: US climate goal...,"November 4, 2016 GOP senators warn negotiators...",FAKE


In [41]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.drop(["idd"], axis = 1, inplace = True)

In [42]:
df.head()

Unnamed: 0,title,text,label
0,"Poverty Rose in 96% of U.S. House Districts, D...","Posted on November 5, 2016 by Eric Zuesse. Eri...",FAKE
1,Insider Leaks Bill’s 2-Word Nickname For Hilla...,Email \n\nNo wonder Bill went elsewhere to ful...,FAKE
2,The power of Trump’s call to free America from...,Donald Trump voters praise him for 'telling it...,REAL
3,"Pressure Is On Trump, Sanders In Crucial Conte...","Pressure Is On Trump, Sanders In Crucial Conte...",REAL
4,GOP senators warn negotiators: US climate goal...,"November 4, 2016 GOP senators warn negotiators...",FAKE


In [43]:
def wordopt(text):
    text = text.lower()
    text = re.sub(r"\[.*?\]", '', text)
    text = re.sub(r"\W", " ", text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(rf'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [44]:
df["text"] = df["title"].astype(str) + " " + df["text"]

In [45]:
df["text"] = df["text"].apply(wordopt)

In [46]:
df.head()

Unnamed: 0,title,text,label
0,"Poverty Rose in 96% of U.S. House Districts, D...",poverty rose in of u s house districts dur...,FAKE
1,Insider Leaks Bill’s 2-Word Nickname For Hilla...,insider leaks bill s word nickname for hillar...,FAKE
2,The power of Trump’s call to free America from...,the power of trump s call to free america from...,REAL
3,"Pressure Is On Trump, Sanders In Crucial Conte...",pressure is on trump sanders in crucial conte...,REAL
4,GOP senators warn negotiators: US climate goal...,gop senators warn negotiators us climate goal...,FAKE


In [56]:
df["label"] = df["label"].map({"REAL": 1, "FAKE": 0})

In [57]:
df.head()

Unnamed: 0,title,text,label
0,"Poverty Rose in 96% of U.S. House Districts, D...",poverty rose in of u s house districts dur...,0
1,Insider Leaks Bill’s 2-Word Nickname For Hilla...,insider leaks bill s word nickname for hillar...,0
2,The power of Trump’s call to free America from...,the power of trump s call to free america from...,1
3,"Pressure Is On Trump, Sanders In Crucial Conte...",pressure is on trump sanders in crucial conte...,1
4,GOP senators warn negotiators: US climate goal...,gop senators warn negotiators us climate goal...,0


In [58]:
x = df["text"]
y = df["label"]

In [59]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

vectorization = TfidfVectorizer(stop_words="english", max_df=0.7, ngram_range=(1, 2))
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

base_models = [
    ('pa', PassiveAggressiveClassifier(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]

logistic_regression = LogisticRegression()

stack = StackingClassifier(
    estimators=base_models,
    final_estimator=logistic_regression,
    stack_method='auto'
)

In [61]:
stack.fit(xv_train, y_train)
y_pred = stack.predict(xv_test)

In [62]:
import joblib

joblib.dump(vectorization, 'vectorizer.pkl')
joblib.dump(stack, 'model.pkl')

['model.pkl']

In [63]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

for name, model in base_models:
    model.fit(xv_train, y_train)
    print(f"{name} Accuracy: {accuracy_score(y_test, model.predict(xv_test)):.2%}")

Accuracy: 0.9510337323177367
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       447
           1       0.94      0.97      0.95       472

    accuracy                           0.95       919
   macro avg       0.95      0.95      0.95       919
weighted avg       0.95      0.95      0.95       919

pa Accuracy: 95.43%
rf Accuracy: 90.64%


In [64]:
vec = joblib.load('vectorizer.pkl')
model = joblib.load('model.pkl')

text = ["Breaking news: a man dies in a train crash"]
text = vec.transform(text)
print("Prediction:", model.predict(text)[0])

Prediction: 0
