In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [31]:
fake_news_data = pd.read_csv("../data/Fake.csv")
real_news_data = pd.read_csv("../data/True.csv")
fake_news_data["label"] = 1
real_news_data["label"] = 0

In [32]:
df = pd.concat([fake_news_data, real_news_data])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.drop(columns=["subject", "date"])
df["content"] = df["title"] + ' ' + df["text"]

In [36]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    words = text.split()
    stems = [stemmer.stem(word) for word in words if word not in stop_words]
    new_text = ' '.join(stems)
    return new_text

[nltk_data] Downloading package stopwords to /home/sonja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
df['clean content'] = df['content'].apply(clean_text)

In [42]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['clean content'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [43]:
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [45]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:",accuracy)
print("---")
report = classification_report(y_test, y_pred)
print("Report:", report)

Accuracy: 0.9851893095768374
---
Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



precision: How many predicted positives were actually correct
→ TP / (TP + FP)
→ High precision = low false positives

recall: How many actual positives were correctly predicted
→ TP / (TP + FN)
→ High recall = low false negatives

f1-score: Harmonic mean of precision & recall (balances both)
→ 2 * (precision * recall) / (precision + recall)

support: Number of actual samples in each class