In [None]:
!pip install -q scikit-learn pandas matplotlib seaborn nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words() below can find it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words kept as a set; clean_text() tests token membership against it.
STOPWORDS = set(stopwords.words('english'))

In [None]:
# Load the labelled news articles from a local CSV file (nothing is downloaded here).
# The path is relative to the notebook's working directory; change DATA_PATH to use
# another dataset. NOTE(review): the original note says this is McIntire's
# fake/real news CSV from GitHub -- confirm the provenance of the local copy.
DATA_PATH = 'dataset/fake_and_real_news_dataset.csv'

df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
print("shape:", df.shape)

# Encode the class labels as integers: REAL -> 0, FAKE -> 1.
# Adjust label_map if the dataset uses different label strings.
label_map = {'REAL': 0, 'FAKE': 1}

# Series.map turns every value that is not a key of the dict into NaN, so mapping an
# already-encoded 0/1 column a second time would wipe it out. Only map when the column
# still holds something other than the encoded values, which makes this cell re-runnable.
already_encoded = df['label'].dropna().isin(list(label_map.values())).all()
if not already_encoded:
  df['label'] = df['label'].map(label_map)
  # Surface rows whose label was missing or not in label_map instead of letting the
  # NaN travel silently into the stratified split further down.
  n_unlabelled = int(df['label'].isna().sum())
  if n_unlabelled:
    print("warning:", n_unlabelled, "rows have a missing or unrecognised label")
print(df.label.value_counts())

# show some examples
n_examples = min(3, len(df))  # sample() raises when asked for more rows than exist
for _, row in df.sample(n_examples).iterrows():
  print("----")
  print("label:", row['label'])
  # str() keeps the preview from crashing on non-string cells (e.g. NaN text)
  print(str(row['text'])[:400])

In [None]:
def clean_text(text):
  """Normalize raw article text into a space-separated string of tokens.

  Steps, in order: lowercase, turn newlines into spaces, strip URLs
  (http://, https:// and www. forms), replace every character that is not
  a-z, 0-9 or a space with a space, then drop stop words and one-character
  tokens.

  Args:
    text: The raw text. Any non-string value (e.g. NaN or None) is accepted.

  Returns:
    The cleaned tokens joined by single spaces, or "" for non-string input.
  """
  if isinstance(text, str):
    normalized = text.lower()
    # Order matters: URLs must be removed before punctuation is blanked out,
    # otherwise the URL pattern would no longer match.
    for pattern, replacement in (
        (r'\n', ' '),
        (r'https?://\S+|www\.\S+', ''),  # remove URLs
        (r'[^a-z0-9 ]', ' '),            # keep alphanum
    ):
      normalized = re.sub(pattern, replacement, normalized)
    kept = []
    for word in normalized.split():
      if len(word) > 1 and word not in STOPWORDS:
        kept.append(word)
    return " ".join(kept)
  return ""
# Add a 'clean' column holding the normalized form of each row's 'text' value
df['clean'] = df['text'].map(clean_text)

In [None]:
# 70 / 15 / 15 split: hold out 30% of the rows first, then halve that holdout into
# validation and test. Both splits stratify on the label and use the same seed.
X, y = df['clean'], df['label']
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)
print(*(split.shape for split in (X_train, X_val, X_test)))

In [None]:
# TF-IDF over unigrams and bigrams, capped at 20000 features. The vocabulary is fitted
# on the training split only; validation and test are transformed with that same fit.
tf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train_tfidf = tf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tf.transform(part) for part in (X_val, X_test))

# Logistic regression on the TF-IDF features (fit() returns the fitted estimator)
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Report performance on the validation split
y_pred = model.predict(X_val_tfidf)
print("VAL accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, digits=4))

In [None]:
# Evaluate on the test split, which was only transformed (never fitted on) above
y_test_pred = model.predict(X_test_tfidf)
print("TEST accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4))

# Confusion matrix heatmap; tick labels follow the 0 = REAL, 1 = FAKE label encoding
cm = confusion_matrix(y_test, y_test_pred)
class_names = ['REAL','FAKE']
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion matrix')
plt.ylabel('true')
plt.xlabel('predicted')
plt.show()

In [None]:
# Function to make prediction on custom news text
def predict_news(text):
  """Classify a single piece of news text as fake or real.

  The text goes through the same clean_text() preprocessing and the same fitted
  TF-IDF vectorizer (`tf`) that were used for training, then through `model`.

  Args:
    text: The headline or article text to classify.

  Returns:
    "FAKE NEWS" if the model predicts label 1, otherwise "REAL NEWS".
  """
  # Clean the input text using the same preprocessing function used earlier
  cleaned_text = clean_text(text)
  # Transform the cleaned text using the trained TF-IDF vectorizer; the document is
  # passed as a one-element list and the single prediction is unpacked below.
  vectorized = tf.transform([cleaned_text])
  # Predict with the trained model
  prediction = model.predict(vectorized)[0]
  # Return result
  return "FAKE NEWS" if prediction == 1 else "REAL NEWS"
# Prompt for a headline or article, then print the classifier's verdict for it
test_news = input("Enter a news headline or article to check if it's Fake or Real:\n")
verdict = predict_news(test_news)
print("\nPrediction:", verdict)