# Fake News Detection System

In [1]:
!pip install numpy pandas scikit-learn nltk joblib



In [2]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import re
import os
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df_fake = pd.read_csv('Fake.csv', engine='python', on_bad_lines='skip')
df_true = pd.read_csv('True.csv', engine='python', on_bad_lines='skip')

df_fake['label'] = 1
df_true['label'] = 0

df = pd.concat([df_fake, df_true]).reset_index(drop=True)

print("Dataset shape:", df.shape)
print("Label distribution:\n", df['label'].value_counts())

Dataset shape: (3221, 5)
Label distribution:
 label
0    1617
1    1604
Name: count, dtype: int64


In [4]:
df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [5]:
df['content'] = df['content'].apply(clean_text)

In [6]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['content'])
y = df['label']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

In [9]:
print("Model Accuracies:")
print(f"Logistic Regression: {acc_lr}")
print(f"Decision Tree: {acc_dt}")
print(f"Random Forest: {acc_rf}")

Model Accuracies:
Logistic Regression: 0.9968992248062015
Decision Tree: 0.9906976744186047
Random Forest: 1.0


In [10]:
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))



Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       298
           1       1.00      1.00      1.00       347

    accuracy                           1.00       645
   macro avg       1.00      1.00      1.00       645
weighted avg       1.00      1.00      1.00       645


Confusion Matrix:
[[298   0]
 [  0 347]]

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       298
           1       1.00      1.00      1.00       347

    accuracy                           1.00       645
   macro avg       1.00      1.00      1.00       645
weighted avg       1.00      1.00      1.00       645


Confusion Matrix:
[[298   0]
 [  0 347]]


In [11]:
os.makedirs('models', exist_ok=True)
joblib.dump(rf, 'models/final_model.pkl')
joblib.dump(vectorizer, 'models/vectorizer.pkl')

print("\nModel and vectorizer saved in 'models/' directory!")


Model and vectorizer saved in 'models/' directory!


In [12]:
sample_text = "Breaking news: Scientists find a new planet in the solar system!"

def preprocess(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [13]:
cleaned = preprocess(sample_text)
X_sample = vectorizer.transform([cleaned])
prediction = rf.predict(X_sample)

print("\nSample Prediction:")
print(f"Text: {sample_text}")
print("Prediction:", "Fake" if prediction[0] == 1 else "Real")



Sample Prediction:
Text: Breaking news: Scientists find a new planet in the solar system!
Prediction: Real
