In [68]:
# 1️⃣ Data Handling & Processing
import pandas as pd
import numpy as np
import os
import re
import pickle
# 5️⃣ Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# 6️⃣ System Utilities
import time
import warnings
warnings.filterwarnings("ignore")

print("✅ All necessary libraries imported successfully!")


✅ All necessary libraries imported successfully!


In [69]:
df1 = pd.read_csv(r'dataset/fake_or_real_news.csv')
df2 = pd.read_csv(r'dataset/news_dataset.csv')

In [70]:
df1.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [71]:
df2.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [72]:
df1['text'] = df1['title'] + " " + df1['text']

In [73]:
df1.drop(columns=['Unnamed: 0','title'],inplace=True)

In [74]:
df1.head()

Unnamed: 0,text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,Kerry to go to Paris in gesture of sympathy U....,REAL
3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,The Battle of New York: Why This Primary Matte...,REAL


In [75]:
combined_df = pd.concat([df1, df2], ignore_index=True)

In [76]:
combined_df.isnull().sum()

text     8
label    0
dtype: int64

In [77]:
combined_df.dropna(inplace=True)

In [78]:
combined_df.duplicated().value_counts()

False    8535
True     1521
Name: count, dtype: int64

In [79]:
combined_df.drop_duplicates()

Unnamed: 0,text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,Kerry to go to Paris in gesture of sympathy U....,REAL
3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,The Battle of New York: Why This Primary Matte...,REAL
...,...,...
10054,A set of images is being shared on Facebook wi...,FAKE
10055,Barely 48 hours ahead of voting in the assembl...,FAKE
10057,A quote by an impostor Facebook page of Financ...,FAKE
10061,The Bengaluru City Police’s official Twitter h...,FAKE


In [80]:
combined_df['label'].value_counts()

label
FAKE    5035
REAL    5021
Name: count, dtype: int64

In [81]:
X = combined_df.drop('label',axis=1)
y = combined_df['label']

In [82]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [85]:
X_train = X_train["text"].tolist()  # Replace "text" with the actual column name

In [87]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Preprocess text by removing special characters, converting to lowercase, and removing stopwords."""
    if not isinstance(text, str):  
        return ""  
    
    text = re.sub(r'\W', ' ', text).lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# ✅ Ensure X_train and X_test are extracted correctly
if isinstance(X_train, pd.DataFrame):  
    X_train = X_train["text"].tolist()  # Extract text column only if it's a DataFrame
if isinstance(X_test, pd.DataFrame):
    X_test = X_test["text"].tolist()

print(f"✅ Number of training samples: {len(X_train)}")
print(f"✅ Number of testing samples: {len(X_test)}")

# 1. Clean the training texts
X_train_clean = [clean_text(text) for text in X_train]
X_test_clean = [clean_text(text) for text in X_test]

# 2. Tokenizer
vocab_size = 9000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_clean)

# 3. Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_clean)
X_test_sequences = tokenizer.texts_to_sequences(X_test_clean)

# 4. Pad sequences
max_sequence_length = 500
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Final check
print("✅ NLP processing complete.")
print("Shape of processed X_train:", X_train_padded.shape)  # Expected: (num_samples, 500)
print("Shape of processed X_test:", X_test_padded.shape)    # Expected: (num_samples, 500)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Number of training samples: 8044
✅ Number of testing samples: 2012
✅ NLP processing complete.
Shape of processed X_train: (8044, 500)
Shape of processed X_test: (2012, 500)


In [90]:
import pickle
import os

# Define paths
model_dir = "ML/models"
os.makedirs(model_dir, exist_ok=True)  # Ensure the directory exists

# Save preprocessed training & testing data
with open(os.path.join(model_dir, "X_train_padded.pkl"), "wb") as f:
    pickle.dump(X_train_padded, f)

with open(os.path.join(model_dir, "X_test_padded.pkl"), "wb") as f:
    pickle.dump(X_test_padded, f)

with open(os.path.join(model_dir, "y_train.pkl"), "wb") as f:
    pickle.dump(y_train, f)

with open(os.path.join(model_dir, "y_test.pkl"), "wb") as f:
    pickle.dump(y_test, f)

# Save the tokenizer
with open(os.path.join(model_dir, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)

print(f"✅ Processed data and tokenizer saved in '{model_dir}' directory successfully!")


✅ Processed data and tokenizer saved in 'ML/models' directory successfully!
