In [1]:
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout


from sklearn.metrics import classification_report

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ananyaagrawal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Reading in the data

In [3]:
df = pd.read_csv('data.csv')

print(df.shape)
df.head()

(4009, 4)


Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [4]:
df.dtypes

URLs        object
Headline    object
Body        object
Label        int64
dtype: object

Handling missing values

In [5]:
df.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [6]:
df['Body'] = df['Body'].fillna('')

Preparing data to build neural network

In [7]:
df['News'] = df['Headline']+df['Body']

df.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald TrumpImag...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tributeCou...


Removing unneccesary columns

In [8]:
features_dropped = ['URLs', 'Headline', 'Body']
df = df.drop(features_dropped, axis=1)

Applying NLP techniques for preprocessing

In [9]:
ps = PorterStemmer()
def wordopt(text):
    text = re.sub('[^a-zA-Z]', ' ',text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [10]:
df['News'] = df['News'].apply(wordopt) 

df.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trumpimag co...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agendath feu...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tributecountri sing...


Splitting the data

In [11]:
X = df['News']
Y = df['Label']

In [12]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3207,)
(802,)
(3207,)
(802,)


In [13]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(X_train).toarray()
xv_test = vectorization.transform(X_test).toarray()

Building the model

In [14]:
model = Sequential([
    Input(shape=(xv_train.shape[1],)),      
    Dense(256, activation='relu'),
    Dropout(0.3, name='dropout1'),         
    Dense(128, activation='relu'),
    Dropout(0.3, name='dropout2'),
    Dense(1, activation='sigmoid')
])

In [15]:
# Optimizer configuration
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)



In [16]:
# Train
model.fit(xv_train, y_train, 
          epochs=5, 
          batch_size=32, 
          validation_data=(xv_test, y_test)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x324cc4e20>

In [17]:
# Transform the raw text test data using vectorizer
xv_test = vectorization.transform(X_test)  

# Predict
preds = (model.predict(xv_test) > 0.5).astype("int32")

# Evaluate
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       455
           1       0.98      0.99      0.98       347

    accuracy                           0.99       802
   macro avg       0.98      0.99      0.98       802
weighted avg       0.99      0.99      0.99       802



Saving the model

In [18]:
# Save the trained Keras model
tf.keras.models.save_model(model, "fake_news_model.keras")

# Save the TfidfVectorizer
import pickle
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorization, f)