In [29]:
import pandas as pd

In [30]:
data = pd.read_csv("/content/sample_data/FakeNewsNet.csv")

data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [31]:
data = data.dropna()

In [32]:
X = data.drop("real", axis=1)
y = data["real"]

In [33]:
X.shape

(22866, 4)

In [34]:
y.shape

(22866,)

In [35]:
import tensorflow as tf

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [36]:
# taking a random vocabulary size
voc_size = 5000

In [37]:
# onehot representation
messages = X.copy()

In [38]:
messages.reset_index(inplace=True)

In [39]:
import nltk
import re
from nltk.corpus import stopwords

In [40]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
# PorterStemmer reduces a word to its root form. For example:
# "loved" → "love"
# "running" → "run"

# re
# Python’s built-in Regular Expressions module.
# Used for text cleaning.

# stopwords.words('english')
# Also from NLTK, this is a list of common words that are filtered out (like the, is, in, etc.)
# because they don’t carry significant meaning for text classification tasks.

# corpus = []
# We're initializing an empty list corpus, which will store all the cleaned and preprocessed titles from the dataset.

# This is classic text preprocessing — making data clean, uniform, and easier for a machine learning model to learn from.

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

corpus = []

for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['title'][i]) # substiting all chars other than azAZ to blank
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [43]:
corpus[0:5]

['kandi burruss explod rape accus real housew atlanta reunion video',
 'peopl choic award best red carpet look',
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva',
 'colombian singer maluma spark rumour inappropri relationship aunt',
 'gossip girl year later upper east sider shock world chang pop cultur forev']

In [44]:
one_hot_repr = [one_hot(words, voc_size) for words in corpus]

sentence_length = 20
embedded_docs = pad_sequences(one_hot_repr, padding="pre", maxlen=sentence_length)
embedded_docs[0:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  737,
         199, 1368, 1025, 3397, 4148, 3365, 2582, 2031, 2597],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   54, 2596, 4679,  525, 3784, 2015, 4504],
       [   0,    0,    0,    0,    0, 3550, 2725, 2148, 1356, 1745, 2700,
        1158, 1238, 2947, 4047, 3829, 2705, 3614,   32, 4396],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1598, 2566, 3262, 4300, 1434, 3080, 3593, 2103],
       [   0,    0,    0,    0,    0,    0,    0,  482,  258,  239,  949,
        3093, 1668, 4238, 3670, 3056, 3495, 3351, 2163, 4707]],
      dtype=int32)

In [45]:
embedding_vector_features = 40 # why 40? and what is this
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sentence_length))
model.add(LSTM(100))
model.add(Dense(1, activation = "sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()



In [46]:
import numpy as np

X_final = np.array(embedded_docs)
y_final = np.array(y)

In [47]:
X_final.shape, y_final.shape

((22866, 20), (22866,))

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [49]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 49ms/step - accuracy: 0.7673 - loss: 0.5360 - val_accuracy: 0.8277 - val_loss: 0.4008
Epoch 2/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 44ms/step - accuracy: 0.8543 - loss: 0.3412 - val_accuracy: 0.8285 - val_loss: 0.3929
Epoch 3/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 48ms/step - accuracy: 0.8706 - loss: 0.3078 - val_accuracy: 0.8249 - val_loss: 0.4136
Epoch 4/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - accuracy: 0.8823 - loss: 0.2747 - val_accuracy: 0.8215 - val_loss: 0.4303
Epoch 5/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 45ms/step - accuracy: 0.8970 - loss: 0.2492 - val_accuracy: 0.8166 - val_loss: 0.4566
Epoch 6/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - accuracy: 0.9088 - loss: 0.2224 - val_accuracy: 0.8047 - val_loss: 0.5248
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x7f1fddf83f50>

In [55]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sentence_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])



In [56]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 48ms/step - accuracy: 0.7670 - loss: 0.5496 - val_accuracy: 0.8284 - val_loss: 0.4057
Epoch 2/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 50ms/step - accuracy: 0.8452 - loss: 0.3546 - val_accuracy: 0.8332 - val_loss: 0.3926
Epoch 3/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 39ms/step - accuracy: 0.8595 - loss: 0.3261 - val_accuracy: 0.8248 - val_loss: 0.4019
Epoch 4/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 41ms/step - accuracy: 0.8748 - loss: 0.2908 - val_accuracy: 0.8284 - val_loss: 0.4142
Epoch 5/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - accuracy: 0.8865 - loss: 0.2655 - val_accuracy: 0.8256 - val_loss: 0.4507
Epoch 6/10
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 45ms/step - accuracy: 0.8978 - loss: 0.2392 - val_accuracy: 0.8264 - val_loss: 0.4642
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x7f1fd8257e90>

In [53]:
y_pred = model.predict(X_test)

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)