In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.utils import pad_sequences
import nltk
from nltk.stem.snowball import SnowballStemmer
import regex as re
from nltk.tokenize import sent_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK resources this notebook asks for, then load the English stop words
for nltk_package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_package)

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Read the fake-news and true-news CSV files into separate frames
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/sample_data/Fake.csv", "/content/sample_data/True.csv")
)

In [None]:
# Preview the first five rows of the true-news frame
df_true.head(n=5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Label them separately: 0 marks true news, 1 marks fake news
for labelled_frame, status_value in ((df_true, 0), (df_fake, 1)):
    labelled_frame['status'] = status_value

In [None]:
# Stack both frames and discard the columns that are not used afterwards
df = pd.concat([df_true, df_fake]).drop(columns=['subject', 'text', 'date'])

In [None]:
# Shuffle the rows. A permutation visits every row exactly once; drawing indices
# with randint() samples WITH replacement, which silently drops some rows and
# duplicates others (duplicates can then land in both the train and test split).
# The generator is seeded so the shuffle is reproducible on a fresh run.
shuffle_rng = np.random.default_rng(42)
random_indexes = shuffle_rng.permutation(len(df))
df = df.iloc[random_indexes].reset_index(drop=True)

In [None]:
# Widen the column display, then show 20 randomly drawn rows
pd.options.display.max_colwidth = 500
random = np.random.randint(low=0, high=len(df), size=20)
df.iloc[random]

Unnamed: 0,title,status
3701,Trump says preparing new executive actions to save coal mining,0
29190,Assange says WikiLeaks not trying to influence U.S. election,0
4403,Phoenix Newspaper Breaks The Internet In Brutal Tweetstorm On Former Sheriff Joe Arpaio,1
30198,Trump says his administration committed to Japan's security,0
37111,"Ivanka Trump Tweets About Dead Syrian Children, And Twitter Has The PERFECT Response (TWEETS)",1
19331,Ex Representative Grimm's restaurant partner to plead guilty: lawyer,0
29462,Italy rescues more than 250 migrants in Mediterranean,0
27963,WHY “MODERATE” MUSLIMS DON’T SPEAK OUT: Muslim Shopkeeper Makes Video Wishing Customers “Happy Easter”…Muslim Man Stabs Him To Death [VIDEO],1
26224,WATCH: Fox Anchor Chris Wallace Rips NRA Stooge A New One For Lying About Gun Control,1
43654,The Existential Question Of Whom To Trust,1


In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
title,0
status,0


In [None]:
# Word count of a single piece of text
def longest_sentence_length(text):
  """Return how many whitespace-separated words ``text`` contains."""
  words = text.split()
  return len(words)

# Word count per title; the largest count is kept as the sequence length
df['maximum_length'] = df['title'].map(longest_sentence_length)
max_length = df['maximum_length'].values.max()
print('longest sentence having length -')
print(max_length)

longest sentence having length -
42


In [None]:
# Text cleaning
# Raw string: in a normal string literal "\b" is the backspace character, so the
# pattern never matched ordinary text. As a raw string, \b reaches the regex
# engine as a word boundary.
text_cleaning = r"\b0\S*|\b[^A-Za-z0-9]+"

def preprocess_filter(text, stem=False):
  """Lower-case ``text``, apply ``text_cleaning``, and drop stop words.

  Args:
    text: value to clean; it is converted with ``str()`` first, so non-string
      values do not raise.
    stem: when True, each kept token is stemmed with the English Snowball stemmer.

  Returns:
    The kept tokens joined by single spaces.
  """
  text = re.sub(text_cleaning, " ", str(text).lower().strip())
  # Build the stemmer once per call instead of once per token.
  stemmer = SnowballStemmer(language='english') if stem else None
  tokens = []
  for token in text.split():
    if token in stop_words:
      continue
    if stemmer is not None:
      token = stemmer.stem(token)
    tokens.append(token)
  return " ".join(tokens)

In [None]:
from tensorflow.keras.preprocessing.text import one_hot


In [None]:
def one_hot_encoded(text,vocab_size=5000,max_length = 40):
    """Map each word of ``text`` to an integer index via the hashing trick.

    Keras ``one_hot`` hashes words with Python's built-in ``hash``, whose string
    hashes are not stable between interpreter sessions. Indices produced while
    training would therefore not match the ones produced after reloading a saved
    model in a new session. ``hashing_trick`` with ``hash_function='md5'``
    applies the same filtering and splitting defaults but is deterministic.

    Args:
        text: text to encode.
        vocab_size: size of the hashing space.
        max_length: unused; kept so existing callers keep working.

    Returns:
        List of integer word indices, one per word.
    """
    # Local import: the notebook only imports `one_hot` from this module.
    from tensorflow.keras.preprocessing.text import hashing_trick

    hot_encoded = hashing_trick(text, vocab_size, hash_function='md5')
    return hot_encoded

In [None]:
# Text -> integer token pipeline
def word_embedding(text):
    """Clean ``text`` with ``preprocess_filter`` and encode it with ``one_hot_encoded``."""
    return one_hot_encoded(preprocess_filter(text))

In [None]:
# Binary classifier: embedding -> LSTM -> single sigmoid output.
# The embedding's input dimension (5000) equals the default vocab_size of
# one_hot_encoded, and the sequence length is the max_length computed above.
embedded_features = 40
model = Sequential()
model.add(Embedding(5000,embedded_features,input_length = max_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy',optimizer= 'adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

None


In [None]:
# Encode every title into a list of integer tokens
one_hot_encoded_title = df['title'].map(word_embedding).to_numpy()


In [None]:
# Left-pad the encoded titles so all sequences have max_length entries
padded_encoded_title = pad_sequences(
    one_hot_encoded_title,
    padding='pre',
    maxlen=max_length,
)

In [None]:
# Features (padded token matrix) and labels (status column)
X = padded_encoded_title
y = np.array(df['status'].values)

# shapes
print(X.shape, y.shape, sep='\n')

(44898, 42)
(44898,)


array([[   0,    0,    0, ...,    6,  260,  595],
       [   0,    0,    0, ..., 2164,  311,   52],
       [   0,    0,    0, ...,  263, 1610,  893],
       ...,
       [   0,    0,    0, ..., 1418, 2087,  595],
       [   0,    0,    0, ..., 1667, 3162, 1677],
       [   0,    0,    0, ..., 4023, 1513, 3685]], dtype=int32)

In [None]:
# Report the shapes of the feature matrix and label vector
x_shape_message = 'X shape {}'.format(X.shape)
y_shape_message = 'y shape {}'.format(y.shape)
print(x_shape_message, y_shape_message, sep='\n')

X shape (44898, 42)
y shape (44898,)


In [None]:
# Hold out a test split; no test_size is given and the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Shape and size of train and test dataset
for split_message in (
    'X train shape {}'.format(X_train.shape),
    'X test shape {}'.format(X_test.shape),
    'y train shape {}'.format(y_train.shape),
    'y test shape {}'.format(y_test.shape),
):
    print(split_message)

X train shape (33673, 42)
X test shape (11225, 42)
y train shape (33673,)
y test shape (11225,)


In [None]:
# Model training: 15 epochs, batches of 64, evaluated on the held-out split after each epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_data=(X_test, y_test),
)

Epoch 1/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 106ms/step - accuracy: 0.8640 - loss: 0.3206 - val_accuracy: 0.9512 - val_loss: 0.1241
Epoch 2/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 75ms/step - accuracy: 0.9691 - loss: 0.0847 - val_accuracy: 0.9615 - val_loss: 0.1087
Epoch 3/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 83ms/step - accuracy: 0.9852 - loss: 0.0470 - val_accuracy: 0.9633 - val_loss: 0.1095
Epoch 4/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 86ms/step - accuracy: 0.9925 - loss: 0.0259 - val_accuracy: 0.9661 - val_loss: 0.1185
Epoch 5/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 76ms/step - accuracy: 0.9950 - loss: 0.0182 - val_accuracy: 0.9673 - val_loss: 0.1408
Epoch 6/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 75ms/step - accuracy: 0.9953 - loss: 0.0152 - val_accuracy: 0.9678 - val_loss: 0.1464
Epoch 7/15
[1m

<keras.src.callbacks.history.History at 0x7d60788cd690>

In [None]:
# Persist the trained model to disk
model.save(filepath='fake_news.keras')

In [None]:
from tensorflow.keras.models import load_model

# Load the file written by model.save('fake_news.keras'). The previous path,
# 'fake_news.h5', is never created by this notebook.
model = load_model('fake_news.keras')


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode input text with the same pipeline that produced the training features.
# A placeholder returning constant tokens here would make every prediction
# independent of the text being classified.
def word_embedding(text):
    """Clean ``text`` with ``preprocess_filter`` and encode it with ``one_hot_encoded``."""
    return one_hot_encoded(preprocess_filter(text))

# Pad prediction inputs to the sequence length the loaded model was built with
# (axis 1 of its input shape) rather than a hard-coded 100, which does not match
# the 42-wide sequences the model was trained on.
max_length = model.input_shape[1]

def prediction_input_processing(text):
    """Classify one piece of text and return a human-readable verdict.

    The text is encoded with ``word_embedding``, left-padded to ``max_length``,
    and scored by ``model``; a score above 0.4 is reported as fake.
    """
    token_ids = word_embedding(text)
    model_input = pad_sequences([token_ids], maxlen=max_length, padding='pre')
    score = model.predict(model_input)
    flagged = np.where(score > 0.4, 1, 0)
    return 'Yes, this News is fake' if flagged[0][0] == 1 else 'No, it is not fake'


In [None]:
# Classify one hand-written headline and print the verdict
news_text = "Donald Trumph Is President"

result = prediction_input_processing(text=news_text)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Yes, this News is fake


In [None]:
# setting threshold value
def best_threshold_value(thresholds:list,X_test):
    """Tabulate test accuracy for each candidate decision threshold.

    Args:
        thresholds: cut-off values to try; a score above the cut-off counts as class 1.
        X_test: feature matrix passed to ``model.predict``.

    Returns:
        DataFrame with one row per threshold and columns 'Threshold' and 'Accuracy'.

    Note:
        Accuracy is measured against the module-level ``y_test``.
    """
    # The scores do not depend on the threshold, so predict once, not once per threshold.
    scores = model.predict(X_test)
    accuracies = [
        accuracy_score(y_test, np.where(scores > thresh, 1, 0))
        for thresh in thresholds
    ]
    return pd.DataFrame({
        'Threshold': thresholds,
        'Accuracy' : accuracies
    })

In [None]:
# Prediction value at threshold 0.4
test_scores = model.predict(X_test)
y_pred = np.where(test_scores > 0.4, 1, 0)

[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step


In [None]:
# Confusion matrix
# scikit-learn's metric functions take (y_true, y_pred). With the arguments
# swapped, the matrix is transposed and precision/recall are exchanged.
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred))

Confusion matrix
[[5155  187]
 [ 175 5708]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      5342
           1       0.97      0.97      0.97      5883

    accuracy                           0.97     11225
   macro avg       0.97      0.97      0.97     11225
weighted avg       0.97      0.97      0.97     11225



In [None]:
# input generator
def prediction_input_processing(text):
    """Classify one piece of text and return a human-readable verdict.

    The text is encoded with ``word_embedding``, left-padded to ``max_length``,
    and scored by ``model``. Fake news is labelled 1 in the training data, so a
    score above 0.4 is reported as fake.
    """
    encoded = word_embedding(text)
    padded_encoded_title = pad_sequences([encoded],maxlen=max_length,padding = 'pre')
    output = model.predict(padded_encoded_title)
    # Flag as fake when the score EXCEEDS the threshold. The previous comparison
    # (0.4 > output) was inverted and reported low fake-scores as fake.
    output = np.where(output > 0.4,1,0)
    if output[0][0] == 1:
        return 'Yes this News is fake'
    return 'No, It is not fake'

In [None]:
# Classify a sample headline; the verdict string is shown as the cell output
prediction_input_processing(text='Americans are more concerned over Indians fake open source contribution')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step


'No, It is not fake'