## **Fake news detection ML model**


In [2]:
# importing the necessary packages
import pandas as pd
import matplotlib.pyplot as pt
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy


Reading the dataset


In [3]:
data = pd.read_csv('IFND.csv', encoding="ISO-8859-1")
data


Unnamed: 0,id,Statement,Image,Web,Category,Date,Label
0,2,"WHO praises India's Aarogya Setu app, says it ...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,TRUE
1,3,"In Delhi, Deputy US Secretary of State Stephen...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,VIOLENCE,Oct-20,TRUE
2,4,LAC tensions: China's strategy behind delibera...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,TERROR,Oct-20,TRUE
3,5,India has signed 250 documents on Space cooper...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,TRUE
4,6,Tamil Nadu chief minister's mother passes away...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,ELECTION,Oct-20,TRUE
...,...,...,...,...,...,...,...
56709,56711,Fact Check: This is not Bruce Lee playing ping...,https://akm-img-a-in.tosshub.com/indiatoday/im...,INDIATODAY,MISLEADING,11-2019,Fake
56710,56712,Fact Check: Did Japan construct this bridge in...,https://akm-img-a-in.tosshub.com/indiatoday/im...,INDIATODAY,COVID-19,10-2019,Fake
56711,56713,Fact Check: Viral video of Mexico earthquake i...,https://akm-img-a-in.tosshub.com/indiatoday/im...,INDIATODAY,MISLEADING,10-2019,Fake
56712,56714,Fact Check: Ballet performance by Chinese coup...,https://akm-img-a-in.tosshub.com/indiatoday/im...,INDIATODAY,COVID-19,9-2019,Fake


In [4]:
data.shape


(56714, 7)

In [5]:
data.isnull().sum()


id               0
Statement        0
Image            0
Web              0
Category         0
Date         11321
Label            0
dtype: int64

In [6]:
data = data.drop(columns=['id', 'Image', 'Date'])


In [7]:
data


Unnamed: 0,Statement,Web,Category,Label
0,"WHO praises India's Aarogya Setu app, says it ...",DNAINDIA,COVID-19,TRUE
1,"In Delhi, Deputy US Secretary of State Stephen...",DNAINDIA,VIOLENCE,TRUE
2,LAC tensions: China's strategy behind delibera...,DNAINDIA,TERROR,TRUE
3,India has signed 250 documents on Space cooper...,DNAINDIA,COVID-19,TRUE
4,Tamil Nadu chief minister's mother passes away...,DNAINDIA,ELECTION,TRUE
...,...,...,...,...
56709,Fact Check: This is not Bruce Lee playing ping...,INDIATODAY,MISLEADING,Fake
56710,Fact Check: Did Japan construct this bridge in...,INDIATODAY,COVID-19,Fake
56711,Fact Check: Viral video of Mexico earthquake i...,INDIATODAY,MISLEADING,Fake
56712,Fact Check: Ballet performance by Chinese coup...,INDIATODAY,COVID-19,Fake


DATA PROCESSING :

1.  Make text lowercase
2.  Remove punctuation
3.  Remove emoji’s
4.  Remove stopwords
5.  Lemmatization


In [8]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


In [9]:
############## Converting statements into lowercase ##############
########## To remove punctuation #######
data['new_Statement'] = data['Statement'].apply(lambda x: " ".join(
    x.lower() for x in x.split())).str.replace('[^\w\s]', '')
data['new_Statement'].shape


  x.lower() for x in x.split())).str.replace('[^\w\s]', '')


(56714,)

In [10]:
################## REMOVING EMOJIS####################
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


data['new_Statement'] = data['new_Statement'].apply(lambda x: remove_emoji(x))
data['new_Statement']


0        who praises indias aarogya setu app says it he...
1        in delhi deputy us secretary of state stephen ...
2        lac tensions chinas strategy behind deliberate...
3        india has signed 250 documents on space cooper...
4        tamil nadu chief ministers mother passes away ...
                               ...                        
56709    fact check this is not bruce lee playing pingp...
56710    fact check did japan construct this bridge in ...
56711    fact check viral video of mexico earthquake is...
56712    fact check ballet performance by chinese coupl...
56713    fact check is this little boy crossing into jo...
Name: new_Statement, Length: 56714, dtype: object

In [11]:
################### REMOVING STOP WORDS###################
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
data['new_Statement'] = data['new_Statement'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))
data['new_Statement'].shape


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(56714,)

In [12]:
############### lemmatization using Spacy so that we can count the appearance of each word. Lemmatization removes the grammar tense and transforms each word into its original form. #######
def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])


data['new_Statement'] = data['new_Statement'].apply(space)
data['new_Statement'].head(30)


0     praise indias aarogya setu app say help identi...
1     delhi deputy us secretary state stephen biegun...
2     lac tension china strategy behind deliberately...
3     india sign 250 document space cooperation 59 c...
4         tamil nadu chief minister mother pass away 93
5     bihar assembly election 2020 tej pratap shift ...
6     hathras case cbi reach victim village visit cr...
7     rajasthan crime news karauli another elderly b...
8     mumbai bmc book penalise people step without f...
9     covid19 indias singleday spike drop 55342 tall...
10    amid stubble burn delhis air quality deteriora...
11    bihar assembly election bjp expel nine rebel c...
12    pm modi release balasaheb vikhe patil autobiog...
13    post office recruitment 2020 big vacancy 1371 ...
14             mumbai power outage fire report hospital
15    tamil nadu covid recovery touch sixlakh mark a...
16    indian export armenia increase threefold past ...
17          7 indian hostage free libya good hea

In [13]:
data.head()


Unnamed: 0,Statement,Web,Category,Label,new_Statement
0,"WHO praises India's Aarogya Setu app, says it ...",DNAINDIA,COVID-19,True,praise indias aarogya setu app say help identi...
1,"In Delhi, Deputy US Secretary of State Stephen...",DNAINDIA,VIOLENCE,True,delhi deputy us secretary state stephen biegun...
2,LAC tensions: China's strategy behind delibera...,DNAINDIA,TERROR,True,lac tension china strategy behind deliberately...
3,India has signed 250 documents on Space cooper...,DNAINDIA,COVID-19,True,india sign 250 document space cooperation 59 c...
4,Tamil Nadu chief minister's mother passes away...,DNAINDIA,ELECTION,True,tamil nadu chief minister mother pass away 93


In [14]:
from sklearn.model_selection import train_test_split as tts
X = data[['new_Statement', 'Category']].values
y = data['Label'].values
X_train, X_test, Y_train, Y_test = tts(X, y, test_size=0.3)
print(Y_train)
print(Y_test)
# print(Y_train.groupby('Label').count())
# print(Y_test.groupby(level = 0 ).count())
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)


['TRUE' 'TRUE' 'TRUE' ... 'Fake' 'TRUE' 'Fake']
['TRUE' 'TRUE' 'TRUE' ... 'TRUE' 'TRUE' 'TRUE']
(39699, 2) (39699,)
(17015, 2) (17015,)


In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
# Preprocess the text data


def preprocess(text):
    # TODO: Add your preprocessing steps here
    return text


data['new_Statement'] = data['new_Statement'].apply(preprocess)

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = tts(
    data['new_Statement'], data['Label'], test_size=0.2, random_state=42)

# Bag-of-words feature extraction
count_vectorizer = CountVectorizer(stop_words='english')
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Tfidf feature extraction
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Print the shape of the feature matrices
print(X_train, X_test, Y_train, Y_test)
print('CountVectorizer:', X_train_count.shape, X_test_count.shape)
print('TfidfVectorizer:', X_train_tfidf.shape, X_test_tfidf.shape)


28912    feel like I m rape say karnataka speaker mlas ...
45082     fact check bjp man torture youth lockdown hereõs
30233                first time lpg sell petrol india 2020
47539    man force say camera childkidnapper video use ...
37155    edelweiss fund still bullish indiaï½s runaway ...
                               ...                        
54343    fact check picture hizbul commander zakir musa...
38158         fact check viral photo flood bihar australia
860      sushant singh rajput case ûuddhav never interf...
15795    11 migrant way home die 14 other injure separa...
56422              donï½t believe picture pm modi take dip
Name: new_Statement, Length: 45371, dtype: object 25353    man phone what s like make history high auctio...
4091     pm states start plan vaccine rollout maintain ...
28346    du survey show akhileshmaya popular modi priya...
16648    register first case anticonversion law bareill...
41227    fact check old video statue ganesha immerse ri...
      

In [25]:
from sklearn.metrics import accuracy_score, f1_score

# Train the logistic regression model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, Y_train)

# Predict the labels of the test set
Y_pred_lr = lr.predict(X_test_tfidf)

# Calculate the evaluation metrics
print('Accuracy:', accuracy_score(Y_test, Y_pred_lr))
print('Precision:', precision_score(Y_test, Y_pred_lr, pos_label='Fake'))
# print('Recall:', recall_score(Y_test, Y_pred_lr, pos_label='TRUE'))
print('F1-score:', f1_score(Y_test, Y_pred_lr, pos_label='Fake'))


Accuracy: 0.932733844661906
Precision: 0.9713574097135741
F1-score: 0.8910466942738826


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# Create an instance of the Logistic Regression model
lr_model = LogisticRegression(class_weight='balanced')

# Fit the model to the training data
lr_model.fit(X_train_tfidf, Y_train)

# Predict the labels of the test data
Y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluate the performance of the model

print('Accuracy:', accuracy_score(Y_test, Y_pred_lr))
# print('Precision:', precision_score(Y_test, Y_pred_lr, pos_label='Fake'))
# print('Precision:', precision_score(Y_test, Y_pred_lr, pos_label='TRUE'))
# print('Recall:', recall_score(Y_test, Y_pred_lr, pos_label='Fake'))
# print('Recall:', recall_score(Y_test, Y_pred_lr, pos_label='TRUE'))
# print('F1-score:', f1_score(Y_test, Y_pred_lr, pos_label='Fake'))
# print('F1-score:', f1_score(Y_test, Y_pred_lr, pos_label='TRUE'))


Accuracy: 0.9367892092039143
Precision: 0.9431949250288351
Precision: 0.933968253968254
Recall: 0.8628330255869164
Recall: 0.9739141949152542
F1-score: 0.9012260641961704
F1-score: 0.9535230440137421


In [29]:
new_data = ["This news is completely fake"]
new_data_tfidf = tfidf_vectorizer.transform(new_data)
predicted_label = lr_model.predict(new_data_tfidf)
print(predicted_label)


['Fake']


In [30]:
new_data = [
    "WHO praises India's Aarogya Setu app, says it helped in identifying COVID-19 clusters"]
new_data_tfidf = tfidf_vectorizer.transform(new_data)
predicted_label = lr_model.predict(new_data_tfidf)
print(predicted_label)


['TRUE']


In [31]:
new_data = ["Flagging a dubious claim: Flags hoisted atop houses in Jalandhar have no Pakistan link.Fact Check: This is not an RSS man held for waving Pakistani flag in burqa"]
new_data_tfidf = tfidf_vectorizer.transform(new_data)
predicted_label = lr_model.predict(new_data_tfidf)
print(predicted_label)


['Fake']


**DECISION TREE MODEL**


In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
dt_model.fit(X_train_tfidf, Y_train)

# Predict the labels of the test data
Y_pred_dt = dt_model.predict(X_test_tfidf)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(Y_test, Y_pred_dt))
print('Precision:', precision_score(Y_test, Y_pred_dt, pos_label='TRUE'))
print('Precision:', precision_score(Y_test, Y_pred_dt, pos_label='Fake'))
print('Recall:', recall_score(Y_test, Y_pred_dt, pos_label='TRUE'))
print('Recall:', recall_score(Y_test, Y_pred_dt, pos_label='Fake'))
print('F1-score:', f1_score(Y_test, Y_pred_dt, pos_label='TRUE'))
print('F1-score:', f1_score(Y_test, Y_pred_dt, pos_label='Fake'))


Accuracy: 0.909459578594728
Precision: 0.9350580077343645
Precision: 0.8595213319458896
Recall: 0.9284957627118644
Recall: 0.8715378528092852
F1-score: 0.9317653312072287
F1-score: 0.8654878847413229


In [34]:
######### Save Logistic Regression Model  #########
import joblib

# Save the model to a file
joblib.dump(lr_model, 'lr_model.joblib')


['lr_model.joblib']

In [None]:
###### To load the model and use it for prediction ######

# Load the saved model from a file

# loaded_lr_model = joblib.load('lr_model.joblib')


In [28]:
# from sklearn.linear_model import LogisticRegression

# # Train logistic regression model
# lr = LogisticRegression()
# lr.fit(X_train_count, y_train)

# # Evaluate logistic regression model on test set
# lr_accuracy = lr.score(X_test_count, y_test)
# print("Logistic Regression Accuracy:", lr_accuracy)


In [27]:
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA
# # Perform PCA on the count feature matrix
# pca = PCA(n_components=2)
# X_train_count_pca = pca.fit_transform(X_train_count.toarray())
# # Plot the PCA results
# plt.scatter(X_train_count_pca[:1000, 0],
#             X_train_count_pca, c=Y_train[:1000, 0])
# plt.title('PCA on CountVectorizer Feature Matrix')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.show()

# # Perform PCA on the tfidf feature matrix
# pca = PCA(n_components=2)
# X_train_tfidf_pca = pca.fit_transform(X_train_tfidf.toarray())

# # Plot the PCA results
# plt.scatter(X_train_tfidf_pca[:, 0], X_train_tfidf_pca[:, 1], c=Y_train)
# plt.title('PCA on TfidfVectorizer Feature Matrix')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.show()


In [None]:
data.groupby(['Label', 'Web']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,Statement,Category,new_Statement
Label,Web,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fake,AFP,267,267,267
Fake,ALT,59,59,59
Fake,AUGMENT,11321,11321,11321
Fake,BOOMLIVE,806,806,806
Fake,DIGITEYE,175,175,175
Fake,FACTCHECKER,163,163,163
Fake,FACTCRESCENDO,272,272,272
Fake,INDIATODAY,1606,1606,1606
Fake,NEWSMETER,506,506,506
Fake,NEWSMOBILE,2190,2190,2190


1.  X consists of the independent variables and Y consists of the target variable(Label(True,Fake)).
2.  Then we divided the dataset in training and testing data.
3.  WE'll trian the model using the training data and after that we'll test it using the testing data


In [None]:
# This cell does the divison of training and testing data using the train_test_split function
from sklearn.model_selection import train_test_split as tts
X = data.drop(columns=['Label'])
Y = data['Label']
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.3)
# print(Y_train.groupby('Label').count())
print(Y_test.groupby(level=0).count())


15       1
16       1
17       1
18       1
19       1
        ..
56698    1
56701    1
56703    1
56707    1
56712    1
Name: Label, Length: 17015, dtype: int64
