In [36]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV
import re
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [37]:
# Load the raw dataset from a CSV file located in the working directory.
df=pd.read_csv('covid_fake.csv')

In [38]:
# Preview the first rows of the raw frame (title, text, source, label).
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [39]:
# (rows, columns) of the raw frame.
df.shape

(1164, 4)

In [40]:
# Distribution of the raw labels; the output shows inconsistent casing
# ('Fake' vs 'fake') that is normalised in the cleaning cell.
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
Name: label, dtype: int64

In [41]:
# Inspect a label-based slice of rows starting at index 5; row 5 has a title
# but no text, source or label.
df.loc[5:15]

Unnamed: 0,title,text,source,label
5,CORONA UNMASKED: Chinese Intelligence Officer ...,,,
6,,Urgent: Health Bulletin to the Public. Ministr...,Ministry of Health,Fake
7,,"Pls tell ur families, relatives and friendsMOH...",NWLLAB,Fake
8,,SERIOUS EXCELLENT ADVICE by Japanese doctors t...,Japanese doctors treating COVID-19 cases,Fake
9,Basic protective measures against the new coro...,Stay aware of the latest information on the CO...,https://www.who.int/emergencies/diseases/novel...,TRUE
10,,The new Coronavirus may not show signs of infe...,Taiwan Experts,Fake
11,,A vaccine meant for cattle can be used to figh...,facebook,Fake
12,,Using a hair dryer to breathe in hot air can c...,Youtube,Fake
13,,Corona virus before it reaches the lungs it re...,twitter,Fake
14,Exposing yourself to the sun or to temperature...,"You can catch COVID-19, no matter how sunny or...",https://www.who.int/emergencies/diseases/novel...,TRUE


In [42]:
# Number of missing values per column before cleaning.
df.isna().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [43]:
# --- Clean the raw frame in place -------------------------------------------
# Every write below goes through a single `.loc[row, col]` call or an explicit
# column re-assignment. The previous chained forms (`df.loc[i]['label'] = ...`
# and `df.col.fillna(..., inplace=True)`) write to a temporary object and are
# silently ignored under pandas Copy-on-Write.

# Unify the spelling of the negative class ('Fake' / 'fake' -> 'FAKE').
df.loc[df['label'].isin(['Fake', 'fake']), 'label'] = 'FAKE'

# Unify the spelling of one source name.
df.loc[df['source'] == 'facebook', 'source'] = 'Facebook'

# Rows without body text fall back to their title.
df['text'] = df['text'].fillna(df['title'])

# Hand-assigned labels for specific rows (index label -> class).
manual_labels = {5: 'FAKE', 15: 'TRUE', 43: 'FAKE', 131: 'TRUE', 242: 'FAKE'}
for row_label, label_value in manual_labels.items():
    df.loc[row_label, 'label'] = label_value

# Placeholder for missing titles / sources.
# NOTE(review): the placeholder string 'misiing' (sic) is kept unchanged
# because it is concatenated into `title_text` and therefore becomes a token
# seen by the model; it may act as a label-correlated feature for rows that
# have no title -- consider an empty string for the title instead.
df['title'] = df['title'].fillna('misiing')
df['source'] = df['source'].fillna('misiing')

# Single text field used for feature extraction.
df['title_text'] = df['title'] + ' ' + df['text']

In [44]:
# Re-count missing values per column after the cleaning cell.
df.isna().sum()

title         0
text          0
source        0
label         0
title_text    0
dtype: int64

In [45]:
# Label distribution after cleaning (two classes: TRUE / FAKE).
df['label'].value_counts()

TRUE    586
FAKE    578
Name: label, dtype: int64

In [46]:
# Preview the cleaned frame, including the new `title_text` column.
df.head()

Unnamed: 0,title,text,source,label,title_text
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,FAKE,Due to the recent outbreak for the Coronavirus...
1,misiing,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,FAKE,misiing Hydroxychloroquine has been shown to h...
2,misiing,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,FAKE,misiing Fact: Hydroxychloroquine has been show...
3,misiing,The Corona virus is a man made virus created i...,JoanneWrightForCongress,FAKE,misiing The Corona virus is a man made virus c...
4,misiing,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,FAKE,misiing Doesn’t @BillGates finance research at...


In [47]:
# (rows, columns) after cleaning; one column (`title_text`) was added.
df.shape

(1164, 5)

In [48]:
# Show the combined title + text string for row 10.
df['title_text'][10]

'misiing The new Coronavirus may not show signs of infection for many days. How can you know if you are infected? By the time you have fever and/or cough and go to the hospital, the lung is usually 50% fibrosis. Taiwan experts provide a simple self-check that we can do every morning: Take a deep breath and hold it for more than 10 seconds. If you do this successfully without coughing, without discomfort, stiffness or tightness, there is no fibrosis in the lungs; it basically indicates no infection. In critical times, please self-check every morning in an environment with clean air.'

In [53]:
def preprocessor(text):
    """Normalise a raw document string for vectorisation.

    Steps, in order:
      1. drop anything that looks like an HTML/XML tag (``<...>``);
      2. drop every character that is neither a word character nor whitespace;
      3. turn newlines into single spaces;
      4. lower-case the result.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # A newline separates words just like a space does. Deleting it outright
    # (the previous behaviour) glued the last word of one line to the first
    # word of the next, producing bogus tokens.
    text = re.sub(r'\n', ' ', text)
    return text.lower()
# Normalise every combined document, then show row 10 as a spot check.
df['title_text'] = df['title_text'].map(preprocessor)
df.loc[10, 'title_text']

'misiing the new coronavirus may not show signs of infection for many days how can you know if you are infected by the time you have fever andor cough and go to the hospital the lung is usually 50 fibrosis taiwan experts provide a simple selfcheck that we can do every morning take a deep breath and hold it for more than 10 seconds if you do this successfully without coughing without discomfort stiffness or tightness there is no fibrosis in the lungs it basically indicates no infection in critical times please selfcheck every morning in an environment with clean air'

In [54]:
# One shared stemmer instance, reused for every document.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return list(map(porter.stem, text.split()))

In [57]:
# TF-IDF features over the stemmed tokens of each cleaned document.
# Lower-casing and text clean-up were already applied to `title_text`, so the
# vectorizer's own preprocessing is switched off.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so its IDF statistics include the held-out rows.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    preprocessor=None,
    strip_accents=None,
    lowercase=False,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)
X = tfidf.fit_transform(df['title_text'])
y = df['label'].values

In [58]:
# (documents, vocabulary size) of the TF-IDF matrix. The feature matrix is
# named `X` (upper case); lower-case `x` is not defined anywhere in this
# notebook and raises NameError on a fresh kernel.
X.shape

(1164, 27021)

In [59]:
# Hold out 30% of the rows for evaluation. With shuffle=False the rows are
# split in file order; random_state is passed but only matters when shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False, random_state=0
)

In [60]:
# Logistic regression with the regularisation strength chosen by 5-fold
# cross-validation on the training split (accuracy as the selection metric).
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=0,
    max_iter=300,
)
clf.fit(X_train, y_train)

# Persist the fitted classifier. The context manager guarantees the file is
# closed even if pickling fails part-way.
# NOTE(review): only the classifier is saved here; the fitted `tfidf`
# vectorizer is also required to score new text with the saved model.
with open('fake_news_model.sav', 'wb') as fake_news_model:
    pickle.dump(clf, fake_news_model)

In [61]:
# Reload the persisted classifier and report its accuracy on the held-out split.
# The file is opened in a context manager so the handle is always closed.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
filename = 'fake_news_model.sav'
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)
saved_clf.score(X_test, y_test)

0.92

In [62]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision / recall / F1 on the held-out split.
y_pred = clf.predict(X_test)
print("-Test set results-", classification_report(y_test, y_pred), sep="\n")

-Test set results-
              precision    recall  f1-score   support

        FAKE       0.91      0.88      0.89       132
        TRUE       0.93      0.94      0.94       218

    accuracy                           0.92       350
   macro avg       0.92      0.91      0.91       350
weighted avg       0.92      0.92      0.92       350



In [63]:
# Predict the label for a single held-out document (row 59 of the test matrix).
clf.predict(X_test[59])

array(['FAKE'], dtype=object)

In [65]:
# Classify an ad-hoc piece of text.
# The training documents were passed through `preprocessor` before being
# vectorised (the vectorizer itself does no clean-up), so new text must go
# through the same function or its punctuation/markup produces tokens the
# model has never seen.
test = "Corona virus before it reaches the lungs"
inp = [test]
vect = tfidf.transform([preprocessor(doc) for doc in inp])
prediction = clf.predict(vect)
print(prediction)

['FAKE']
