In [29]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV
import re
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [30]:
df=pd.read_csv("covid_fake.csv")

In [31]:
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [32]:
df.shape

(1164, 4)

In [33]:
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
Name: label, dtype: int64

In [34]:
df.loc[5:15]

Unnamed: 0,title,text,source,label
5,CORONA UNMASKED: Chinese Intelligence Officer ...,,,
6,,Urgent: Health Bulletin to the Public. Ministr...,Ministry of Health,Fake
7,,"Pls tell ur families, relatives and friendsMOH...",NWLLAB,Fake
8,,SERIOUS EXCELLENT ADVICE by Japanese doctors t...,Japanese doctors treating COVID-19 cases,Fake
9,Basic protective measures against the new coro...,Stay aware of the latest information on the CO...,https://www.who.int/emergencies/diseases/novel...,TRUE
10,,The new Coronavirus may not show signs of infe...,Taiwan Experts,Fake
11,,A vaccine meant for cattle can be used to figh...,facebook,Fake
12,,Using a hair dryer to breathe in hot air can c...,Youtube,Fake
13,,Corona virus before it reaches the lungs it re...,twitter,Fake
14,Exposing yourself to the sun or to temperature...,"You can catch COVID-19, no matter how sunny or...",https://www.who.int/emergencies/diseases/novel...,TRUE


In [35]:
df.isnull().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [36]:
df.loc[df['label']=="Fake",['label']]=="FAKE"
df.loc[df['label']=="fake",['label']]=="FAKE"
df.loc[df['source']=="facebook",['label']]=="Facebook"
df.text.fillna(df.title,inplace=True)
df.loc[5]['label']='FAKE'
df.loc[15]['label']='TRUE'
df.loc[43]['label']='FAKE'
df.loc[131]['label']='TRUE'
df.loc[242]['label']='FAKE'
df.title.fillna('missing',inplace=True)
df.source.fillna('missing',inplace=True)
df['title_text']=df['title']+" "+df['text']

In [37]:
df['label'].value_counts()

TRUE    586
Fake    345
fake    230
FAKE      3
Name: label, dtype: int64

In [38]:
df.head()

Unnamed: 0,title,text,source,label,title_text
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake,Due to the recent outbreak for the Coronavirus...
1,missing,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake,missing Hydroxychloroquine has been shown to h...
2,missing,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake,missing Fact: Hydroxychloroquine has been show...
3,missing,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake,missing The Corona virus is a man made virus c...
4,missing,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake,missing Doesn’t @BillGates finance research at...


In [39]:
df.shape

(1164, 5)

In [40]:
df['title_text'][3]

'missing The Corona virus is a man made virus created in a Wuhan laboratory. Ask @BillGates who financed it.'

In [41]:
def preprocessor(text):
    text=re.sub('<[^>]*>','',text)
    text=re.sub(r'[^\w\s]','',text)
    text=re.sub(r'[\n]','',text)
    text=text.lower()
    return text
df['title_text']=df['title_text'].apply(preprocessor)
df['title_text'][3]

'missing the corona virus is a man made virus created in a wuhan laboratory ask billgates who financed it'

In [42]:
porter=PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [43]:
tfidf=TfidfVectorizer(strip_accents=None,
                     lowercase=False,
                     preprocessor=None,
                     tokenizer=tokenizer_porter,
                     use_idf=True,
                     norm='l2',
                     smooth_idf=True)
X=tfidf.fit_transform(df['title_text'])
y=df.label.values

In [44]:
X.shape

(1164, 27020)

In [46]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,\
                                              test_size=0.3, shuffle=False)


In [47]:
clf = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=0, 
                           n_jobs=-1,
                           verbose=0,
                           max_iter=300)

clf.fit(X_train, y_train)

fake_news_model = open('fake_news_model.sav', 'wb')
pickle.dump(clf, fake_news_model)
fake_news_model.close()


In [48]:
filename='fake_news_model.sav'
saved_clf=pickle.load(open(filename,'rb'))
saved_clf.score(X_test,y_test)

0.7514285714285714

In [49]:
from sklearn.metrics import classification_report, accuracy_score
y_pred=clf.predict(X_test)
print("___Test Set Results___")
print(classification_report(y_test,y_pred))

___Test Set Results___
              precision    recall  f1-score   support

        Fake       0.89      0.40      0.55       117
        TRUE       0.87      0.98      0.92       218
        fake       0.06      0.20      0.09        15

    accuracy                           0.75       350
   macro avg       0.61      0.53      0.52       350
weighted avg       0.84      0.75      0.76       350



In [50]:
clf.predict(X_test[59])

array(['fake'], dtype=object)

In [51]:
clf.predict(X_test[1])

array(['TRUE'], dtype=object)

In [52]:
test="Corona virus before it reaches the lungs"
inp=[test]
vect=tfidf.transform(inp)
prediction=clf.predict(vect)
print(prediction)

['fake']
