Import

In [2]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 9.9 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 15.9 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21


In [19]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rcParams
plt.rcParams['figure.figsize'] = [10,10]
import seaborn as sns
sns.set_theme(style="darkgrid")
from wordcloud import WordCloud

import nltk
from nltk import sent_tokenize
''' Required for Google Colab'''
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
''' Required for Google Colab'''
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
from nltk.tokenize import word_tokenize

import contractions
import re
import itertools
import datetime
import time
from collections import Counter
import string

import warnings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Original version

In [4]:
train_df = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Train.csv', header=0)
val_df = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Val.csv', header=0)

In [5]:
def preprocess_text(x):
  cleaned_text = re.sub(r'[^a-zA-Z\d\s\']+', '', x)
  word_list = []
  for each_word in cleaned_text.split(' '):
    try:
      word_list.append(contractions.fix(each_word).lower())
    except:
      print(x)
  return " ".join(word_list)

In [6]:
text_cols = ['tweet']

In [7]:
%%time
for col in text_cols:
  print("Processing column: {}".format(col))
  train_df[col] = train_df[col].apply(lambda x: preprocess_text(x))
  val_df[col] = val_df[col].apply(lambda x: preprocess_text(x)) 

Processing column: tweet
CPU times: user 1.55 s, sys: 34 ms, total: 1.58 s
Wall time: 2.09 s


In [8]:
%%time
for col in text_cols:
  print("Processing column: {}".format(col))
  train_df[col] = train_df[col].apply(word_tokenize)
  val_df[col] = val_df[col].apply(word_tokenize)

Processing column: tweet
CPU times: user 2.81 s, sys: 45.3 ms, total: 2.85 s
Wall time: 4.83 s


In [9]:
%%time
for col in text_cols:
  print("Processing column: {}".format(col))
  train_df[col] = train_df[col].apply(lambda x: [each_word for each_word in x if each_word not in stopwords])
  val_df[col] = val_df[col].apply(lambda x: [each_word for each_word in x if each_word not in stopwords])

Processing column: tweet
CPU times: user 965 ms, sys: 8.34 ms, total: 973 ms
Wall time: 1.43 s


In [39]:
train_df.head()

Unnamed: 0,id,tweet,label
0,1,"[cdc, currently, reports, 99031, deaths, gener...",real
1,2,"[states, reported, 1121, deaths, small, rise, ...",real
2,3,"[politically, correct, woman, almost, uses, pa...",fake
3,4,"[indiafightscorona, 1524, covid, testing, labo...",real
4,5,"[populous, states, generate, large, case, coun...",real


In [10]:
all_tokenized_gen = [a for b in train_df[train_df['label']=="real"]['tweet'].tolist() for a in b]
all_tokenized_fake = [a for b in train_df[train_df['label']=="fake"]['tweet'].tolist() for a in b]

In [11]:
def get_post_tags_list(tokenized_articles):
  all_pos_tags = []
  for word in tokenized_articles:
    pos_tag = nltk.pos_tag([word])[0][1]
    all_pos_tags.append(pos_tag)
  return all_pos_tags

In [12]:
%%time
all_pos_tagged_word_gen = get_post_tags_list(all_tokenized_gen)
all_pos_tagged_word_fake = get_post_tags_list(all_tokenized_fake)

CPU times: user 15.5 s, sys: 875 ms, total: 16.4 s
Wall time: 17 s


In [13]:
all_pos_tagged_word_gen[:5], all_pos_tagged_word_fake[:5]

(['NN', 'RB', 'NNS', 'CD', 'NNS'], ['RB', 'NN', 'NN', 'RB', 'NNS'])

In [14]:
gen_pos_df = pd.DataFrame(dict(Counter(all_pos_tagged_word_gen)).items(), columns=['Pos_tag', 'Genuine News'])
fake_pos_df = pd.DataFrame(dict(Counter(all_pos_tagged_word_fake)).items(), columns=['Pos_tag', 'Fake News'])

In [15]:
pos_df = gen_pos_df.merge(fake_pos_df, on='Pos_tag')

In [16]:
pos_df['Genuine News'] = pos_df['Genuine News'] * 100 / pos_df['Genuine News'].sum()
pos_df['Fake News'] = pos_df['Fake News'] * 100 / pos_df['Fake News'].sum()
pos_df.head()

Unnamed: 0,Pos_tag,Genuine News,Fake News
0,NN,47.93755,58.329998
1,RB,2.826293,2.718991
2,NNS,16.485008,13.858612
3,CD,9.609688,3.234539
4,JJ,8.321296,7.078792


### Version based on our pre-processing

In [24]:
train_df2 = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/blob/main/data/original-data/Constraint_Train.csv', header=0)
val_df2 = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/original-data/Constraint_Val.csv', header=0)

In [25]:
stopwords = nltk.corpus.stopwords.words('english')

def cleantext(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    tx = text.replace('&amp',' ')
    text = re.sub(r"[^a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stopwords]
    #text = " ".join(text)
    return text

In [27]:
train_df2['tweet'] = train_df2['tweet'].map(lambda x: cleantext(x))
val_df2['tweet'] = val_df2['tweet'].map(lambda x: cleantext(x))

In [31]:
train_df2.head()

Unnamed: 0,id,tweet,label
0,1,"[cdc, currently, reports, deaths, general, dis...",real
1,2,"[states, reported, deaths, small, rise, last, ...",real
2,3,"[politically, correct, woman, almost, uses, pa...",fake
3,4,"[indiafightscorona, covid, testing, laboratori...",real
4,5,"[populous, states, generate, large, case, coun...",real


In [29]:
all_tokenized_gen2 = [a for b in train_df2[train_df2['label']=="real"]['tweet'].tolist() for a in b]
all_tokenized_fake2 = [a for b in train_df2[train_df2['label']=="fake"]['tweet'].tolist() for a in b]

In [None]:
all_tokenized_gen2

In [32]:
def get_post_tags_list(tokenized_articles):
  all_pos_tags = []
  for word in tokenized_articles:
    pos_tag = nltk.pos_tag([word])[0][1]
    all_pos_tags.append(pos_tag)
  return all_pos_tags

In [34]:
all_pos_tagged_word_gen2 = get_post_tags_list(all_tokenized_gen2)
all_pos_tagged_word_fake2 = get_post_tags_list(all_tokenized_fake2)

In [62]:
all_pos_tagged_word_gen2[:5], all_pos_tagged_word_fake2[:5]

(['NN', 'RB', 'NNS', 'NNS', 'JJ'], ['RB', 'NN', 'NN', 'RB', 'NNS'])

In [63]:
gen_pos_df2 = pd.DataFrame(dict(Counter(all_pos_tagged_word_gen2)).items(), columns=['Pos_tag', 'Genuine News'])
fake_pos_df2 = pd.DataFrame(dict(Counter(all_pos_tagged_word_fake2)).items(), columns=['Pos_tag', 'Fake News'])

In [64]:
pos_df2 = gen_pos_df2.merge(fake_pos_df2, on='Pos_tag')

In [65]:
pos_df2['Genuine News'] = pos_df2['Genuine News'] * 100 / pos_df2['Genuine News'].sum()
pos_df2['Fake News'] = pos_df2['Fake News'] * 100 / pos_df2['Fake News'].sum()
pos_df2.head()

Unnamed: 0,Pos_tag,Genuine News,Fake News
0,NN,51.321888,59.684574
1,RB,3.201906,2.86348
2,NNS,18.542333,14.413504
3,JJ,9.462106,7.59241
4,VBD,1.39043,1.444061


In [45]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [70]:
nltk.help.upenn_tagset("VBD")

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...
