## Download CSV file

In [1]:
import requests
content = requests.get("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")
with open("news_sample.csv", "w", encoding="utf-8") as f:
    f.write(content.text)# Read file:
news_sample = open("news_sample.csv", "r", encoding="utf-8")
text = news_sample.read()

Manually inspect file contents:

In [2]:
# print(text)

## Cleaning data:

In [3]:
from cleantext import clean
cleantext = clean(  text,
                    lower=True,
                    no_line_breaks=True,
                    no_urls=True,
                    no_emails=True,
                    no_numbers=True,
                    replace_with_url="URL",replace_with_email="EMAIL",
                    replace_with_number="NUM", #Dates are replaced as numbers.
                )

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [4]:
#print(cleantext)

## Tokenizing:

In [5]:
import nltk
#nltk.download('all')

tokens = nltk.word_tokenize(cleantext)
tagged = nltk.pos_tag(tokens) #No idea what this represent
#print(tokens)
#print(tagged[:100])

## Filtering

### Creating vocabulary (list without duplicates) and removing stopwords:

In [6]:
from nltk.corpus import stopwords

ordered_tokens = set()
vocab = []

for word in tokens:
    if word not in ordered_tokens:
        ordered_tokens.add(word)
        vocab.append(word)
vocab_no_stopwords =[]
for word in vocab:
    if word not in stopwords.words('english'):
        vocab_no_stopwords.append(word)

#print(vocab[:100])
#print(vocab_no_stopwords[:100])

Length of vocab before and after removing stopwords:

In [7]:
print(len(vocab))
print(len(vocab_no_stopwords))

17446
17314


Reduction rate of vocabulary size after removing stopwords:

In [8]:
print((len(vocab_no_stopwords)-len(vocab))/len(vocab)*100)

-0.7566204287515763


### Removing word variations with stemming

In [9]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stemmed_vocab = []

for word in vocab_no_stopwords:
    if word not in stemmed_vocab: stemmed_vocab.append(stemmer.stem(word))
#print(stemmed_vocab)

Length of vocab before and after stemming:

In [10]:
print(len(vocab_no_stopwords))
print(len(stemmed_vocab))

17314
16521


Reduction rate of vocabulary size after stemming words:

In [11]:
print((len(stemmed_vocab)-len(vocab_no_stopwords))/len(vocab_no_stopwords)*100)

-4.580108582649879
