# Fake News Preprocessing
This notebook details the data cleaning and tokenization needed for our models.

The original dataset is available from the [FakeNewsCorpus repository](https://github.com/several27/FakeNewsCorpus). It is about 9.5 GB zipped and 30 GB unzipped.

## Imports

In [65]:
import pandas as pd
import sklearn as sk
import numpy as np
import gzip
import shutil
from segtok import tokenizer
from collections import Counter
import json
import os

## Cleaning


In [41]:
# Where the raw corpus lives and where the filtered headline CSV is written.
old_dataset = "data/raw/news_cleaned_2018_02_13.csv"
new_dataset = "data/clean/fake_reliable_news_headlines.csv"

# Columns kept from the raw corpus, mapped to the dtype used when re-reading them.
columns = dict(id=int, type=str, title=str)
# Only rows whose `type` is one of these labels are kept.
allowable_types = ['fake', 'reliable']

# Number of rows pandas reads per chunk while streaming the raw CSV.
chunksize = 500

In [12]:
# Create the new dataset as a CSV by streaming the raw corpus in chunks,
# keeping only the selected columns and the allowed article types.
selected_columns = list(columns)  # a dict is not a valid .loc indexer on recent pandas

# Make sure the output folder exists before opening the file for writing.
output_dir = os.path.dirname(new_dataset)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# newline='' lets pandas control the line endings; otherwise Windows text mode
# would turn each "\r\n" into "\r\r\n" and produce blank rows.
with open(old_dataset, 'r') as f_old, open(new_dataset, 'w', newline='') as f_new:
    # Header row, written once through pandas so it uses the same line ending as the data.
    pd.DataFrame(columns=selected_columns).to_csv(f_new, index=False)
    # NOTE: `error_bad_lines` was removed in pandas 2.0; on pandas >= 1.3 the
    # equivalent is `on_bad_lines='skip'`.
    for df in pd.read_csv(f_old, chunksize=chunksize, error_bad_lines=False):
        df = df.loc[:, selected_columns]
        df = df[df['type'].isin(allowable_types)]
        df.to_csv(f_new, header=False, index=False)

In [5]:
# Optional: also store a gzip-compressed copy of the CSV (this takes a while).
gzip_target = new_dataset + '.gz'
with open(new_dataset, 'rb') as f_in:
    with gzip.open(gzip_target, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

## Preprocessing

In [43]:
# Load the filtered headline CSV, using the dtypes declared in `columns`.
df_dataset = pd.read_csv(new_dataset, dtype=columns)
# Display the first rows as the cell output.
df_dataset.head()

Unnamed: 0,id,type,title
0,34,fake,Surprise: Socialist Hotbed Of Venezuela Has Lo...
1,35,fake,Water Cooler 1/25/18 Open Thread; Fake News ? ...
2,36,fake,Veteran Commentator Calls Out the Growing “Eth...
3,37,fake,"Lost Words, Hidden Words, Otters, Banks and Books"
4,38,fake,Red Alert: Bond Yields Are SCREAMING “Inflatio...


In [45]:
# Split the headlines by label.
is_fake = df_dataset['type'] == 'fake'
is_reliable = df_dataset['type'] == 'reliable'
df_fake = df_dataset[is_fake]
df_reliable = df_dataset[is_reliable]
# Report how many rows each class has, then show the column dtypes.
print(df_fake.shape, df_reliable.shape)
df_fake.dtypes

(894746, 3) (1913222, 3)


id        int64
type     object
title    object
dtype: object

In [122]:
# Draw a balanced sample: the same number of fake and reliable headlines.
sample_size = 100000
# Fixed seed so the sample (and everything derived from it: vocabulary,
# UNK rate, saved dataset) is reproducible across kernel restarts.
random_seed = 42
rng = np.random.RandomState(random_seed)
df_dataset = pd.concat([
    df_fake.sample(sample_size, random_state=rng),
    df_reliable.sample(sample_size, random_state=rng),
])
# Shuffle so the two classes are interleaved, then renumber the rows from 0.
df_dataset = sk.utils.shuffle(df_dataset, random_state=rng).reset_index(drop=True)
df_dataset.head()

Unnamed: 0,id,type,title
0,8848385,reliable,"Dude, You're Getting A Loan"
1,9381772,reliable,Popularity of Juniors Tournament Crosses Border
2,7995139,reliable,"After IS kidnap and oppression, Iraqi girls ea..."
3,3171691,fake,Is The Internet Private Property Now?
4,3147319,fake,DOJ Broke It’s Own rules In AP Investigation


In [123]:
# Tokenize every lower-cased title and tally token frequencies across the sample.
df_dataset['tokenized'] = df_dataset['title'].map(
    lambda title: tokenizer.word_tokenizer(str(title).lower())
)
word_counts = Counter()
for tokens in df_dataset['tokenized']:
    word_counts.update(tokens)
print(word_counts.most_common(30))

[(',', 56353), ('the', 49546), (':', 41714), ('to', 34696), ('in', 30599), ('of', 29100), ('a', 25239), ('and', 22009), ('for', 20236), ('on', 15788), ('is', 11447), ('?', 10183), ('–', 9756), ('paid', 9026), ('notice', 8970), ('with', 8708), ('new', 8301), ('deaths', 8254), ('at', 7973), ('(', 6765), (')', 6703), ('‘', 6183), ('by', 6098), ('-', 5997), ('!', 5987), ('.', 5820), ("'", 5792), ('from', 5728), (';', 5119), ('as', 5063)]


In [50]:
# Creating the vocab: the special tokens come first, followed by the most
# frequent corpus tokens, for `vocab_size` entries at most.
vocab_size = 20000
special_words = ["<START>", "UNK", "PAD"]
n_corpus_words = vocab_size - len(special_words)
vocabulary = list(special_words)
vocabulary.extend(word for word, _ in word_counts.most_common(n_corpus_words))
# Map each word to its position in the vocabulary.
w2i = dict(zip(vocabulary, range(len(vocabulary))))

In [84]:
# Numerizing and padding
input_length = 20  # every sequence is cut or padded to this many ids
# Look up the ids of the special tokens once.
unkI = w2i['UNK']
padI = w2i['PAD']
startI = w2i['<START>']

def numerize_sequence(tokenized):
    """Translate a list of tokens into vocabulary ids.

    Tokens missing from `w2i` are replaced by the UNK id (`unkI`).
    """
    ids = []
    for token in tokenized:
        ids.append(w2i.get(token, unkI))
    return ids

def pad_sequence(numerized, pad_index, to_length):
    """Truncate or right-pad a list of ids to exactly `to_length` entries.

    Args:
        numerized: list of token ids.
        pad_index: id appended to fill sequences shorter than `to_length`.
        to_length: length of the returned sequence.

    Returns:
        (padded, mask): `padded` is the fixed-length id list; `mask` is a list of
        booleans of the same length, True for positions holding an original id
        and False for positions filled with padding.
    """
    kept = list(numerized[:to_length])
    n_padding = to_length - len(kept)
    padded = kept + [pad_index] * n_padding
    # Build the mask from positions rather than by comparing values with
    # pad_index, so a genuine id that happens to equal pad_index stays unmasked.
    mask = [True] * len(kept) + [False] * n_padding
    return padded, mask

In [128]:
# Convert each token list to ids, then cut/pad it to `input_length` and record its mask.
padded_with_masks = [
    pad_sequence(numerize_sequence(tokens), padI, input_length)
    for tokens in df_dataset['tokenized']
]
df_dataset['numerized'] = pd.Series(
    [ids for ids, _ in padded_with_masks], index=df_dataset.index, dtype=object
)
df_dataset['mask'] = pd.Series(
    [mask for _, mask in padded_with_masks], index=df_dataset.index, dtype=object
)

In [137]:
# Compute fraction of words that are UNK (padding positions are ignored):
word_counters = Counter()
for numerized_row in df_dataset['numerized']:
    word_counters.update(idx for idx in numerized_row if idx != padI)
unk_fraction = float(word_counters[unkI]) / sum(word_counters.values())
print("Fraction of UNK words:", unk_fraction)

Fraction of UNK words: 0.08393581712373868


In [138]:
# Preview the first rows of the frame as the cell output.
df_dataset.head()

Unnamed: 0,id,type,title,tokenized,numerized,mask
0,8848385,reliable,"Dude, You're Getting A Loan","[dude, ,, you're, getting, a, loan]","[6941, 3, 3649, 548, 9, 2020, 2, 2, 2, 2, 2, 2...","[True, True, True, True, True, True, False, Fa..."
1,9381772,reliable,Popularity of Juniors Tournament Crosses Border,"[popularity, of, juniors, tournament, crosses,...","[9743, 8, 1, 4547, 7050, 737, 2, 2, 2, 2, 2, 2...","[True, True, True, True, True, True, False, Fa..."
2,7995139,reliable,"After IS kidnap and oppression, Iraqi girls ea...","[after, is, kidnap, and, oppression, ,, iraqi,...","[45, 13, 9351, 10, 1, 3, 1054, 1256, 7027, 6, ...","[True, True, True, True, True, True, True, Tru..."
3,3171691,fake,Is The Internet Private Property Now?,"[is, the, internet, private, property, now, ?]","[13, 4, 141, 665, 1328, 73, 14, 2, 2, 2, 2, 2,...","[True, True, True, True, True, True, True, Fal..."
4,3147319,fake,DOJ Broke It’s Own rules In AP Investigation,"[doj, broke, it’s, own, rules, in, ap, investi...","[3685, 4277, 191, 355, 435, 7, 2880, 1036, 2, ...","[True, True, True, True, True, True, True, Tru..."


In [140]:
# Persist the processed frame as gzip-compressed JSON.
processed_dataset = "data/processed/fake_reliable_news_headlines.json.gz"
# to_json does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(processed_dataset), exist_ok=True)
df_dataset.to_json(processed_dataset, compression='gzip')