In [1]:
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import matplotlib_inline
import re
import math
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
from nltk.corpus import stopwords
import os

In [2]:
# Path of the source CSV; only the 'content' and 'type' columns are loaded,
# which are the two columns the later cells read.
filename = '995,000_rows.csv'
df = pd.read_csv(filename, usecols=['content', 'type'])

In [3]:
# --- Pre-compiled patterns used by clean_text --------------------------------
# They are built once at definition time instead of on every call.

# Optional single separator between date fields: whitespace, '/', '.', '-' or '\'.
_DATE_SEP = r'[\s/.\-\\]?'
_YEAR = r'[0-9]{2,4}'
_MONTH = r'[0-1][0-9]'
_DAY = r'[0-3][0-9]'
# Optional trailing time: hh:mm, hh:mm:ss or hh:mm:ss.ffffff. The leading
# whitespace lives INSIDE the optional group so that the space after a bare
# date is not swallowed (which would glue the token to the next word).
_TIME = r'(?:\s?\d{2}:\d{2}(?::\d{2}(?:\.\d{6})?)?)?'


def _compile_date(*parts):
    """Compile a date regex from ordered year/month/day fragments.

    The digit look-arounds stop the pattern from matching a slice of a longer
    digit run and, unlike a consuming ``[^\\d]`` prefix, leave the neighbouring
    characters in place and allow a match at the very start of the text.
    """
    return re.compile(r'(?<!\d)' + _DATE_SEP.join(parts) + _TIME + r'(?!\d)')


# Tried in this order: year-month-day, day-month-year, month-day-year.
_DATE_PATTERNS = (
    _compile_date(_YEAR, _MONTH, _DAY),
    _compile_date(_DAY, _MONTH, _YEAR),
    _compile_date(_MONTH, _DAY, _YEAR),
)
# Integers, decimals and thousands-separated numbers, with an optional
# English ordinal suffix (1st, 22nd, 3rd, 4th).
_NUMBER_RE = re.compile(r'[0-9]+(?:[,.][0-9]+)*(?:(?:st|nd|rd|th)\b)?')
_EMAIL_RE = re.compile(r'[^,\s/]*@[^,\s/]*\.[a-zA-Z]{2,}')
_URL_RE = re.compile(r'(?:https?://|www\.){0,2}[^,\s]*\.[a-zA-Z]{2,}[^,\s]*')
_PUNCT_RE = re.compile(r'[^\w\s]')
_SPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise a text and replace dates, numbers, e-mails and URLs by tokens.

    Steps, in order:
      1. Missing values (``None`` / float NaN) become the empty string.
      2. The text is converted with ``str`` and lower-cased.
      3. Dates (optionally followed by a time) -> ``datetoken``.
      4. Remaining numbers, including decimals and ordinals -> ``numtoken``.
      5. E-mail addresses -> ``emailtoken``. This runs before the URL step
         because the URL pattern also matches every e-mail address.
      6. URLs -> ``urltoken``.
      7. Punctuation is replaced by spaces, then every run of whitespace
         (spaces, tabs, newlines) is collapsed to one space and the result
         is stripped.

    The date patterns are deliberately loose heuristics: any 6-8 digit group
    whose digits fit a year/month/day layout is treated as a date.

    Parameters
    ----------
    text : object
        Value to clean; anything that is not a string is passed through ``str``.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    # A missing cell would otherwise be turned into the literal word "nan"/"none".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ''

    text = str(text).lower()
    for pattern in _DATE_PATTERNS:
        text = pattern.sub('datetoken', text)  # dates must go before plain numbers
    text = _NUMBER_RE.sub('numtoken', text)
    text = _EMAIL_RE.sub('emailtoken', text)
    text = _URL_RE.sub('urltoken', text)
    # Punctuation first, whitespace last, so no double spaces are reintroduced.
    text = _PUNCT_RE.sub(' ', text)
    return _SPACE_RE.sub(' ', text).strip()

def clean_text_series(series):
    """Apply ``clean_text`` to every element of ``series``.

    Returns the new Series produced by ``Series.apply``; the input is not
    modified.
    """
    cleaned = series.apply(clean_text)
    return cleaned

def tokenize_and_stem_series(series):
    """Tokenize each text in ``series``, drop English stop words and stem the rest.

    Every element is split with ``nltk.word_tokenize``; tokens found in NLTK's
    English stop-word list are discarded and the remaining ones are reduced
    with the Porter stemmer. The result is a Series whose elements are lists
    of stems.
    """
    english_stops = set(stopwords.words('english'))
    porter = nltk.PorterStemmer()

    def _stems(document):
        kept = []
        for token in nltk.word_tokenize(document):
            if token not in english_stops:
                kept.append(porter.stem(token))
        return kept

    return series.apply(_stems)

In [4]:
def f_NUM(x):
    """Return how many items of the iterable ``x`` equal ``'numtoken'``."""
    return sum(1 for token in x if token == 'numtoken')
def f_URL(x):
    """Return how many items of the iterable ``x`` equal ``'urltoken'``."""
    return sum(1 for token in x if token == 'urltoken')
    
def f_EMAIL(x):
    """Return how many items of the iterable ``x`` equal ``'emailtoken'``."""
    return sum(1 for token in x if token == 'emailtoken')
def f_DATE(x):
    """Return how many items of the iterable ``x`` equal ``'datetoken'``."""
    return sum(1 for token in x if token == 'datetoken')

def df_chunker(df, chunksize):
    """Clean, tokenize and featurize ``df`` chunk by chunk, caching each chunk on disk.

    The frame is cut into ``ceil(len(df) / chunksize)`` nearly equal row
    ranges (the same boundaries ``np.array_split`` would produce, so chunk
    files written by earlier runs stay valid). For each range, if
    ``chunk_{j}.parquet`` already exists in the working directory it is
    skipped; otherwise the chunk is processed and written to that file.
    Finally all chunk files are read back and concatenated.

    Columns produced for each chunk:
      * ``content``               - list of stemmed tokens
      * ``length``                - number of tokens
      * ``distinct_words``        - sorted list of unique tokens
      * ``length_distinct_words`` - number of unique tokens
      * ``group``                 - 1 if ``type`` is one of fake / satire /
                                    bias / conspiracy / junksci / hate, else 0
      * ``numtokens`` / ``urltokens`` / ``emailtokens`` / ``datetokens``
                                  - occurrences of the respective placeholder

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``content`` and ``type``. It is not
        modified; every chunk is processed on a copy.
    chunksize : int
        Target number of rows per chunk; must be positive.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all chunk files, in chunk order.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    n_rows = len(df)
    # At least one (possibly empty) chunk so an empty frame does not crash.
    n_chunks = max(1, math.ceil(n_rows / chunksize))
    # np.array_split layout: the first `n_larger` chunks hold one extra row.
    base_size, n_larger = divmod(n_rows, n_chunks)
    group_one_types = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate']

    start = 0
    for j in range(1, n_chunks + 1):
        stop = start + base_size + (1 if j <= n_larger else 0)
        chunk_path = f"chunk_{j}.parquet"
        print(f"Processing chunk {j} of {n_chunks}:")

        # Cheap existence check instead of reading the whole file (twice)
        # inside a bare try/except that also hid unrelated errors.
        if os.path.exists(chunk_path):
            print(f"Chunk {j} already processed!", flush=True)
            start = stop
            continue

        # Explicit copy: column assignments below must not touch the caller's frame.
        chunk = df.iloc[start:stop].copy()
        start = stop

        print("Cleaning content...", flush=True)
        chunk['content'] = clean_text_series(chunk['content'])
        print("\n", end="\r", flush=True)

        print("Tokenizing, stemming and removing stopwords from content...", flush=True)
        chunk['content'] = tokenize_and_stem_series(chunk['content'])
        print("\n", end="\r", flush=True)

        print("Calculating features...", end="\r", flush=True)
        chunk['length'] = chunk['content'].apply(len)
        # Parquet has no set type, so unique tokens are stored as a sorted
        # list; sorting also makes the output deterministic across runs.
        chunk['distinct_words'] = chunk['content'].apply(lambda tokens: sorted(set(tokens)))
        chunk['length_distinct_words'] = chunk['distinct_words'].apply(len)
        chunk['group'] = chunk['type'].isin(group_one_types).astype('int64')

        chunk['numtokens'] = chunk['content'].apply(f_NUM)
        chunk['urltokens'] = chunk['content'].apply(f_URL)
        chunk['emailtokens'] = chunk['content'].apply(f_EMAIL)
        chunk['datetokens'] = chunk['content'].apply(f_DATE)
        print("\n", end="\r", flush=True)

        print("Saving chunk...", end="\r", flush=True)
        # Write to a temporary name and rename afterwards, so an interrupted
        # run never leaves a half-written file that looks like a finished chunk.
        tmp_path = chunk_path + ".tmp"
        chunk.to_parquet(tmp_path)
        os.replace(tmp_path, chunk_path)
        print(f"Chunk {j} done!", flush=True)

    return pd.concat([pd.read_parquet(f"chunk_{j}.parquet") for j in range(1, n_chunks + 1)])

In [5]:
# Run the chunked preprocessing with a chunk size of 50000 rows.
# Note: `df` is rebound from the raw frame to the processed result.
df = df_chunker(df, 50000)

  return bound(*args, **kwds)


Processing chunk 1 of 20:
Chunk 1 already processed!
Processing chunk 2 of 20:
Chunk 2 already processed!
Processing chunk 3 of 20:
Chunk 3 already processed!
Processing chunk 4 of 20:
Chunk 4 already processed!
Processing chunk 5 of 20:
Chunk 5 already processed!
Processing chunk 6 of 20:
Chunk 6 already processed!
Processing chunk 7 of 20:
Chunk 7 already processed!
Processing chunk 8 of 20:
Chunk 8 already processed!
Processing chunk 9 of 20:
Chunk 9 already processed!
Processing chunk 10 of 20:
Chunk 10 already processed!
Processing chunk 11 of 20:
Chunk 11 already processed!
Processing chunk 12 of 20:
Chunk 12 already processed!
Processing chunk 13 of 20:
Chunk 13 already processed!
Processing chunk 14 of 20:
Chunk 14 already processed!
Processing chunk 15 of 20:
Chunk 15 already processed!
Processing chunk 16 of 20:
Chunk 16 already processed!
Processing chunk 17 of 20:
Chunk 17 already processed!
Processing chunk 18 of 20:
Chunk 18 already processed!
Processing chunk 19 of 20:
C