In [14]:
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
from nltk.corpus import stopwords
from cleantext import clean
import os

In [15]:
# Input CSV to preprocess.
filename = '995,000_rows.csv'
# Alternative input: uncomment to run against the sample file instead.
#filename = 'news_sample.csv'
# Number of CSV rows handed to the processing loop per chunk.
chunk_size = 50000
# Because `chunksize` is passed, `df` is NOT a DataFrame: it is an iterator that
# yields one DataFrame of up to `chunk_size` rows at a time, restricted to the
# columns listed in `usecols`.
# NOTE(review): the reader is consumed as it is iterated, so this cell has to be
# re-run before the processing loop can be run a second time.
df = pd.read_csv(filename, chunksize=chunk_size, usecols=['content', 'type', 'title', 'domain', 'authors', 'meta_keywords'])
# Directory that receives one parquet file per processed chunk.
output_dir = 'data/chunks'
# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(output_dir, exist_ok=True)

In [16]:
# Regex fragments shared by every date pattern.
# A date may start at the beginning of the text or after any non-digit. The
# lookbehind checks this WITHOUT consuming the preceding character, so the
# replacement token is not glued onto the previous word.
_DATE_NOT_AFTER_DIGIT = r'(?<![\d])'
# Optional single separator between date fields: whitespace, / . - or backslash.
_DATE_SEP = r'[\s\/\.\-\\]?'
_DATE_YEAR = r'([0-9]{2,4})'
_DATE_MONTH = r'([0-1]{1}[0-9]{1})'
_DATE_DAY = r'([0-3]{1}[0-9]{1})'
# Optional "HH:MM:SS.ffffff" suffix (still capture group 4). The whitespace in
# front of it belongs to the optional group, so a date that is NOT followed by a
# time no longer swallows the space that separates it from the next word.
_DATE_TIME_SUFFIX = r'(?:\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6}))?'


def _date_pattern(*fields):
    """Compile a date regex whose fields appear in the given order.

    Args:
        fields: Regex fragments (each one capture group) for the date fields,
            in the order they must appear in the text.

    Returns:
        A compiled pattern with four capture groups: the three date fields in
        the given order, then the optional time-of-day suffix.
    """
    return re.compile(
        _DATE_NOT_AFTER_DIGIT + _DATE_SEP.join(fields) + _DATE_TIME_SUFFIX,
        re.MULTILINE,
    )


_YEAR_MM_DD = _date_pattern(_DATE_YEAR, _DATE_MONTH, _DATE_DAY)
_DD_MM_YEAR = _date_pattern(_DATE_DAY, _DATE_MONTH, _DATE_YEAR)
_MM_DD_YEAR = _date_pattern(_DATE_MONTH, _DATE_DAY, _DATE_YEAR)

# Date patterns keyed by field order. Every pattern already accepts an optional
# time suffix, so the "*_time" keys are aliases of the plain ones; they are kept
# (in the original order) so existing lookups and iteration keep working.
date_exp = {
    "year_mm_dd": _YEAR_MM_DD,
    "dd_mm_year": _DD_MM_YEAR,
    "mm_dd_year": _MM_DD_YEAR,
    "year_mm_dd_time": _YEAR_MM_DD,
    "dd_mm_year_time": _DD_MM_YEAR,
    "mm_dd_year_time": _MM_DD_YEAR,
}

In [17]:
# NLTK's English stop-word list, held as a set; the chunk loop tests every
# token for membership in it before stemming.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by all chunks.
stemmer = nltk.PorterStemmer()

def token_count(x, token):
    """Count the items of ``x`` that compare equal to ``token``.

    Args:
        x: Iterable of items (the notebook passes a list of stemmed tokens).
        token: Value to look for.

    Returns:
        The number of items in ``x`` for which ``item == token`` holds.
    """
    return sum(1 for item in x if item == token)

# Value found in the `type` column of one malformed row (it looks like a
# timestamp rather than an article type); that row is dropped.
MALFORMED_TYPE_VALUE = '2018-02-10 13:43:39.521661'

# Article types that are assigned to group 1; every other type gets group 0.
GROUP_ONE_TYPES = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate']


def date_replace(x):
    """Replace every date matched by any pattern in `date_exp` with 'datetoken'."""
    for exp in date_exp.values():
        x = exp.sub('datetoken', x)
    return x


def _clean_text(text):
    """Normalise one string with `clean`, using the settings shared by content and title.

    URLs, e-mail addresses, numbers and currency symbols are replaced by
    placeholder tokens; punctuation is removed; the text is lower-cased.
    """
    return clean(text,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_numbers=True,
                 no_punct=True,
                 no_currency_symbols=True,
                 normalize_whitespace=True,
                 replace_with_currency_symbol="currencytoken",
                 replace_with_url="urltoken",
                 replace_with_email="emailtoken",
                 replace_with_number="numtoken",
                 replace_with_punct="",
                 lang="en")


def _tokenize_and_stem(text):
    """Tokenize `text`, drop stop words, and return the stemmed remaining tokens."""
    return [stemmer.stem(token) for token in word_tokenize(text) if token not in stop_words]


def _chunk_already_processed(path):
    """Return True if `path` is an existing, readable parquet file.

    Only the parquet footer is read, so the check stays cheap for large chunks.
    A missing or unreadable (e.g. partially written) file counts as "not
    processed", so the chunk is regenerated.
    """
    if not os.path.exists(path):
        return False
    try:
        pq.read_metadata(path)
    except Exception:
        # Deliberately best-effort: any read failure means "process it again".
        # Unlike a bare `except:`, this lets KeyboardInterrupt propagate.
        return False
    return True


def process_chunk(chunk):
    """Clean one raw CSV chunk and derive its feature columns.

    Args:
        chunk: DataFrame with at least the columns 'content', 'type' and 'title'.

    Returns:
        A new DataFrame (the input is not modified) in which 'content' and
        'title' are lists of stemmed tokens, with the added columns
        'num_count', 'url_count', 'email_count', 'date_count', 'length',
        'distinct_words', 'length_distinct_words' and 'group'. Rows without
        content or type, the known malformed row, and rows whose content has
        no tokens left after cleaning are removed.
    """
    #Drop empty rows
    chunk = chunk.dropna(subset=['content', 'type'])

    #Drop that one weird row. The explicit copy makes the column assignments
    #below operate on an independent frame instead of a slice of the input.
    chunk = chunk[chunk['type'] != MALFORMED_TYPE_VALUE].copy()

    #Processing content column. Cleaning, tokenizing and stemming.
    chunk['content'] = chunk['content'].map(date_replace).map(_clean_text)

    #Only 'content' and 'type' are null-filtered above, so 'title' may still be
    #missing. NOTE(review): a missing title is mapped to an empty string rather
    #than being handed to clean(), which presumably would stringify it.
    chunk['title'] = chunk['title'].map(lambda t: '' if pd.isna(t) else _clean_text(t))

    chunk['content'] = chunk['content'].map(_tokenize_and_stem)
    chunk['title'] = chunk['title'].map(_tokenize_and_stem)

    #Calculating number of numtokens, urltokens, emailtokens and datetokens
    chunk['num_count'] = chunk['content'].map(lambda x: token_count(x, 'numtoken'))
    chunk['url_count'] = chunk['content'].map(lambda x: token_count(x, 'urltoken'))
    chunk['email_count'] = chunk['content'].map(lambda x: token_count(x, 'emailtoken'))
    chunk['date_count'] = chunk['content'].map(lambda x: token_count(x, 'datetoken'))

    #Calculating article length, distinct words, and number of distinct words.
    chunk['length'] = chunk['content'].apply(len)
    chunk['distinct_words'] = chunk['content'].apply(set)
    chunk['length_distinct_words'] = chunk['distinct_words'].apply(len)

    #Dropping rows with length 0 (cleaning)
    chunk = chunk[chunk['length'] != 0].copy()

    #Assigning group based on article type
    chunk['group'] = chunk['type'].isin(GROUP_ONE_TYPES).astype('int64')

    return chunk


# 1-based chunk number; it is part of the output file name. After the loop it
# equals the number of chunks seen plus one.
i = 1

for chunk in df:
    print(f"Processing chunk {i}...", end = '\r', flush=True)
    chunk_path = f"{output_dir}/chunk_{i}.parquet"

    #Skip chunks whose parquet file already exists, so an interrupted run can resume
    if _chunk_already_processed(chunk_path):
        print(f"Chunk {i} already processed!", flush=True)
    else:
        process_chunk(chunk).to_parquet(chunk_path)
    i += 1


Processing chunk 20...