In [1]:
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
from nltk.corpus import stopwords
from cleantext import clean
import os

In [2]:
# Input CSV with the raw articles, and the number of rows read per chunk.
filename = '995,000_rows.csv'
# Alternative input (presumably a smaller sample for quick runs - confirm):
#filename = 'news_sample.csv'
chunk_size = 50000

# Only the columns used by the preprocessing below are loaded.
# (The original list named 'authors' twice; each column is listed once here.)
columns_to_load = ['content', 'type', 'title', 'domain', 'authors', 'meta_keywords']

# With `chunksize` set, read_csv returns an iterator of DataFrames rather than
# one frame. It can be consumed only once, so this cell has to be re-run
# before the chunks are iterated a second time.
df = pd.read_csv(filename, chunksize=chunk_size, usecols=columns_to_load)

# Directory that receives one parquet file per processed chunk.
output_dir = 'data/chunks'
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Porter stemmer instance used by process_data.
stemmer = nltk.PorterStemmer()

# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def clean_data(x):
    """Normalise one raw text value with ``cleantext.clean``.

    The value is lower-cased, line breaks and punctuation are removed,
    whitespace is normalised, and URLs, e-mail addresses, numbers and
    currency symbols are replaced by the placeholder tokens configured below
    ("urltoken", "emailtoken", "numtoken", "currencytoken").

    Parameters
    ----------
    x : object
        The cell value to clean. Non-string values are converted with
        ``str``; missing values (``None``, ``NaN``, ``pd.NA``) are treated
        as empty text.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value.
    """
    # A missing cell must not go through str(): that would yield the literal
    # words "nan" / "none", which then survive cleaning and show up as real
    # tokens in the title / authors / meta_keywords columns.
    if x is None or x is pd.NA or (isinstance(x, float) and np.isnan(x)):
        return ""

    return clean(str(x),
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_numbers=True,
                 no_punct=True,
                 no_currency_symbols=True,
                 normalize_whitespace=True,
                 replace_with_currency_symbol="currencytoken",
                 replace_with_url="urltoken",
                 replace_with_email="emailtoken",
                 replace_with_number="numtoken",
                 replace_with_punct="",
                 lang="en")

def process_data(x):
    """Tokenise `x`, drop stop words and return the stems of the remaining tokens.

    Tokens come from ``word_tokenize``; any token found in ``stop_words`` is
    skipped, and every other token is passed through ``stemmer.stem``.
    The result is a list of stems in their original order.
    """
    stems = []
    for word in word_tokenize(x):
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return stems

# Article types labelled as group 1; every other type is labelled group 0.
FAKE_TYPES = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate']


def _chunk_already_written(path):
    """Return True when `path` is a readable parquet file.

    Only the parquet footer is read, so the check stays cheap while still
    rejecting missing, empty or truncated files (those get reprocessed).
    """
    try:
        pq.read_metadata(path)
    except (OSError, pa.ArrowInvalid):
        # OSError covers a missing file; ArrowInvalid covers a corrupt one.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return False
    return True


def _preprocess_chunk(chunk):
    """Clean, tokenise and label one chunk of articles; returns a new DataFrame."""
    # Drop rows with no content or type
    chunk = chunk.dropna(subset=['content', 'type'])

    # Drop that one weird row (its 'type' holds a timestamp instead of a label).
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise SettingWithCopyWarning.
    chunk = chunk[chunk['type'] != '2018-02-10 13:43:39.521661'].copy()

    # Cleaning data
    for column in ('content', 'title', 'authors', 'meta_keywords'):
        chunk[column] = chunk[column].map(clean_data)

    # Tokenizing and stemming data
    for column in ('content', 'title', 'meta_keywords'):
        chunk[column] = chunk[column].map(process_data)

    # Authors are tokenized but not stemmed
    chunk['authors'] = chunk['authors'].map(word_tokenize)

    # Article length, distinct words, and number of distinct words
    chunk['length'] = chunk['content'].apply(len)
    chunk['distinct_words'] = chunk['content'].apply(set)
    chunk['length_distinct_words'] = chunk['distinct_words'].apply(len)

    # Dropping rows with length 0 (after cleaning)
    chunk = chunk[chunk['length'] != 0].copy()

    # Assigning group based on article type
    chunk['group'] = chunk['type'].isin(FAKE_TYPES).astype('int64')
    return chunk


# Chunk files are numbered from 1 in the order the chunks are read.
i = 1

for chunk in df:
    chunk_path = f"{output_dir}/chunk_{i}.parquet"
    print(f"Processing chunk {i}...", end = '\r', flush=True)

    # Skip chunks whose parquet file already exists, so an interrupted run
    # can be resumed without redoing finished work.
    if _chunk_already_written(chunk_path):
        print(f"Chunk {i} already processed!", flush=True)
    else:
        _preprocess_chunk(chunk).to_parquet(chunk_path)

    i += 1


Processing chunk 20...

In [None]:
# Load the BBC articles and add the columns used by the main dataset:
# 'article_text' is exposed as 'content', and every row gets the fixed
# domain 'bbc.com', empty meta keywords, type 'reliable' and group 0.
bbc_df = (
    pd.read_csv('bbc_news.csv', usecols=['article_text', 'title', 'authors'])
    .assign(
        content=lambda frame: frame['article_text'],
        domain='bbc.com',
        meta_keywords='',
        type='reliable',
        group=0,
    )
    .drop(columns=['article_text'])
)

# Clean the free-text columns.
for column in ('content', 'title', 'authors'):
    bbc_df[column] = bbc_df[column].transform(clean_data)

# Tokenize and stem; authors are only tokenized, not stemmed.
for column in ('content', 'title', 'meta_keywords'):
    bbc_df[column] = bbc_df[column].transform(process_data)
bbc_df['authors'] = bbc_df['authors'].transform(word_tokenize)

np.int64(0)