# Cleaning the Articles
- *Author*: Juan Cabanela
- *Start Date*: December 2, 2021

## Requirements

Requires the following python libraries:
- pandas
- numpy

This script will process the articles through the same cleaning and TF-IDF vectorization as the original data, saving the results for use by others.

## History
**December 2, 2021**: Initial version of the cleaning code.  It deals with a couple of (minor) issues in how `articles.csv` was structured, notably that some of the strings were stored as bytestrings and were misinterpreted when read in from the CSV file.

In [9]:
import pandas as pd
import numpy as np
import pickle
import pathlib
import re
import ast

##
## Define functions
##


def content_cleaner(row):
    """Return the row's ``content`` attribute after running it through ``string_cleaner``."""
    return string_cleaner(row.content)


def title_cleaner(row):
    """Return the row's ``title`` attribute after running it through ``string_cleaner``."""
    # str() is necessary because some titles ended up as floats when read in
    title_text = str(row.title)
    return string_cleaner(title_text)


def string_cleaner(stuff):
    """Collapse whitespace in *stuff* and drop its final (likely partial) word.

    Every run of whitespace (spaces, tabs, line feeds, ...) is replaced by a
    single space.  The text after the last remaining space is then removed,
    because the input is typically truncated to a fixed length and the last
    word is likely to be a partial word.

    If the text contains no space at all there is no word boundary to cut at,
    so the text is returned as-is (stripped) rather than losing its final
    character.

    Parameters:
        stuff: the string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # r"\s+" also matches line feeds, so a single substitution handles both
    # line-feed removal and space-run collapsing.
    stuff = re.sub(r"\s+", " ", stuff)

    # Remove last word since it is likely to be a partial word anyway.
    # rfind() returns -1 when there is no space; slicing with -1 would chop
    # off the last character instead of a word, so only slice on a real match.
    last_space_idx = stuff.rfind(" ")
    if last_space_idx != -1:
        stuff = stuff[:last_space_idx]
    return stuff.strip()


def content_second_scrub(row):
    """Return the row's ``content`` attribute after running it through ``second_scrub``."""
    return second_scrub(row.content)


def title_second_scrub(row):
    """Return the row's ``title`` attribute after running it through ``second_scrub``."""
    # str() is necessary because some titles ended up as floats when read in
    title_text = str(row.title)
    return second_scrub(title_text)


def second_scrub(stuff):
    """Lowercase *stuff*, delete punctuation characters, and drop stop words.

    Uses the module-level ``punctuation`` string (characters to delete) and
    ``ENGLISH_STOP_WORDS`` collection (words to drop).

    Parameters:
        stuff: the string to scrub.

    Returns:
        The surviving words joined by single spaces, with leading/trailing
        whitespace stripped.  The text is split on single spaces, so a run of
        several spaces yields empty tokens that are kept, exactly as before.
    """
    # Lowercase, then delete every punctuation character in a single pass
    stuff = stuff.lower().translate(str.maketrans("", "", punctuation))

    # Remove stop words; join once instead of growing a string inside a loop
    kept_words = [
        word for word in stuff.strip().split(" ")
        if word not in ENGLISH_STOP_WORDS
    ]
    return " ".join(kept_words).strip()


def bytestring_cleaner(row):
    """Decode a row's ``content`` if it was stored as a bytestring literal.

    The content is passed through ``parse_bytes`` and any Windows line
    endings (CRLF) in the result are converted to Unix line feeds (LF).
    """
    decoded = parse_bytes(row.content)

    # Normalize "\r\n" to "\n"
    return decoded.replace("\r\n", "\n")


def parse_bytes(field):
    """Decode a string written in Python byte-string literal (b'...') syntax.

    The field is evaluated with ``ast.literal_eval``.  If the result is a
    ``bytes`` object it is decoded (default UTF-8) and returned as ``str``.
    In every other case -- the field is not a valid literal, or it is a
    literal of some other type -- the original field is returned unchanged.

    Adapted from
    https://stackoverflow.com/questions/47741235/how-to-read-bytes-object-from-csv

    Parameters:
        field: the value to inspect (normally a ``str`` read from the CSV).

    Returns:
        The decoded ``str`` for byte-string literals, otherwise *field*.

    Raises:
        UnicodeDecodeError: if the bytes literal is not valid UTF-8.
    """
    try:
        result = ast.literal_eval(field)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a parsable Python literal: ordinary text, or a non-string value.
        # Only literal_eval's documented failures are caught, so that
        # KeyboardInterrupt/SystemExit are no longer swallowed the way a
        # ``return`` inside ``finally`` would.
        return field
    return result.decode() if isinstance(result, bytes) else field

##
## Define constants
##
# Debug flag (not referenced anywhere else in the visible code)
DEBUG = False

# List of English stop words, grabbed from
# https://gist.github.com/ethen8181/d57e762f81aa643744c2ffba5688d33a
# (the list used in scikit-learn and nltk).  second_scrub() drops any word
# found in this list.
ENGLISH_STOP_WORDS=['a','about','above','across','after','afterwards','again','against',
	'ain','all','almost','alone','along','already','also','although','always','am',
	'among','amongst','amoungst','amount','an','and','another','any','anyhow',
	'anyone','anything','anyway','anywhere','are','aren','around','as','at','back',
	'be','became','because','become','becomes','becoming','been','before','beforehand',
	'behind','being','below','beside','besides','between','beyond','bill','both',
	'bottom','but','by','call','can','cannot','cant','co','con','could','couldn',
	'couldnt','cry','d','de','describe','detail','did','didn','do','does','doesn',
	'doing','don','done','down','due','during','each','eg','eight','either','eleven',
	'else','elsewhere','empty','enough','etc','even','ever','every','everyone',
	'everything','everywhere','except','few','fifteen','fify','fill','find','fire',
	'first','five','for','former','formerly','forty','found','four','from','front',
	'full','further','get','give','go','had','hadn','has','hasn','hasnt','have',
	'haven','having','he','hence','her','here','hereafter','hereby','herein','hereupon',
	'hers','herself','him','himself','his','how','however','hundred','i','ie','if','in',
	'inc','indeed','interest','into','is','isn','it','its','itself','just','keep','last',
	'latter','latterly','least','less','ll','ltd','m','ma','made','many','may','me',
	'meanwhile','might','mightn','mill','mine','more','moreover','most','mostly','move',
	'much','must','mustn','my','myself','name','namely','needn','neither','never',
	'nevertheless','next','nine','no','nobody','none','noone','nor','not','nothing',
	'now','nowhere','o','of','off','often','on','once','one','only','onto','or','other',
	'others','otherwise','our','ours','ourselves','out','over','own','part','per',
	'perhaps','please','put','rather','re','s','same','see','seem','seemed','seeming',
	'seems','serious','several','shan','she','should','shouldn','show','side','since',
	'sincere','six','sixty','so','some','somehow','someone','something','sometime',
	'sometimes','somewhere','still','such','system','t','take','ten','than','that',
	'the','their','theirs','them','themselves','then','thence','there','thereafter',
	'thereby','therefore','therein','thereupon','these','they','thick','thin','third',
	'this','those','though','three','through','throughout','thru','thus','to',
	'together','too','top','toward','towards','twelve','twenty','two','un','under',
	'until','up','upon','us','ve','very','via','was','wasn','we','well','were',
	'weren','what','whatever','when','whence','whenever','where','whereafter',
	'whereas','whereby','wherein','whereupon','wherever','whether','which','while',
	'whither','who','whoever','whole','whom','whose','why','will','with','within',
	'without','won','would','wouldn','y','yet','you','your','yours','yourself',
	'yourselves']

# Punctuation characters purged by second_scrub(): the ASCII punctuation set
# plus curly single/double quotes and the en dash
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’–“”'

# Directory containing chunked data
data_dir = "./"
# Input CSV holding the raw articles
articles_csv = f"{data_dir}articles.csv"
# Directory used for the pickled vectorizer and the pickled tokenized output
var_dir = "./Variables/"
# Columns written to the cleaned CSV files
cols_to_dump = ['domain', 'story_label', 'title', 'content']

In [10]:
# Load the target articles from the CSV file
articles_df = pd.read_csv(articles_csv)

# Rename the raw CSV headers to the column names used by the rest of this notebook
col_dict = {"Source Name": "domain", "Related Story": "story_label", "Title":"title", "Text":"content", "URL":"url"}
articles_df.rename(columns=col_dict, inplace=True)

# Make source names into proper domain names.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update the DataFrame under pandas Copy-on-Write.
domain_map = {'CNN':'cnn.com', 'Fox News':'foxnews.com', 'NBC':'nbcnews.com', 'New York Post':'nypost.com', 'The Wall Street Journal':'wsj.com'}
articles_df['domain'] = articles_df['domain'].replace(to_replace=domain_map)

# Convert mal-stored bytestrings to proper strings
articles_df["content"] = articles_df.apply(bytestring_cleaner, axis=1)

# Reduce content to first 800 characters
articles_df["content"] = articles_df["content"].str[:800]
# Clean all the article content (collapse whitespace, drop the trailing partial word)
articles_df['content'] = articles_df.apply(content_cleaner, axis=1)
# Clean all the title content the same way
articles_df['title'] = articles_df.apply(title_cleaner, axis=1)

# Dump articles with context before stripping the stop words (and thus wiping context)
fname_fullcontext_cleaned = f"{data_dir}articles_fullcontext_cleaned.csv"
print(f"Creating {fname_fullcontext_cleaned} ... ")
# Columns to write out (restated here so this cell states its own output schema)
cols_to_dump = ['domain', 'story_label', 'title', 'content']
articles_df[cols_to_dump].to_csv(fname_fullcontext_cleaned, index=False)


Creating ./articles_fullcontext_cleaned.csv ... 


In [11]:

# Work on an independent (deep) copy so articles_df keeps its full-context text
cleaned_df = articles_df.copy(deep=True)

# For both text columns, in order: lowercase, remove punctuation, and strip
# stop words using the matching row-wise scrub function
for column, scrubber in (("content", content_second_scrub), ("title", title_second_scrub)):
    cleaned_df[column] = cleaned_df.apply(scrubber, axis=1)

# Write the scrubbed articles out as CSV
fname_cleaned = f"{data_dir}articles_cleaned.csv"
print(f"Creating {fname_cleaned} ... ", end='')
cleaned_df[cols_to_dump].to_csv(fname_cleaned, index=False)



Creating ./articles_cleaned.csv ... 

## Apply TF-IDF Vectorizer that was used with FakeNewsCorpus

We will read in and apply the same TF-IDF vectorizer that was developed with the FakeNewsCorpus (in the `ProjectDataTFIDFTokenization.ipynb` notebook).  It is applied to the complete full-context article data.

In [12]:
# Number of words kept by TF-IDF tokenizer
words2keep = 100000

# Name of articles tokenized content file
tokenized_pickle_file = f"{var_dir}articles_fullcontext_tokenized_{words2keep:06d}.p"

# Set up path to pickle file of the vectorizer
vectorizer_pickle_file = f"{var_dir}fullcontext_vectorizer_{words2keep:06d}.p"
vectorizer_pickle_path = pathlib.Path(vectorizer_pickle_file)

# If the pickled vectorizer exists, load it and tokenize the articles with it
if vectorizer_pickle_path.is_file():
    print(f"Loading previously pickled vectorizer (about {vectorizer_pickle_path.stat().st_size/1024**3:0.2f} GB)")
    # NOTE: unpickling can execute arbitrary code -- only load a vectorizer
    # pickle that you created yourself or otherwise trust.
    # The context manager guarantees the file handle is closed after loading.
    with open(vectorizer_pickle_file, "rb") as vectorizer_fh:
        vectorizer = pickle.load(vectorizer_fh)

    # Convert content to strings then apply the transform from the TF-IDF vectorizer
    corpus = articles_df['content'].apply(lambda x: np.str_(x))
    tokenized_content = vectorizer.transform(corpus)

    print(f"Creating {tokenized_pickle_file} ... ")
    # The context manager flushes and closes the output file once the dump finishes
    with open(tokenized_pickle_file, "wb") as tokenized_fh:
        pickle.dump(tokenized_content, tokenized_fh)
else:
    print(f"PROCEED NO FURTHER until you have created vectorizer and pickled it as {vectorizer_pickle_file}.")
    print("- You can create the appropriate TF-IDF vectorizer using ProjectDataTFIDFtokenization.ipynb.")

Loading previously pickled vectorizer (about 0.02 GB)
Creating ./Variables/articles_fullcontext_tokenized_100000.p ... 
