# Processing steps

In [1]:
import pandas as pd
import string
import os
import sys
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

sys.path.append("./datacleaning")
from load import load_years
from data_cleaning import remove_stopwords, remove_punctuation
from lemmatization import lemmatizer

sys.path.pop()
sys.path.append("./topicmodelling")
from ipynb.fs.defs.topic_modelling import print_top_words, recommendation, load_all_files

pd.options.display.max_colwidth = 200

## Load.py

Load.py takes [all-the-news-2-1.csv created by Andrew Thompson (2022)](https://components.one/datasets/all-the-news-2-news-articles-dataset/), and splits it into 5 seperate CSV files based on the publishing year of each article. 

The all-the-news-2-1.csv as the main file has a size of 8.8 GB resulting in long loading times. To address this, the file will be split into individual sub-files based on the year of publication. Each sub-file will contain roughly 1/5th of the data, making the processing faster and allowing for better compartmentalization.

The process can be summarized as follows:

* Read the main CSV file using the pandas library.
* Split the data into separate DataFrames based on the publishing year, ranging from 2016 to 2020.
* Write the seperate Dataframes to their respective csv files.

In [2]:
main_file_path = "./csv/all-the-news-2-1.csv"
years = load_years("years.txt")

def sub_file_path(year: int, version: int=0):
    """Return path to sub file.
     
     Args:
     	 year (int): year of file name.
     	 version (str): version to use for sub file. Defaults to first version.
     
     Returns: 
     	 path to sub file for given year and version.
    """
    suffix: str = ""
    if version != 0:
        suffix = "_0" + str(version)
    return f"./csv/{year}{suffix}.csv"


def convert_to_gb(value: int):
    return round(value / (1024 ** 3), 2)


def file_size(year: int, version: int=0) -> int:
    return os.stat(sub_file_path(year, version)).st_size


def load_df_version(version: int=0):
    df = {}
    for year in years:
        df.update({year: pd.read_csv(sub_file_path(year, version), nrows=10)})

    return df

In [3]:
main_files_size = convert_to_gb(os.stat(main_file_path).st_size)

print(f"{main_file_path} size: {main_files_size} GB")

cum_size = 0
for year in years:
    file = file_size(year)
    cum_size += file
    print(f"{sub_file_path(year)} {convert_to_gb(file)} GB")

print(f"Cumulative size: {convert_to_gb(cum_size)} GB")

./csv/all-the-news-2-1.csv size: 8.16 GB
./csv/2016.csv 1.74 GB
./csv/2017.csv 1.86 GB
./csv/2018.csv 1.82 GB
./csv/2019.csv 2.12 GB
./csv/2020.csv 0.63 GB
Cumulative size: 8.16 GB


In [4]:
df_main = pd.read_csv(main_file_path, nrows=10)
df = load_df_version()

display(df_main.head(1))
for year in years:
    display(df[year].head(1))

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of liberal democracy seriously,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to developing new ideas and new voices. Imagine you are...",https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs,,Vox


Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
0,2016,2016-12-09 18:31:00,12.0,9,Lee Drutman,We should take concerns about the health of liberal democracy seriously,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to developing new ideas and new voices. Imagine you are...",https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs,,Vox


Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
0,2017,2017-03-02 00:00:00,3.0,2,Tim Hume,China is dismissing unfavorable media reports as fake because that's what Trump does,China is dismissing unfavorable media reports as fake news because that’s what Trump does China is dismissing unfavorable media reports as fake news because that’s what Trump does In his short p...,https://news.vice.com/en_us/article/xwvj7j/china-is-dismissing-unfavorable-media-reports-as-fake-because-thats-what-trump-does,,Vice News


Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
0,2018,2018-01-26 00:00:00,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last June, calling it “fake news”. The New York Times...",https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A,Davos,Reuters


Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
0,2019,2019-06-27 00:00:00,6.0,27,,France's Sarkozy reveals his 'Passions' but insists no come-back on cards,"PARIS (Reuters) - Former French president Nicolas Sarkozy published a new memoir on Thursday but was quick to dismiss speculation he might return to politics to rescue his old center-right party, ...",https://www.reuters.com/article/france-politics-sarkozy/frances-sarkozy-reveals-his-passions-but-insists-no-come-back-on-cards-idUSL8N23Y5AS,World News,Reuters


Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
0,2020,2020-01-06 00:00:00,1.0,6,,The Bronx Zoo’s Holiday Lights Festival,"The lowland gorillas at the Bronx Zoo tend to congregate indoors during the winter rather than roam around outside in their forest habitat. But, after sunset, their sculptural counterparts illum...",https://www.newyorker.com/magazine/2020/01/06/the-bronx-zoos-holiday-lights-festival,magazine,New Yorker


## Data_cleaning.py

To prepare the data for the lemmatization process, multiple steps are performed.

* Articles are cast to lowercase.
    - This happens because text containing both upper and lower cases are recognized as different words.
* Commas are removed from article titles and author names.
    - Superfluous commas cause problems reading the articles. By removing them no information is lost, but processing the csv files (and each subsequent step) becauses significantly easier.
* All punctuation is removed from articles.
    - For the same reason as removing commas and casting text to lowercase.
* All stop words are removed from articles.
    - To reduce the processing power requirements for words with no inherent meaning.

### Removed punctuation and stop words:

In [5]:
display(string.punctuation + '"“‘—’”"', stopwords.words("english")[:10])

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"“‘—’”"'

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### The difference between unprocessed and processed data:

In [6]:
df_cleaned = load_df_version(1)

for year in years:
    display(df[year]['article'].head(1), df_cleaned[year]['article'].head(1))

0    This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to developing new ideas and new voices. Imagine you are...
Name: article, dtype: object

0    post part polyarchy independent blog produced political reform program new america washington think tank devoted developing new ideas new voices imagine otherwise healthy 30something starts feelin...
Name: article, dtype: object

0    China is dismissing unfavorable media reports as fake news because that’s what Trump does  China is dismissing unfavorable media reports as fake news because that’s what Trump does  In his short p...
Name: article, dtype: object

0    china dismissing unfavorable media reports fake news thats trump china dismissing unfavorable media reports fake news thats trump short political career donald trump made habit dismissing unfavora...
Name: article, dtype: object

0    DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last June, calling it “fake news”.  The New York Times...
Name: article, dtype: object

0    davos switzerland reuters us president donald trump denied report friday ordered special counsel robert mueller fired last june calling fake news new york times reported thursday trump backed orde...
Name: article, dtype: object

0    PARIS (Reuters) - Former French president Nicolas Sarkozy published a new memoir on Thursday but was quick to dismiss speculation he might return to politics to rescue his old center-right party, ...
Name: article, dtype: object

0    paris reuters former french president nicolas sarkozy published new memoir thursday quick dismiss speculation might return politics rescue old centerright party shed support since president emmanu...
Name: article, dtype: object

0    The lowland gorillas at the  Bronx Zoo  tend to congregate indoors during the winter rather than roam around outside in their forest habitat. But, after sunset, their sculptural counterparts illum...
Name: article, dtype: object

0    lowland gorillas bronx zoo tend congregate indoors winter rather roam around outside forest habitat sunset sculptural counterparts illuminate zoos grounds fanciful renditions lions giraffes zebras...
Name: article, dtype: object

This notably reduces the size of the file.

In [7]:
def size_comparison(version: int) -> None:
    old_cum, new_cum = 0, 0
    for year in years:
        old_file_size = file_size(year, version - 1)
        new_file_size = file_size(year, version)
        old_cum += old_file_size
        new_cum += new_file_size
        print(f"{sub_file_path(year, version - 1)} was {convert_to_gb(file_size(year))} GB, {sub_file_path(year, version)} is {convert_to_gb(file_size(year, 1))} GB. The size has been decreased to {round((new_file_size/old_file_size * 100), 2)}% of the original file.")

    print("")
    print(f"The cumulative size of the old versions was {convert_to_gb(old_cum)} GB, the new versions has a cumulative size of {convert_to_gb(new_cum)} GB.")

In [8]:
size_comparison(1)

./csv/2016.csv was 1.74 GB, ./csv/2016_01.csv is 1.24 GB. The size has been decreased to 71.57% of the original file.
./csv/2017.csv was 1.86 GB, ./csv/2017_01.csv is 1.33 GB. The size has been decreased to 71.87% of the original file.
./csv/2018.csv was 1.82 GB, ./csv/2018_01.csv is 1.3 GB. The size has been decreased to 71.69% of the original file.
./csv/2019.csv was 2.12 GB, ./csv/2019_01.csv is 1.53 GB. The size has been decreased to 72.22% of the original file.
./csv/2020.csv was 0.63 GB, ./csv/2020_01.csv is 0.45 GB. The size has been decreased to 71.6% of the original file.

The cumulative size of the old versions was 8.16 GB, the new versions has a cumulative size of 5.86 GB.


## Random_sample.py

Due to hardware limitations, executing certain processing-intensive functions cost high amounts of time. The lemmatization function would require 55 hours to completely process all 5 csv files, even after reducing their contents through data_cleaning.py. For this project a random sample of 10% of all articles is therefore taken from from each file and written to a new file.

In [9]:
df_sampled = load_df_version(2)

size_comparison(2)

./csv/2016_01.csv was 1.74 GB, ./csv/2016_02.csv is 1.24 GB. The size has been decreased to 10.0% of the original file.
./csv/2017_01.csv was 1.86 GB, ./csv/2017_02.csv is 1.33 GB. The size has been decreased to 10.06% of the original file.
./csv/2018_01.csv was 1.82 GB, ./csv/2018_02.csv is 1.3 GB. The size has been decreased to 9.99% of the original file.
./csv/2019_01.csv was 2.12 GB, ./csv/2019_02.csv is 1.53 GB. The size has been decreased to 10.01% of the original file.
./csv/2020_01.csv was 0.63 GB, ./csv/2020_02.csv is 0.45 GB. The size has been decreased to 10.21% of the original file.

The cumulative size of the old versions was 5.86 GB, the new versions has a cumulative size of 0.59 GB.


## Lemmatization.py

Lemmatization is a process in which words are reduced to their base dictionary form of a word [Stanford, 2008](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html). This reduces the dimensionality of text, which allows for an easier classification through vectorisation [MathWorks, 2023](https://nl.mathworks.com/discovery/lemmatization.html). It is a very time and power intensive method but results in a trimmed text.



### An example of the lemmatizer.
By a text generated through ChatGPT.

In [10]:
example_text = "The quick brown foxes jumped over the lazy dogs. They were all running and playing in the open field. The foxes' tails were bushy, and the dogs wagged their tails happily. Suddenly, a loud noise startled them, and they quickly scattered in different directions."
example_text_2 = remove_punctuation(example_text)
example_text_3 = remove_stopwords(example_text_2)
lemmatized_text = lemmatizer(example_text_3)

print(f"{example_text}\n{example_text_2}\n{example_text_3}\n{lemmatized_text}")

The quick brown foxes jumped over the lazy dogs. They were all running and playing in the open field. The foxes' tails were bushy, and the dogs wagged their tails happily. Suddenly, a loud noise startled them, and they quickly scattered in different directions.
The quick brown foxes jumped over the lazy dogs They were all running and playing in the open field The foxes tails were bushy and the dogs wagged their tails happily Suddenly a loud noise startled them and they quickly scattered in different directions
The quick brown foxes jumped lazy dogs They running playing open field The foxes tails bushy dogs wagged tails happily Suddenly loud noise startled quickly scattered different directions
the quick brown fox jump lazy dog they run play open field the fox tail bushy dog wag tail happily suddenly loud noise startle quickly scatter different direction


The same is done on the articles stored in the various csv files. Due to the multi-core processing without storing original index values, the order of original has been lost after lemmatization. The articles however have been succesfully lemmatized, further reducing their size.

In [11]:
df_lemmatized = load_df_version(3)
for year in years:
    display(df_lemmatized[year]['article'].head(1))

size_comparison(3)

0    recent rumor potential apple car may massive waste anticipatory energy well direct toward levitate hologrambeame iphone 10 new report accurate despite month month hint movement prompt many tech in...
Name: article, dtype: object

0    singapore may 15 reuter golden agriresource ltds quarterly net profit fall half year ago hit foreign exchangerelate loss largely wipe high revenue strong palm oil price singaporeliste firm monday ...
Name: article, dtype: object

0    washington reuter major washingtonarea airport virginia briefly halt traffic friday high wind prompt evacuation air traffic control tower 30 minute federal aviation administration say washington d...
Name: article, dtype: object

0    peoples bank china approve paypal acquisition 70 equity state gopay guofubao information technology co gopay ltd make paypal first foreign payment platform provide online payment service china gop...
Name: article, dtype: object

0    march 17 reuter icc holdings inc insurance carrier temporarily suspend billing restaurant tavern amid grow covid19 closing icc holdings inc temporarily suspend insurance premium bill 30 day begin ...
Name: article, dtype: object

./csv/2016_02.csv was 1.74 GB, ./csv/2016_03.csv is 1.24 GB. The size has been decreased to 88.56% of the original file.
./csv/2017_02.csv was 1.86 GB, ./csv/2017_03.csv is 1.33 GB. The size has been decreased to 85.81% of the original file.
./csv/2018_02.csv was 1.82 GB, ./csv/2018_03.csv is 1.3 GB. The size has been decreased to 89.99% of the original file.
./csv/2019_02.csv was 2.12 GB, ./csv/2019_03.csv is 1.53 GB. The size has been decreased to 85.87% of the original file.
./csv/2020_02.csv was 0.63 GB, ./csv/2020_03.csv is 0.45 GB. The size has been decreased to 83.41% of the original file.

The cumulative size of the old versions was 0.59 GB, the new versions has a cumulative size of 0.51 GB.


## Vectorizer.py

After lemmatization, the data is vectorized. This was performed through the sklearn Countvectorizer function. This vectorizer works through forming a sparse matrix which is a count of word occurences in a text. This matrix of vectorized text can be processed through a Latent Dirichlet Allocation. By assessing the presence of words topics can be discovered from collections of texts. Let's take a look at an example.

### An Example of how spare matrix vectorization works.

Lemmatized sentences are generated by ChatGPT

In [12]:
lemmatized_sentences = [
    "brown cat with fluffy tail jump over sleep dog in backyard",
    "I enjoy spend my weekend explore new hike trail capture beautiful photograph of nature",
    "sun set in horizon golden hue paint sky create breathtake view",
    "he passionate play piano grace and elegance captivate audience every note",
    "she dream travel world experience different culture taste exotic cuisine",
    "they decide embark on epic journey across vast ocean seek hide treasure ancient relic",
    "we gather around campfire share story laughter under star night sky",
    "it bloom vibrant color fil garden sweet scent attract butterfly bee",
    "he skillful perform intricate dance routine impress judge win heart audience",
    "she passionate pursue dream overcome obstacle achieve great success in career"
]


vectorizer = CountVectorizer()
v = vectorizer.fit(lemmatized_sentences)
vector = vectorizer.transform(lemmatized_sentences)

display(list(v.vocabulary_.items())[:10], vector.toarray())

[('brown', 12),
 ('cat', 18),
 ('with', 104),
 ('fluffy', 37),
 ('tail', 91),
 ('jump', 55),
 ('over', 66),
 ('sleep', 84),
 ('dog', 26),
 ('in', 50)]

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

### How an LDA applied to the vectorization works.

In [13]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(vector)

print_top_words(lda, vectorizer, n_top_words=5)

Topic 0: embark treasure seek journey epic
Topic 1: sky in star under laughter
Topic 2: in she dream passionate great



### Preview of vectorization and LDA on the dataset

In [14]:
lda, data_vectorized, vectorizer, df, doc_topic_dist = load_all_files("topicmodelling/csv", "csv", years)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.


In [15]:
for year in years:
    print(year)
    print_top_words(lda[year], vectorizer[year], n_top_words=5)

2016
Topic 0: use like make new not
Topic 1: say police court case officer
Topic 2: song music album band record

2017
Topic 0: like not make get one
Topic 1: percent price stock say high
Topic 2: 2017 music song album band

2018
Topic 0: woman man people one not
Topic 1: not do get go like
Topic 2: 2018 get star year not

2019
Topic 0: year growth economy say rate
Topic 1: health say drug care patient
Topic 2: woman film show black story

2020
Topic 0: de la que en el
Topic 1: iran we say iranian united
Topic 2: datum company say business use



In [20]:
recommendation(df, doc_topic_dist, "U.S. manufacturing output rises unexpectedly", 2020, k=10, plot_dna=True)

Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
3,2020,2020-01-17 00:00:00,1.0,17,,U.S. manufacturing output rises unexpectedly,washington reuter we manufacture output rise unexpectedly december drop motor vehicle output outpace increase production durable good food beverage product federal reserve say friday manufacturing...,https://www.reuters.com/article/us-usa-economy-output/us-manufacturing-output-rises-unexpectedly-idUSKBN1ZG1OL,Business News,Reuters


NameError: name 'doc_topic_dist' is not defined