# Purpose of notebook: to make txt files with sources and text body into csv files and merge them 

In [15]:
# import/load packages

## install
%pip install pandas
%pip install chardet
%pip install cchardet


## import
import os
import pandas as pd
import chardet
import cchardet
import re
from bs4 import BeautifulSoup

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## From TXT to CSV (+ global preprocessing) - sources.txt + text.txt


### sources.txt

In [16]:
# define file path
file_path = "/work/Bachelor/sample_data/original_sample/sources.txt"

# define column names
column_names = ["textID", "words", "date", "country", "source", "url", "title"]

# read the file into DataFrame and skip the first two lines
df_sources_original_sample = pd.read_csv(
    file_path,
    delimiter='\t',
    encoding='ISO-8859-1',
    skiprows=2,  # skip the first two lines (header and separator line)
    names=column_names  # assign new column names while reading the file
)

df_sources_original_sample

Unnamed: 0,textID,words,date,country,source,url,title
0,11241,397,13-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H..."
1,11242,757,13-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...
2,11243,755,13-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant
3,11244,1677,13-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...
4,21242,794,13-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...
...,...,...,...,...,...,...,...
2955,15641240,332,16-11-29,US,Business Wire (press release),http://www.businesswire.com/news/home/20161129...,Jungle Disk Appoints Matt Bradley to Board of ...
2956,15641241,226,16-11-29,US,Fox News,http://www.foxnews.com/politics/2016/11/29/dep...,Department of Homeland Security overwhelmed by...
2957,15641242,230,16-11-29,US,Firstcoastnews.com,http://www.firstcoastnews.com/news/fire-surrou...,"Flames surround Ripley's Aquarium, thousands o..."
2958,15641243,193,16-11-29,US,Entertainment Weekly,http://www.ew.com/article/2016/11/29/alexis-bl...,Alexis Bledel is working on a Sisterhood of th...


### text.txt

In [17]:
# define file path
file_path = "/work/Bachelor/sample_data/original_sample/text.txt"

# open the file and read its contents into a string
with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()  # Read all text data from the file

sample = raw_text
sample = re.sub(" ([.,?!':])", r"\1", sample) # punctuation characters are all needlessly preceded by a space, remove that space
sample = re.sub("@ @ @ @ @ @ @ @ @ @", "CENSORED", sample) # replace the keyword chosen to mark words or entities that have been censored

# Step 1: Split the text into articles based on '@@' markers
article_ids = re.findall(r"@@(\d+)", sample) # extract list of all article IDs by matching pattern of any sequence of digits following exactly two @
articles = re.split(r'"?@@\d+ ', sample)[1:] # split articles on article IDs, i.e. a sequence of digits preceded by exactly two @s and sometimes a quotation mark preceding that
articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles] # for each article, skip to the first paragraph tag to exclude heading/title text, then replace subsequent markers for new paragraphs with line breaks. OBS maybe 'words' column contains incorrect n words because of this. 

print(len(article_ids), len(articles)) # check that we found equally many IDs and article bodies

df_articles_original_sample = pd.DataFrame(data = dict(textID = article_ids, body = articles)) # create dataframe with extracted IDs and corresponding article contents
df_articles_original_sample["textID"] = df_articles_original_sample["textID"].astype(int) # cast textIDs to int to enable merging with `sources` dataframe
df_articles_original_sample.sample(10) # sample ten random rows from the dataframe to examine whether things look as expected


2914 2914


Unnamed: 0,textID,body
2360,10351242,"THE police authorities have arrested four "" dr..."
1553,4601243,This is the first time we've seen this prototy...
1138,3321243,"Agro-tourism, agriculture education in schools..."
1948,8301241,Rawalpindi -- Punjab Governor Malik Muhammad R...
1133,3311242,Novak Djokovic lost a point in bizarre fashion...
1862,7851240,"Words by Mad Dog BradleyMarch 23, 2016\nProduc..."
604,1661244,"Lumosity.com, the leading online provider of g..."
2408,13671243,Mamadou Sakho and Divock Origi of Liverpool du...
511,1411241,"If successful, England are likely to have to p..."
2785,14891240,The Movement for the Actualization of the Sove...


In [18]:
joined_original_sample = df_sources_original_sample.merge(df_articles_original_sample, on = "textID", how = "right") # join sources dataframe with their corresponding article content using the textIDs, keeping only the articles where source information was available
joined_original_sample

Unnamed: 0,textID,words,date,country,source,url,title,body
0,11241,397.0,13-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-the-warrio...,"Author of The Warriors, Cult Film Adapted to H...","Sol Yurick, the writer whose 1965 novel "" The ..."
1,11242,757.0,13-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-what-they-...,That's What They Say: Dialect Society chooses ...,"For this week's edition of "" That's What They ..."
2,11243,755.0,13-01-06,US,New York Daily News,http://www.nydailynews.com/life-style/eats/bes...,Best of New York: Croissant,French Tart chef Laurent Chavenet claims to wo...
3,11244,1677.0,13-01-06,US,OregonLive.com,http://www.oregonlive.com/performance/index.ss...,Reflecting on a quarter-century of growth in P...,The highlight of the Portland Center for Perfo...
4,21242,794.0,13-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/01/ask-ars...,Ask Ars: Does Facebook auto-delete content aft...,"In 1998, Ask Ars was an early feature of the n..."
...,...,...,...,...,...,...,...,...
2909,15281244,405.0,16-11-08,SG,Catch News,http://www.catchnews.com/india-news/did-modi-s...,Did Modi silently visit Kashmir? Govts PR arm ...,Did you know that Prime Minister Narendra Modi...
2910,15291240,393.0,16-11-08,CA,Edmonton Journal,http://edmontonjournal.com/news/local-news/cou...,Council votes for $30 per year Edmonton back a...,Council voted Tuesday to support a new $30 per...
2911,15291242,1000.0,16-11-08,CA,Timmins Press,http://www.timminspress.com/2016/11/08/5m-fund...,$5M funding boost for six projects in Timmins,Minister of the Status of Women Patty Hajdu ( ...
2912,15291243,657.0,16-11-08,CA,Windsor Star,http://windsorstar.com/news/local-news/pelissi...,Pelissier Street garage main floor to become p...,A closed-door council meeting that reversed th...


### It works!!
### OBS there are fewer rows in text (2,914) than in sources (2,960), which can be caused by a) text isn't split correctly, b) sources are simply missing from the dataset hence fewer rows. Consider investigating this.