## Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import os

#### Notes for os module usage:
```
>>> os.getcwd()
'/home/stephen/'
>>> os.chdir('/home/stephen/.../')
>>> os.listdir()
['.bashrc', ...]
```

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'/home/stephen/Dropbox/General/Projects/Thesis/code/clean-data'

In [3]:
# Root directory holding one sub-folder of article files per outlet.
# Defaults to the original author's local path; set the THESIS_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the
# `DATASET_PATH + FOX` style concatenations in later cells rely on.
DATASET_PATH = os.path.join(
    os.environ.get(
        "THESIS_DATA_DIR",
        "/home/stephen/Dropbox/General/Projects/Thesis/data/",
    ),
    "",
)

# Per-outlet sub-folder names, each already ending in a separator.
FOX = "fox-politics/"
VOX = "vox-politics/"
PBS = "pbs-politics/"

## Read Fox News

In [4]:
# Make the Fox folder the working directory: the os.listdir() and open()
# calls in the following cells use bare names resolved against it.
os.chdir(DATASET_PATH + FOX)

In [5]:
# List every entry of the current working directory, report how many there
# are, and preview the first ten names.
# NOTE: the printed count covers all directory entries, whether or not the
# loading loop later accepts them as articles.
files_to_read = os.listdir()
print(f"There are {len(files_to_read)} articles")
files_to_read[:10]

There are 1024 articles


['fox_politics_166.txt',
 'fox_politics_390.txt',
 'fox_politics_423.txt',
 'fox_politics_102.txt',
 'fox_politics_492.txt',
 'fox_politics_554.txt',
 'fox_politics_490.txt',
 'fox_politics_590.txt',
 'fox_politics_1.txt',
 'fox_politics_971.txt']

In [6]:
# Read every Fox article named in `files_to_read` (bare file names, resolved
# against the current working directory) into three parallel lists:
#   articles[i] - full text, sources[i] - outlet label, ids[i] - file stem.
articles = []
sources = []
ids = []
for f in tqdm(files_to_read):
    # Test the real extension: a substring check would also accept names such
    # as 'notes.txt.bak', for which stripping the last four characters does
    # not yield the file stem.
    if f.endswith('.txt'):
        article_id = f[:-len('.txt')]
        # Decode explicitly so the result does not depend on the platform or
        # locale default. NOTE(review): assumes the files are UTF-8 - confirm.
        with open(f, 'r', encoding='utf-8') as _file:
            txt = _file.read()
        articles.append(txt)
        sources.append('Fox')
        ids.append(article_id)


100%|██████████| 1024/1024 [00:00<00:00, 16133.94it/s]


In [7]:
# Column name -> values mapping for the Fox frame; the three lists were
# filled in lockstep by the loading loop, so they line up row by row.
fox_data = {
    "article id": ids,
    "source": sources,
    "article": articles,
}

In [8]:
# Build the Fox DataFrame from the column dict and show its last two rows.
fox_df = pd.DataFrame(fox_data)
fox_df.tail(2)

Unnamed: 0,article id,source,article
1022,fox_politics_304,Fox,Video\nWhat is the 'Green New Deal' proposal d...
1023,fox_politics_402,Fox,The film “Black Panther” depicted conservative...


## Read Vox

In [9]:
# Make the Vox folder the working directory: the os.listdir() and open()
# calls in the following cells use bare names resolved against it.
os.chdir(DATASET_PATH + VOX)

In [10]:
# List every entry of the current working directory, report how many there
# are, and preview the first five names.
# NOTE: the printed count covers all directory entries, including any file
# the loading loop later skips (it excludes names containing 'urls').
files_to_read = os.listdir()
print(f"There are {len(files_to_read)} articles")
files_to_read[:5]

There are 2001 articles


['vox_politics_396.txt',
 'vox_politics_372.txt',
 'vox_politics_602.txt',
 'vox_politics_1198.txt',
 'vox_politics_682.txt']

In [11]:
# Read every Vox article named in `files_to_read` (bare file names, resolved
# against the current working directory) into three parallel lists:
#   articles[i] - full text, sources[i] - outlet label, ids[i] - file stem.
articles = []
sources = []
ids = []
for f in tqdm(files_to_read):
    # Test the real extension (a substring check would also accept names such
    # as 'notes.txt.bak') and skip any file whose name contains 'urls'.
    if f.endswith('.txt') and ('urls' not in f):
        article_id = f[:-len('.txt')]
        # Decode explicitly so the result does not depend on the platform or
        # locale default. NOTE(review): assumes the files are UTF-8 - confirm.
        with open(f, 'r', encoding='utf-8') as _file:
            txt = _file.read()
        articles.append(txt)
        sources.append('Vox')
        ids.append(article_id)

100%|██████████| 2001/2001 [00:00<00:00, 20481.64it/s]


In [12]:
# Column name -> values mapping for the Vox frame; the three lists were
# filled in lockstep by the loading loop, so they line up row by row.
vox_data = {
    "article id": ids,
    "source": sources,
    "article": articles,
}

In [13]:
# Build the Vox DataFrame from the column dict and show its first two rows.
vox_df = pd.DataFrame(vox_data)
vox_df.head(2)

Unnamed: 0,article id,source,article
0,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...
1,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov...."


## Read PBS

In [14]:
# Make the PBS folder the working directory: the os.listdir() and open()
# calls in the following cells use bare names resolved against it.
os.chdir(DATASET_PATH + PBS)

In [15]:
# Known data issue: the PBS articles were accidentally saved with a 'vox'
# tag in their file names, so the names listed here start with 'vox_'.
# The loading loop in the next code cell rewrites 'vox' to 'pbs' in the ids.

# List every entry of the current working directory, report how many there
# are, and preview the first five names.
files_to_read = os.listdir()
print(f"There are {len(files_to_read)} articles")
files_to_read[:5]

There are 1753 articles


['vox_politics_396.txt',
 'vox_politics_372.txt',
 'vox_politics_602.txt',
 'vox_politics_1198.txt',
 'vox_politics_682.txt']

In [16]:
# Read every PBS article named in `files_to_read` (bare file names, resolved
# against the current working directory) into three parallel lists:
#   articles[i] - full text, sources[i] - outlet label, ids[i] - article id.
articles = []
sources = []
ids = []
for f in tqdm(files_to_read):
    # Test the real extension (a substring check would also accept names such
    # as 'notes.txt.bak') and skip any file whose name contains 'urls'.
    if f.endswith('.txt') and ('urls' not in f):
        # The PBS files were saved with a 'vox' tag in their names; swap it
        # for 'pbs' so the ids do not collide with the real Vox ids.
        article_id = f[:-len('.txt')].replace('vox', 'pbs')
        # Decode explicitly so the result does not depend on the platform or
        # locale default. NOTE(review): assumes the files are UTF-8 - confirm.
        with open(f, 'r', encoding='utf-8') as _file:
            txt = _file.read()
        articles.append(txt)
        sources.append('PBS')
        ids.append(article_id)

100%|██████████| 1753/1753 [00:00<00:00, 19045.86it/s]


In [17]:
# Column name -> values mapping for the PBS frame; the three lists were
# filled in lockstep by the loading loop, so they line up row by row.
pbs_data = {
    "article id": ids,
    "source": sources,
    "article": articles,
}

In [18]:
# Build the PBS DataFrame from the column dict and show its first two rows.
pbs_df = pd.DataFrame(pbs_data)
pbs_df.head(2)

Unnamed: 0,article id,source,article
0,pbs_politics_396,PBS,President Donald Trump’s longtime personal law...
1,pbs_politics_372,PBS,WASHINGTON — Facing a midnight deadline to avo...


## Merge DataFrames

In [26]:
# Start the combined frame from the Fox articles.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so take
# a copy of fox_df instead of appending it to an empty frame; the resulting
# rows, columns and index labels are the same.
df = fox_df.copy()

In [27]:
# Stack the three outlet frames in Fox, Vox, PBS order.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0). Building from the per-outlet frames rather than from `df` itself
# means re-running this cell cannot duplicate rows.
# Each frame keeps its own index labels, exactly as the append calls did,
# so labels repeat across outlets in the combined frame.
df = pd.concat([fox_df, vox_df, pbs_df])

In [28]:
# Summary statistics (count / unique / top / freq) for the combined frame.
df.describe()

Unnamed: 0,article id,source,article
count,5024,5024,5024
unique,5024,3,1689
top,pbs_politics_1131,Vox,"Part of The 2018 midterm elections, explained"
freq,1,2000,304


## Dump to CSV for Easy Read

In [29]:
# Switch to the dataset root and write the combined frame there as
# 'articles.csv', using '|' as the field separator.
os.chdir(DATASET_PATH)
df.to_csv('articles.csv', sep='|')