In [1]:
import pandas as pd

In [8]:
# Load the tab-separated articles file into a DataFrame, decoding it as UTF-8.
source_path = 'dsjVoxArticles.tsv'
dataset = pd.read_csv(source_path, sep='\t', encoding='utf-8')

# Plain-text preview of the first ten rows, followed by (row count, column count).
preview = dataset.head(10)
print(preview)
print(dataset.shape)

                                               title               author  \
0  Bitcoin is down 60 percent this year. Here's w...       Timothy B. Lee   
1  6 health problems marijuana could treat better...         German Lopez   
2  9 charts that explain the history of global we...     Matthew Yglesias   
3  Remember when legal marijuana was going to sen...         German Lopez   
4  Obamacare succeeded for one simple reason: it'...          Sarah Kliff   
5  The best Obamacare data comes from a home offi...          Sarah Kliff   
6  The Republicansâ€™ plan to cut Medicaid, expla...          Sarah Kliff   
7  Obama is deporting more immigrants than any pr...            Dara Lind   
8        9 things you didn't know about income taxes  Danielle Kurtzleben   
9  Why are scientists trying to map every single ...       Susannah Locke   

             category       published_date           updated_on  \
0  Business & Finance  2014-03-31 14:01:30  2014-12-16 16:37:36   
1        War on Dr

In [15]:
# Removing NaN values
#
# Keep only the rows in which every text column used by the later cells is
# present, and renumber the surviving rows 0..n-1 in the same expression.
# reset_index is chained (rather than called with inplace=True) so that
# ds_cleaned is bound to a fresh frame and nothing is mutated in place.
required_columns = ['title', 'category', 'blurb', 'body', 'author']
ds_cleaned = dataset.dropna(subset=required_columns).reset_index(drop=True)
print(ds_cleaned.shape)

# Column handles consumed by the cleaning loop further down.
body = ds_cleaned['body']
title = ds_cleaned['title']
category = ds_cleaned['category']
blurb = ds_cleaned['blurb']
author = ds_cleaned['author']

# Sanity check: all five columns must report the same number of rows.
print("Body row count: ", len(body))
print("Title row count: ", len(title))
print("Category row count: ", len(category))
print("Blurb row count: ", len(blurb))
print("Author row count: ", len(author))

(23012, 8)
Body row count:  23012
Title row count:  23012
Category row count:  23012
Blurb row count:  23012
Author row count:  23012


In [18]:
# Removing non ascii characters
from bs4 import BeautifulSoup

# Defining a function
def non_ascii_removal(string):
    """Return ``string`` with each non-ASCII character replaced by one space.

    A character is kept when its code point is below 128; any other character
    is swapped for a single space, so the result has the same length as the
    input.
    """
    pieces = []
    for ch in string:
        if ord(ch) < 128:
            pieces.append(ch)
        else:
            pieces.append(' ')
    return ''.join(pieces)

In [24]:
# Cleaned values, one entry per row of ds_cleaned, in row order.
ds_body = []
ds_title = []
ds_blurb = []
ds_category = []
ds_author = []

# Walk the five columns in lockstep. Every list receives exactly one entry per
# row regardless of the body's type, so the lists can never drift out of
# alignment with each other or with ds_cleaned.
rows = zip(body, title, blurb, category, author)
for i, (raw_body, raw_title, raw_blurb, raw_category, raw_author) in enumerate(rows):
    if isinstance(raw_body, str):
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser happens to be installed (and warns), which makes the extracted
        # text environment-dependent. 'html.parser' ships with Python.
        soup = BeautifulSoup(raw_body, 'html.parser')
        text = soup.get_text()
        # NOTE(review): r'\n' is the two-character sequence backslash + 'n',
        # not a newline, so real line breaks are left in place. Kept as-is;
        # confirm whether stripping actual newlines was intended.
        cleanedText = text.replace(r'\n', "")
        ds_body.append(non_ascii_removal(cleanedText))
    else:
        # Non-string body: report the row and store an empty placeholder.
        print(i, ':', raw_body)
        ds_body.append('')

    # The remaining fields are cleaned for every row, including rows whose
    # body was not a string.
    ds_title.append(non_ascii_removal(raw_title))
    ds_blurb.append(non_ascii_removal(raw_blurb))
    ds_category.append(non_ascii_removal(raw_category))
    ds_author.append(non_ascii_removal(raw_author))

print('done')

done


In [26]:
# Report how many cleaned values were collected for each column; the five
# numbers are expected to match.
row_count_report = (
    ("Body row count: ", ds_body),
    ("Title row count: ", ds_title),
    ("Category row count: ", ds_category),
    ("Blurb row count: ", ds_blurb),
    ("Author row count: ", ds_author),
)
for label, cleaned in row_count_report:
    print(label, len(cleaned))

Body row count:  23012
Title row count:  23012
Category row count:  23012
Blurb row count:  23012
Author row count:  23012


In [31]:
# Write the cleaned text back into the matching columns of ds_cleaned, then
# preview the first rows of the updated frame.
cleaned_columns = {
    'body': ds_body,
    'title': ds_title,
    'blurb': ds_blurb,
    'category': ds_category,
    'author': ds_author,
}
for column_name, cleaned_values in cleaned_columns.items():
    ds_cleaned.loc[:, column_name] = cleaned_values
ds_cleaned.head()

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,Bitcoin is down 60 percent this year. Here's w...,Timothy B. Lee,Business & Finance,2014-03-31 14:01:30,2014-12-16 16:37:36,http://www.vox.com/2014/3/31/5557170/bitcoin-b...,Bitcoins have lost more than 60 percent of the...,The markets haven't been kind to Bitcoin in 20...
1,6 health problems marijuana could treat better...,German Lopez,War on Drugs,2014-03-31 15:44:21,2014-11-17 00:20:33,http://www.vox.com/2014/3/31/5557700/six-probl...,Medical marijuana could fill gaps that current...,Twenty states have so far legalized the medica...
2,9 charts that explain the history of global we...,Matthew Yglesias,Business & Finance,2014-04-10 13:30:01,2014-12-16 15:47:02,http://www.vox.com/2014/4/10/5561608/9-charts-...,These nine charts from Thomas Piketty's new bo...,Thomas Piketty's book Capital in the 21st Cent...
3,Remember when legal marijuana was going to sen...,German Lopez,Criminal Justice,2014-04-03 23:25:55,2014-05-06 21:58:42,http://www.vox.com/2014/4/3/5563134/marijuana-...,"Three months after legalizing marijuana, Denve...",When Colorado legalized recreational marijuana...
4,Obamacare succeeded for one simple reason: it'...,Sarah Kliff,Health Care,2014-04-01 20:26:14,2014-11-18 15:09:14,http://www.vox.com/2014/4/1/5570780/the-two-re...,"After a catastrophic launch, Obamacare still s...",There's a very simple reason that Obamacare hi...


In [35]:
# Save the cleaned frame as a tab-separated file, leaving out the row index.
output_path = 'Cleaned_Dataset.tsv'
ds_cleaned.to_csv(output_path, sep='\t', index=False)

In [33]:
# Report the size of the cleaned dataset as (rows, columns).
row_total, column_total = ds_cleaned.shape
print((row_total, column_total))

(23012, 8)
