# 1 - Straits Times Headlines Cleaning

This notebook takes the scraped URLs of all news articles from the sitemap of https://straitstimes.com and processes them into a dataset of news article headlines for training a neural network to generate its own headlines. It also generates a cleaned dataset for EDA.

In [1]:
import pandas as pd
import numpy as np
import json
# Show up to 150 characters per column when displaying frames
pd.set_option('display.max_colwidth', 150)

# Load the scraped sitemap, then discard its first row.
# NOTE(review): the reason for dropping row 0 is not evident from this notebook - confirm.
df = pd.read_csv("../data/stsitemap_20191001-093746.csv")
df = df.drop(index=df.index[0])

In [2]:
# Peek at five randomly chosen rows of the raw sitemap
df.sample(n=5)

Unnamed: 0,url,page
226127,https://www.straitstimes.com/lifestyle/arts/tears-that-cracked-the-great-wall,46
398078,https://www.straitstimes.com/world/africa/female-suicide-bomber-in-nigerias-maiduguri-kills-two-near-checkpoint,80
11423,https://www.straitstimes.com/world/ukraine-crisis-russian-navy-blocks-channel-between-crimea-and-russia,3
96742,https://www.straitstimes.com/world/europe/bodies-of-21-migrants-found-after-boats-sink-off-turkey,20
434496,https://www.straitstimes.com/business/banking/hsbc-adding-50-jobs-in-singapore-in-plan-for-asia-retail-wealth-headcount-boost,87


#### Drop URLs not from Straits Times

In [9]:
# List rows whose URL does not begin with the canonical Straits Times prefix.
# A literal prefix test is used instead of str.contains, which interprets the
# pattern as a regex ("." matches any character) and matches anywhere in the
# string. na=False treats a missing URL as non-matching, so it is listed too.
df.loc[~df['url'].str.startswith('https://www.straitstimes.com/', na=False)]

Unnamed: 0,url,page
275922,http://www.straitstimes.com/lifestyle/food/fratelli-at-rws-italian-cuisine-steeped-in-michelin-heritage?xtor=CS12-104-%5BST_Editorial_Tile%5D-%5BN...,56
296276,http://pubads.g.doubleclick.net/gampad/clk?id=4532408159&amp;iu=/5908/project,60
303335,http://pubads.g.doubleclick.net/gampad/clk?id=4551947981&amp;iu=/5908/project,61
316940,http://staff.straitstimes.com/world/unitedstates/gunman-was-treated-at-veterans-facility-before-he-killed-three-hostages-there-officials-say,64


In [11]:
# Keep only rows whose URL begins with the canonical Straits Times prefix.
# A literal prefix test is used instead of str.contains, which interprets the
# pattern as a regex ("." matches any character) and matches anywhere in the
# string. na=False makes rows with a missing URL get dropped as well.
# .copy() yields an independent frame so later column assignments are safe.
df = df.loc[df['url'].str.startswith('https://www.straitstimes.com/', na=False)].copy()

In [12]:
# Report how many URLs remain after filtering
print(f'Total Number of URLS: {len(df)}')

Total Number of URLS: 500597


#### Generate headlines and categories from URLs

In [8]:
def clean_headline(url):
    '''Generate a headline from the last path segment (slug) of a URL.

    The slug is split on "-" and the first character of every word is
    upper-cased; the remaining characters are left untouched. Any query
    string ("?...") or fragment ("#...") is discarded first so that it does
    not leak into the headline.

    Args:
        url: Article URL as a string.

    Returns:
        The space-separated headline. If ``url`` is not a string (e.g. a
        missing value), the problem is printed and ``url`` is returned
        unchanged.
    '''
    try:
        # Drop query string and fragment before locating the slug
        path = url.split('?', 1)[0].split('#', 1)[0]
        slug = path.split('/')[-1]
        # Caps first letter of every word; slicing keeps empty tokens
        # (from consecutive hyphens) as empty strings without a length check
        words = [w[:1].upper() + w[1:] for w in slug.split('-')]
        return ' '.join(words)

    except (AttributeError, TypeError) as e:
        # Non-string input: best effort, report it and pass the value through
        print(url, e)
        return url

In [9]:
def clean_category(url):
    '''Generate a category from the path segments that precede the slug.

    The "https://www.straitstimes.com/" prefix is removed, the final path
    segment (the article slug) is dropped, and the first character of each
    remaining segment is upper-cased before re-joining with "/". Any query
    string ("?...") or fragment ("#...") is discarded first so that a "/"
    inside it cannot be mistaken for a path separator.

    Args:
        url: Article URL as a string.

    Returns:
        The category path, e.g. "Asia/Se-asia"; an empty string when the
        article sits directly under the site root. If ``url`` is not a
        string (e.g. a missing value), the problem is printed and ``url``
        is returned unchanged.
    '''
    try:
        # Drop query string and fragment before splitting the path
        path = url.split('?', 1)[0].split('#', 1)[0]
        sections = path.replace('https://www.straitstimes.com/', '').split('/')[:-1]
        # Caps first letter of every section; slicing handles empty sections
        sections = [s[:1].upper() + s[1:] for s in sections]
        return '/'.join(sections)

    except (AttributeError, TypeError) as e:
        # Non-string input: best effort, report it and pass the value through
        print(url, e)
        return url

#### Test functions on one sample

In [14]:
# Spot-check both helpers on a single randomly drawn URL
sample = df['url'].sample(n=1).iat[0]
print(f'URL: {sample}')
print(f'Headline: {clean_headline(sample)}')
print(f'Category: {clean_category(sample)}')

URL: https://www.straitstimes.com/opinion/asean-seize-initiative-to-act-now-on-n-e-asia-south-asia
Headline: Asean Seize Initiative To Act Now On N E Asia South Asia
Category: Opinion


#### Generate headlines for all URLs

In [15]:
# Derive a headline for every URL (the helper is passed directly; no wrapper lambda needed)
df['Headline'] = df['url'].apply(clean_headline)

In [16]:
# Eyeball ten random generated headlines
df['Headline'].sample(n=10)

469850    Tiananmen Square Protests Made The Government More Responsive To Peoples Needs
57771                                                                       Style News 3
349944            Hundreds Of Thousands Evacuated In Japan As Historic Rain Falls 2 Dead
191911             Bush Daughters Advise Malia And Sasha Obama On Life After White House
269958                                            Fuzzy Pains And Joys Of Being A Family
128305                              Security Forces On Alert As Filipinos Pick President
157370                          China Launches Jet Engine Conglomerate In Aerospace Push
69350                                                       Whats News September 21 2015
152057                              30 Years On Control Shows Janet Jackson Is Essential
209288              Dutch Mayor Gives Go Ahead To Pro Turkish Rally Amid Row With Turkey
Name: Headline, dtype: object

#### Check and remove duplicates

In [18]:
# Count headlines that repeat an earlier one (True) versus first occurrences (False)
df['Headline'].duplicated(keep='first').value_counts()

False    497113
True       3484
Name: Headline, dtype: int64

In [19]:
# Keep only the first occurrence of each headline.
# drop_duplicates works positionally, so unlike dropping by index label it
# stays correct even if index labels were ever to repeat, and it returns a
# new frame instead of mutating in place.
df = df.drop_duplicates(subset='Headline', keep='first')

#### Generate categories for all URLs

In [20]:
# Derive a category for every URL (the helper is passed directly; no wrapper lambda needed)
df['Category'] = df['url'].apply(clean_category)

In [21]:
# Eyeball ten random generated categories
df['Category'].sample(n=10)

467837                  Asia/Se-asia
477334    Business/Companies-markets
13992                          World
104907                  World/Europe
484278                     Singapore
61182            Singapore/Education
41745                          World
24497                          World
478875                  World/Africa
195347                  Asia/Se-asia
Name: Category, dtype: object

#### Create word count and character count features

In [24]:
def count_words(text):
    '''Count the words in a headline.

    Splits on runs of whitespace, so consecutive spaces (produced when a URL
    slug contains "--") do not add phantom empty words, and an empty headline
    counts as zero words rather than one.
    '''
    return len(text.split())

def count_chars(text):
    '''Return the number of characters in a headline, whitespace included.'''
    n_chars = len(text)
    return n_chars

# Attach per-headline length features (helpers passed directly; no wrapper lambdas needed)
df['Wordcount'] = df['Headline'].apply(count_words)
df['Charcount'] = df['Headline'].apply(count_chars)

#### Inspect 95th percentile for word count

In [26]:
# Ten random headlines whose word count is at or above the 95th percentile
df.loc[df['Wordcount'] >= df['Wordcount'].quantile(0.95), 'Headline'].sample(n=10)

15965           Government To Unveil Scheme To Help About 170000 Low Income Households In Digital Tv
105004           Sias Stake In Tiger Crosses 90 Mark Rest Of Shareholders Have Till Feb 19 To Accept
266410               We Have A Problem With Our Neighbours Qatar Emir Tells Trump Who Predicts Quick
256101       Singaporean Consumers More Confident In First Six Months Of 2017 After 2016s Low Survey
33993           Bo Xilai Trial Bos Other Son Thanks China For Allowing Father To Speak Freely Report
242539    Netlink Nbn Trust Set To Be Biggest Ipo In Singapore In Six Years With Pricing At 81 Cents
335035                      Football Dont Cry For Me Says Injured Dani Alves Who Promises To Be Back
38610     Police Break Into House Arrest Teenagers After Two Hour Standoff Arising From Rent Dispute
390446                  Porn Star Stormy Daniels Is Ordered To Pay Trump 400000 In Fees After Losing
293969          Japan To Clamp Down On Suicide Sites And Stress More Proactive Support Afte

#### Inspect Headlines with the least words (5th percentile)

In [294]:
# Ten random headlines whose word count is at or below the 5th percentile
df.loc[df['Wordcount'] <= df['Wordcount'].quantile(0.05), 'Headline'].sample(n=10)

64842               All The Single Seat Women
166766                  Market Highlights 180
117902                         On Facebook 68
247291                      Next 48 Hours 744
444581             Kicking His Way To Fitness
135291                    Alis Key Milestones
73361                       Next 48 Hours 106
294258           Osa Case How Events Unfolded
91243     Fitting Gift For Juronghealth Staff
197624               In Pictures Puglia Italy
Name: Headline, dtype: object

Inspecting the headlines with the fewest words reveals that many are in fact names of recurring news segments such as 'Next 48 Hours' and 'On Facebook'. These segments were not caught as duplicates because each one has an incrementing number appended to the end of its sub-headline. They also tend to have a low word count.

We will attempt to remove these recurring news segments from the dataset.

In [27]:
# Recurring news segments such as "Next 48 Hours" and "Punchlines ..." carry a
# running number as their final word. Flag every headline whose last
# space-separated token consists only of digits so they can be removed.
df['last_word_digit'] = df['Headline'].apply(lambda title: title.rsplit(' ', 1)[-1].isdigit())
# Preview flagged headlines that are also short (word count at or below the 5th percentile)
df.loc[(df['Wordcount'] <= df['Wordcount'].quantile(0.05)) & df['last_word_digit'], 'Headline'].sample(n=10)

189109    Go Ahead Quote Me 460
390975       Next 48 Hours 1216
350582    Whats Next Jul 9 2018
148405          Music Charts 56
68254     Upcoming Sme Events 4
228494         Top 10 Movies 93
207052            Food Picks 36
156657    Go Ahead Quote Me 356
79189              Style News 9
307075             Hot Bods 120
Name: Headline, dtype: object

#### Drop the recurring news segments

In [29]:
# Remove short headlines (word count at or below the 5th percentile) that end in a number
df.drop(
    index=df.loc[(df['Wordcount'] <= df['Wordcount'].quantile(0.05)) & df['last_word_digit']].index,
    inplace=True,
)

#### Remove Other Repetitive headlines

In [30]:
# For each recurring title, keep the first matching headline and drop the rest.
# One loop replaces three copy-pasted statements; regex=False makes the match
# a literal substring test rather than a regular-expression search.
for segment_title in (
    "Singapore Shares Open",
    "The Straits Times News In A Minute ",
    "Top Stories From The Straits Times ",
):
    repeated = df.loc[df['Headline'].str.contains(segment_title, regex=False)].index
    # [1:] skips the first match so one example of each segment survives
    df.drop(index=repeated[1:], inplace=True)

#### Shuffle rows

In [31]:
# Shuffle all rows and renumber the index from 0.
# A fixed seed makes the shuffled order (and therefore the exported files)
# reproducible across Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

#### Export cleaned dataset for training

In [34]:
# Write one headline per line as plain text.
# The lines are written directly rather than through to_csv, because CSV
# quoting would wrap any headline containing a comma or quote character in
# double quotes and so inject characters that are not part of the headline
# into the training text.
with open('../data/st_headlines.txt', 'w') as f:
    f.writelines(f'{headline}\n' for headline in df['Headline'])

#### Export mappings from text to id and vice versa

In [26]:
# Build the character vocabulary: every distinct character in the exported
# headlines file, in sorted order.
with open('../data/st_headlines.txt', 'r') as f:
    vocab = sorted(set(f.read()))

# character -> integer id, stored as a JSON object
with open('../data/st_char2idx.txt', 'w') as char2idx_f:
    char2idx = json.dumps(dict(zip(vocab, range(len(vocab)))))
    char2idx_f.write(char2idx)

# integer id -> character, stored as a JSON array (position is the id)
with open('../data/st_idx2char.txt', 'w') as idx2char_f:
    idx2char = json.dumps(vocab)
    idx2char_f.write(idx2char)

#### Export cleaned dataset for EDA

In [33]:
# Save only the columns needed for EDA, without the index
df.loc[:, ['Headline', 'Category', 'Wordcount', 'Charcount']].to_csv("../data/st_sitemap_clean.csv", index=False)