In [1]:
import numpy as np
import pandas as pd 
import nltk 
import string

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 800)

In [3]:
df = pd.read_csv('subset_headlines.csv')
print(df.head())

                          org                  scraped_at                                         headline_1                                         headline_2                                         headline_3
0   https://www.economist.com  2024-11-28 05:39:57.350818  Elon Musk’s xAI goes after OpenAI, https://www...  Does Donald Trump have unlimited authority to ...  Peace in Lebanon is just a start, https://www....
1  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174  With her latest production, French theater gia...  Tens of thousands in Lebanon head home as Isra...  Biden administration tells Ukraine to lower co...
2         https://www.bbc.com  2024-11-28 05:39:54.460652  Air raid alert across Ukraine as multiple expl...  The Lebanon ceasefire is a respite, not a solu...  Mexico leader responds to Trump claim she agre...
3   https://www.aljazeera.com  2024-11-28 05:39:53.667695  ‘Absolutely terrifying’: Israel pounds north G...  Dozens killed as armed groups attack Syrian mi

In [4]:
## Safely split
def safe_split(text):
    try:
        return text.split(' ')
    except:
        print(text)
        return [None, None]

In [5]:
df[['headline_1_title', 'headline_1_url']] = df['headline_1'].str.split('https://', n=1, expand=True)
df[['headline_2_title', 'headline_2_url']] = df['headline_2'].str.split('https://', n=1, expand=True)
df[['headline_3_title', 'headline_3_url']] = df['headline_3'].str.split('https://', n=1, expand=True)

df.drop(['headline_1', 'headline_2', 'headline_3'], axis=1, inplace=True)

print(df.head(10))

                          org                  scraped_at                                   headline_1_title                                     headline_1_url                                   headline_2_title                                     headline_2_url                                   headline_3_title                                     headline_3_url
0   https://www.economist.com  2024-11-28 05:39:57.350818                Elon Musk’s xAI goes after OpenAI,   www.economist.com/business/2024/11/27/elon-mus...  Does Donald Trump have unlimited authority to ...  www.economist.com/united-states/2024/11/27/doe...                 Peace in Lebanon is just a start,   www.economist.com/leaders/2024/11/27/peace-in-...
1  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174  With her latest production, French theater gia...  www.lemonde.fr/en/culture/article/2024/11/28/w...  Tens of thousands in Lebanon head home as Isra...  www.lemonde.fr/en/international/article/2024/1...  Biden adm

In [6]:
for i in ['headline_1_url', 'headline_2_url', 'headline_3_url']:
    df[i] = 'https://' + df[i].astype(str)
print(df.head(5))

                          org                  scraped_at                                   headline_1_title                                     headline_1_url                                   headline_2_title                                     headline_2_url                                   headline_3_title                                     headline_3_url
0   https://www.economist.com  2024-11-28 05:39:57.350818                Elon Musk’s xAI goes after OpenAI,   https://www.economist.com/business/2024/11/27/...  Does Donald Trump have unlimited authority to ...  https://www.economist.com/united-states/2024/1...                 Peace in Lebanon is just a start,   https://www.economist.com/leaders/2024/11/27/p...
1  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174  With her latest production, French theater gia...  https://www.lemonde.fr/en/culture/article/2024...  Tens of thousands in Lebanon head home as Isra...  https://www.lemonde.fr/en/international/articl...  Biden adm

In [7]:
melted = pd.melt(df, id_vars=['org', 'scraped_at'], value_vars=['headline_1_title', 'headline_2_title', 'headline_3_title'], var_name='headline_type', value_name='headline')
print(melted.head(10))

                          org                  scraped_at     headline_type                                           headline
0   https://www.economist.com  2024-11-28 05:39:57.350818  headline_1_title                Elon Musk’s xAI goes after OpenAI, 
1  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174  headline_1_title  With her latest production, French theater gia...
2         https://www.bbc.com  2024-11-28 05:39:54.460652  headline_1_title  Air raid alert across Ukraine as multiple expl...
3   https://www.aljazeera.com  2024-11-28 05:39:53.667695  headline_1_title  ‘Absolutely terrifying’: Israel pounds north G...
4   https://www.economist.com  2024-11-27 05:39:19.865585  headline_1_title    Trump wastes no time in reigniting trade wars, 
5  https://www.lemonde.fr/en/  2024-11-27 05:39:18.869199  headline_1_title  Israel and Lebanon's Hezbollah ceasefire takes...
6         https://www.bbc.com  2024-11-27 05:39:18.127691  headline_1_title  What we know about Israel-Hezbolla

In [8]:
print('melted')
print(melted.shape)
print('df')
print(df.shape)

melted
(300, 4)
df
(100, 8)


In [9]:
melted_urls = pd.melt(df, id_vars=['org', 'scraped_at'], value_vars=['headline_1_url', 'headline_2_url', 'headline_3_url'], var_name='headline_url', value_name='url')
print(melted_urls.head(10))
melted['url'] = melted_urls['url']
final_df = melted[['headline', 'url', 'org', 'scraped_at', 'headline_type']]

print(final_df.head(20))
print(final_df.shape)

                          org                  scraped_at    headline_url                                                url
0   https://www.economist.com  2024-11-28 05:39:57.350818  headline_1_url  https://www.economist.com/business/2024/11/27/...
1  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174  headline_1_url  https://www.lemonde.fr/en/culture/article/2024...
2         https://www.bbc.com  2024-11-28 05:39:54.460652  headline_1_url     https://www.bbc.com/news/articles/cwy15lp21l3o
3   https://www.aljazeera.com  2024-11-28 05:39:53.667695  headline_1_url  https://www.aljazeera.com/news/liveblog/2024/1...
4   https://www.economist.com  2024-11-27 05:39:19.865585  headline_1_url  https://www.economist.com/finance-and-economic...
5  https://www.lemonde.fr/en/  2024-11-27 05:39:18.869199  headline_1_url  https://www.lemonde.fr/en/international/articl...
6         https://www.bbc.com  2024-11-27 05:39:18.127691  headline_1_url     https://www.bbc.com/news/articles/cx2d3gj9ewxo


In [10]:
final_df['headline_type'] = final_df['headline_type'].str.extract(r'headline_(\d+)_title')
print(final_df.head(5))

                                            headline                                                url                         org                  scraped_at headline_type
0                Elon Musk’s xAI goes after OpenAI,   https://www.economist.com/business/2024/11/27/...   https://www.economist.com  2024-11-28 05:39:57.350818             1
1  With her latest production, French theater gia...  https://www.lemonde.fr/en/culture/article/2024...  https://www.lemonde.fr/en/  2024-11-28 05:39:55.247174             1
2  Air raid alert across Ukraine as multiple expl...     https://www.bbc.com/news/articles/cwy15lp21l3o         https://www.bbc.com  2024-11-28 05:39:54.460652             1
3  ‘Absolutely terrifying’: Israel pounds north G...  https://www.aljazeera.com/news/liveblog/2024/1...   https://www.aljazeera.com  2024-11-28 05:39:53.667695             1
4    Trump wastes no time in reigniting trade wars,   https://www.economist.com/finance-and-economic...   https://www.economist.co

In [11]:
final_df['org'] = final_df['org'].str.extract(r'https?:\/\/(?:www\.)?([a-zA-Z0-9-]+)')
print(final_df.head(5))

                                            headline                                                url        org                  scraped_at headline_type
0                Elon Musk’s xAI goes after OpenAI,   https://www.economist.com/business/2024/11/27/...  economist  2024-11-28 05:39:57.350818             1
1  With her latest production, French theater gia...  https://www.lemonde.fr/en/culture/article/2024...    lemonde  2024-11-28 05:39:55.247174             1
2  Air raid alert across Ukraine as multiple expl...     https://www.bbc.com/news/articles/cwy15lp21l3o        bbc  2024-11-28 05:39:54.460652             1
3  ‘Absolutely terrifying’: Israel pounds north G...  https://www.aljazeera.com/news/liveblog/2024/1...  aljazeera  2024-11-28 05:39:53.667695             1
4    Trump wastes no time in reigniting trade wars,   https://www.economist.com/finance-and-economic...  economist  2024-11-27 05:39:19.865585             1


In [12]:
#final_df.to_dict('records')

In [19]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

def remove_punct(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

def remove_stopwords(text):
    print(text)
    words = word_tokenize(text)
    without_stopwords = "".join([w for w in words if w not in stopwords.words('english')])
    print(without_stopwords)
    return without_stopwords

sent = 'This is a sample sentence, showing off the stop words filtration.'
print(remove_stopwords(sent))

final_df['headline'] = final_df['headline'].apply(lambda x: remove_punct(x))
final_df['headline'] = final_df['headline'].apply(lambda x: remove_stopwords(x))
print(final_df.head(5))

This is a sample sentence, showing off the stop words filtration.
Thissamplesentence,showingstopwordsfiltration.
Thissamplesentence,showingstopwordsfiltration.
ElonMusk’xAIgoesOpenAI
ElonMusk’xAIgoesOpenAI
WithlatestproductionFrenchtheatergiantArianeMnouchkinewarpath
WithlatestproductionFrenchtheatergiantArianeMnouchkinewarpath
AirraidalertacrossUkrainemultipleexplosionsreported
AirraidalertacrossUkrainemultipleexplosionsreported
‘Absolutelyterrifying’IsraelpoundsnorthGazaLebanontruceholds
‘Absolutelyterrifying’IsraelpoundsnorthGazaLebanontruceholds
Trumpwastestimereignitingtradewars
Trumpwastestimereignitingtradewars
IsraelLebanonsHezbollahceasefiretakeseffect
IsraelLebanonsHezbollahceasefiretakeseffect
WhatknowIsraelHezbollahceasefiredeal
WhatknowIsraelHezbollahceasefiredeal
‘Fragiletruce’LebanonpeacehopesIsraelHezbollahceasefirestarts
‘Fragiletruce’LebanonpeacehopesIsraelHezbollahceasefirestarts
Nobodyknowsultraprocessedfoodsbad
Nobodyknowsultraprocessedfoodsbad
TrumpsavedAmericanvo

[nltk_data] Downloading package stopwords to /home/rtseng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rtseng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
