In [11]:
import collecting_news as news
import pandas as pd

In [None]:
# Fetch headlines for each selected news source; per the original note the
# results are saved as CSV (presumably the files read from ../data/ — TODO confirm).
# Looping over the source ids replaces three copy-pasted calls, so adding or
# removing a source is a one-line change.
for source_id in ('reuters', 'buzzfeed', 'entertainment-weekly'):
    news.get_headlines(source_id)

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# silence every warning for the rest of the session (no category/module filter is given)
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing the filter
warnings.filterwarnings('ignore')

In [13]:
# Load the three saved headline files into one frame per source.
# Reuters is used as the worked example for exploring and cleaning the data.
reuters, buzzfeed, ew = map(
    pd.read_csv,
    ('../data/reuters.csv', '../data/buzzfeed.csv', '../data/entertainment-weekly.csv'),
)
reuters.head()

Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,Reuters,Germany's Kuehne examines offer for Signa's Ha...,German logistics entrepreneur Klaus-Michael Ku...,https://www.reuters.com/business/germanys-kueh...,https://www.reuters.com/resizer/-J5TEXoXkkAHtd...,2023-11-26T19:54:14Z,"FRANKFURT, Nov 26 (Reuters) - German logistics...",reuters,Reuters
1,Reuters,Shoppers click 'buy' as retailers slash prices...,Holiday shoppers in the U.S. are seeking out t...,https://www.reuters.com/business/shoppers-clic...,https://www.reuters.com/resizer/4vOemJyv1FmpNA...,2023-11-26T11:15:34Z,Nov 26 (Reuters) - Holiday shoppers in the U.S...,reuters,Reuters
2,Reuters,US Black Friday sales rise 2.5% -Mastercard Sp...,"Mastercard <a href=""https://www.reuters.com/ma...",https://www.reuters.com/business/retail-consum...,https://www.reuters.com/resizer/WpbZDguFey14nL...,2023-11-25T18:23:30Z,Nov 25 (Reuters) - Mastercard (MA.N) Spendingp...,reuters,Reuters
3,Reuters,X may lose up to $75M by year-end on advertise...,Elon Musk-owned social media company X could l...,https://www.reuters.com/technology/x-may-lose-...,https://www.reuters.com/resizer/Nbq-yd_5BSo_kH...,2023-11-25T14:07:10Z,Nov 24 (Reuters) - Elon Musk-owned social medi...,reuters,Reuters
4,Uditha Jayasinghe,Sri Lanka to OK Sinopec's $4.5 bln refinery pr...,Sri Lanka will likely approve on Monday a prop...,https://www.reuters.com/business/energy/sri-la...,https://www.reuters.com/resizer/mtv0deZMUYoKyr...,2023-11-25T13:45:45Z,"COLOMBO, Nov 25 (Reuters) - Sri Lanka will lik...",reuters,Reuters


In [14]:
# understanding the data through the following functions:
# info() lists each column with its non-null count and dtype
reuters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       100 non-null    object
 1   title        100 non-null    object
 2   description  100 non-null    object
 3   url          100 non-null    object
 4   urlToImage   100 non-null    object
 5   publishedAt  100 non-null    object
 6   content      100 non-null    object
 7   source.id    100 non-null    object
 8   source.name  100 non-null    object
dtypes: object(9)
memory usage: 7.2+ KB


In [15]:
# summary of the non-numeric columns, transposed so each column becomes a row
reuters.describe(exclude=np.number).transpose()

Unnamed: 0,count,unique,top,freq
author,100,51,Reuters,44
title,100,100,Germany's Kuehne examines offer for Signa's Ha...,1
description,100,100,German logistics entrepreneur Klaus-Michael Ku...,1
url,100,100,https://www.reuters.com/business/germanys-kueh...,1
urlToImage,100,97,https://www.reuters.com/pf/resources/images/re...,4
publishedAt,100,99,2023-11-18T15:05:12Z,2
content,100,100,"FRANKFURT, Nov 26 (Reuters) - German logistics...",1
source.id,100,1,reuters,100
source.name,100,1,Reuters,100


In [16]:
# dropping irrelevant columns: the same seven columns are removed from every source
unused_columns = ['author','description','url','urlToImage','publishedAt','content','source.name']
reuters = reuters.drop(columns=unused_columns)
buzzfeed = buzzfeed.drop(columns=unused_columns)
ew = ew.drop(columns=unused_columns)

In [35]:
# renaming columns: title -> headline, source.id -> source (same mapping for every source)
new_names = {'title':'headline','source.id':'source'}
reuters = reuters.rename(columns=new_names)
buzzfeed = buzzfeed.rename(columns=new_names)
ew = ew.rename(columns=new_names)

In [34]:
# number of fully duplicated rows per source, printed in the order reuters, buzzfeed, ew
print(*(frame.duplicated().sum() for frame in (reuters, buzzfeed, ew)))


0 0 0


In [20]:
# Reuters is the baseline for non-sensational headlines, so every row is labelled 0
reuters = reuters.assign(sensational=0)

In [37]:
# BuzzFeed and Entertainment Weekly headlines are treated as sensational: label 1
buzzfeed = buzzfeed.assign(sensational=1)
ew = ew.assign(sensational=1)

In [25]:
# Show every row when a frame is displayed, so non-English headlines can be
# spotted by eye.
pd.set_option('display.max_rows', None)

# Index labels of the Reuters headlines picked out manually as non-English.
# NOTE(review): these labels are tied to the current ../data/reuters.csv snapshot —
# re-check them if the data is re-fetched.
non_english_rows = [14, 96]

# errors='ignore' keeps this cell re-runnable: once the rows are gone, running it
# again is a no-op instead of raising KeyError.
reuters = reuters.drop(index=non_english_rows, errors='ignore')

In [42]:
# Shorten the source label from 'entertainment-weekly' to 'ew'.
# Series.replace leaves non-matching values untouched, whereas Series.map with a
# dict turns them into NaN — so re-running this cell (when the column already
# holds 'ew') no longer wipes the column.
ew['source'] = ew['source'].replace({'entertainment-weekly': 'ew'})

Unnamed: 0,headline,source,sensational
0,"Marty Krofft, 'King of Saturday Mornings,' die...",ew,1
1,Missing Irish filmmaker Ross McDonnell confirm...,ew,1
2,Why Daryl Hall is suing John Oates,ew,1
3,'Ahsoka' creator praises Hayden Christensen’s ...,ew,1
4,Cher's Macy's Thanksgiving Day Parade performa...,ew,1
5,Beyonce makes surprise appearance during Thank...,ew,1
6,Whoopi Goldberg reacts to Thanksgiving turkey ...,ew,1
7,Sean Penn praises Matthew Perry for speaking o...,ew,1
8,Daryl Hall filed a restraining order against J...,ew,1
9,Melissa Barrera dropped from 'Scream 7' after ...,ew,1


In [45]:
# stack the three labelled sources into one frame with a fresh 0..n-1 index
source_list = [reuters, buzzfeed, ew]
baseline = pd.concat(source_list).reset_index(drop=True)

In [46]:
# cap displayed rows at 10, then preview the combined frame
pd.options.display.max_rows = 10
baseline

Unnamed: 0,headline,source,sensational
0,Germany's Kuehne examines offer for Signa's Ha...,reuters,0
1,Shoppers click 'buy' as retailers slash prices...,reuters,0
2,US Black Friday sales rise 2.5% -Mastercard Sp...,reuters,0
3,X may lose up to $75M by year-end on advertise...,reuters,0
4,Sri Lanka to OK Sinopec's $4.5 bln refinery pr...,reuters,0
...,...,...,...
293,Matthew Perry stopped Chandler cheating storyl...,ew,1
294,Read Stephen King's 'Cujo' sequel in excerpt f...,ew,1
295,"Zac Efron, Jeremy Allen White on true story of...",ew,1
296,Black Panther stuntman and children killed in ...,ew,1


In [None]:
# save the combined, labelled headlines to CSV without the index column
baseline.to_csv(path_or_buf='baseline.csv', index=False)