In [45]:
import pandas as pd
from bs4 import BeautifulSoup

In [52]:
import os
import sys

# get file names of files in data. __file__ doesn't work in ipynb
data_dir = os.path.join(os.getcwd(), 'data')
data_files = os.listdir(data_dir)

In [53]:
data_files

['al.csv', 'jpost_news.csv', 'nyt.csv', 'jpost_conflict.csv', 'bbc.csv']

In [54]:
#import data into pandas dataframes from data folder for each file in folder
data = {}
for file in data_files:
    data[file] = pd.read_csv(os.path.join(data_dir, file))

In [55]:
data.keys()

dict_keys(['al.csv', 'jpost_news.csv', 'nyt.csv', 'jpost_conflict.csv', 'bbc.csv'])

### Al Jazeera

In [35]:
#remove all rows where in_title is null or article_text is null
data['al.csv'] = data['al.csv'].dropna(subset=['link', 'article_text'])

In [38]:
# join the text from article_text based off in_title
alj_joined_articles = data['al.csv'].groupby('link')['article_text'].apply(' '.join).reset_index()

In [39]:
alj_joined_articles

Unnamed: 0,link,article_text
0,"50,000 Gaza children require urgent treatment ...",The UN agency for Palestinian refugees (UNRWA)...
1,8 Israeli soldiers killed in southern Gaza amb...,Hamas fighters killed eight Israeli soldiers t...
2,A Eurovision like no other: Israel’s war on Ga...,"Stockholm, Sweden - Malmo is rarely the centre..."
3,A House in Jerusalem holds the memories and gr...,Palestinian film director Muayad Alayan has ba...
4,A Jordanian flight to airdrop aid over a Gaza ...,"Amman, Jordan/sky above Gaza - A hulking milit..."
...,...,...
1090,‘We won’t stop’: The 24 hours Columbia etched ...,"New York, United States — At about 10pm on Mon..."
1091,‘Why is FIFA silent?’: Outraged Palestinians c...,"Palestinian football players, officials and fa..."
1092,‘World cannot afford Lebanon becoming another ...,United Nations Secretary-General Antonio Guter...
1093,‘World must not turn its back’ on Gaza as Isra...,The live page is now closed. You can continue ...


### Jerusalem Post

In [86]:
data["jpost_conflict.csv"]

Unnamed: 0,PublishDate,Title,Body,CategoryName
0,2024-06-23 18:22:53.147,"IDF, security forces rescue gazelles and confi...",<p>IDF soldiers arrested wanted individuals an...,Arab israeli conflict
1,2024-06-22 09:07:01.303,For second time this week: Local residents of ...,<p>An Israeli citizen in his 60s who entered <...,Arab israeli conflict
2,2024-06-21 10:46:41.027,Armenia unilaterally recognizes Palestinian st...,"<p><a href=""https://www.jpost.com/opinion/arti...",Arab israeli conflict
3,2024-06-20 17:10:36.087,Commemorating ‘Sayfo’: The untold genocide of ...,"<p>Every year on June 15, <a href=""https://www...",Arab israeli conflict
4,2024-06-20 10:07:39.147,Hezbollah war alert level rising: Is the IDF r...,"<p>Amos Hochstein, the envoy and mediator of t...",Arab israeli conflict
...,...,...,...,...
1995,2020-11-22 15:29:22.203,Gaza@@@s health system days from being overwhe...,<div>GAZA - A sharp rise in coronavirus infect...,Arab israeli conflict
1996,2020-11-22 14:31:29.173,Iran vows to crush any Israeli attempt to hit ...,"Iran on Sunday vowed to defeat any <a href=""ht...",Arab israeli conflict
1997,2020-11-22 21:17:40.527,"It wasn@@@t lightning silly, the rockets were ...","Two <a href=""https://www.jpost.com/breaking-ne...",Arab israeli conflict
1998,2020-11-22 18:57:52.497,Is there a new phase for Iran in Syria?,"Last week, a new round of Israeli <a href=""htt...",Arab israeli conflict


In [79]:
#remove all rows where Title or Body is null
jpost_body = data['jpost_conflict.csv'].dropna(subset=['Title', 'Body'])

In [80]:
#run beatiful soup on the body of the articles
jpost_body = jpost_body["Body"].apply(lambda x: BeautifulSoup(x, 'lxml').get_text(separator=' '))
jpost_articles = jpost_body.apply(lambda x: x.replace('\xa0', ''))
jpost_articles = pd.DataFrame(jpost_articles)
jpost_news = data['jpost_news.csv'].dropna(subset=['Title', 'Body'])

In [89]:
jpost_news_body = data["jpost_news.csv"].dropna(subset=['Title', 'Body'])
jpost_news_body = jpost_news_body["Body"].apply(lambda x: BeautifulSoup(x, 'lxml').get_text(separator=' '))
jpost_news_articles = jpost_news_body.apply(lambda x: x.replace('\xa0', ''))
jpost_news_articles = pd.DataFrame(jpost_news_articles)

In [92]:
jpost_articles = pd.concat([jpost_articles, jpost_news_articles], axis=0)

### BBC

In [110]:
data["bbc.csv"]

Unnamed: 0,Title,Link,Published_Date,Body,Newspaper
0,Donald Trump's Middle East peace plan: The Isr...,https://www.bbc.com/news/newsbeat-51294755,2020-01-29T11:21:42.000Z,"The ""deal of the century"". The ""last opportuni...",BBC
1,What does Trump’s Middle East plan say on key ...,https://www.bbc.com/news/world-middle-east-512...,2020-01-29T17:53:26.000Z,US President Donald Trump has unveiled his lon...,BBC
2,Israel attack: PM says Israel at war after 250...,https://www.bbc.com/news/world-middle-east-516...,2020-02-24T02:26:17.000Z,The Israeli military has carried out air strik...,BBC
3,Elderly Israelis beat isolation with tech lessons,https://www.bbc.com/news/blogs-news-from-elsew...,2020-04-06T14:20:23.000Z,Senior citizens in Israel stuck in their homes...,BBC
4,"Explainer: Israel, annexation and the West Bank",https://www.bbc.com/news/world-middle-east-527...,2020-06-16T17:27:43.000Z,Israeli Prime Minister Benjamin Netanyahu says...,BBC
...,...,...,...,...,...
1204,Israel confirms deaths of four more hostages i...,https://www.bbc.com/news/articles/c722r0p31x7o,2024-06-15T18:40:48.022Z,Eight Israeli soldiers were killed in a blast ...,BBC
1205,Gaza war: Dozens reported killed in Israeli st...,https://www.bbc.com/news/articles/c722r0p31x7o,2024-06-15T18:40:48.022Z,Eight Israeli soldiers were killed in a blast ...,BBC
1206,"Palestinian state recognised by Ireland, what ...",https://www.bbc.com/news/articles/c722r0p31x7o,2024-06-15T18:40:48.022Z,Eight Israeli soldiers were killed in a blast ...,BBC
1207,Military ‘pause’ on Gaza road inflames divisio...,https://www.bbc.com/news/articles/c2992985yk1o,2024-06-16T13:45:22.056Z,When is a ceasefire not a ceasefire? According...,BBC


In [116]:
bbc= data["bbc.csv"].dropna(subset=['Title', 'Body'])
#return body where body is unique
bbc_body = bbc["Body"].drop_duplicates()

In [117]:
bbc_body

0       The "deal of the century". The "last opportuni...
1       US President Donald Trump has unveiled his lon...
2       The Israeli military has carried out air strik...
3       Senior citizens in Israel stuck in their homes...
4       Israeli Prime Minister Benjamin Netanyahu says...
                              ...                        
1202    The UN children's agency Unicef has told the B...
1203    The US has imposed sanctions on an Israeli gro...
1204    Eight Israeli soldiers were killed in a blast ...
1207    When is a ceasefire not a ceasefire? According...
1208    Israel’s Prime Minister Benjamin Netanyahu has...
Name: Body, Length: 1105, dtype: object

### New York Times

In [120]:
nyt_data = data["nyt.csv"].dropna(subset=['Title', 'Body Text'])

In [131]:
data["nyt.csv"][data["nyt.csv"]['Body Text'].isnull()]

Unnamed: 0,Title,Link,Published_Date,Abstract,Lead Paragraph,Body Text,Newspaper
164,"In House Hearing, Republicans Demand Disciplin...",https://www.nytimes.com/2024/05/23/us/house-he...,2024-05-23T20:30:25+0000,"Leaders of Northwestern, U.C.L.A. and Rutgers,...","House Republicans used words like “violence,” ...",,The New York Times
328,Aid Group Calls Halts in Funding for UNRWA ‘Re...,https://www.nytimes.com/2024/01/30/world/middl...,2024-01-30T07:25:46+0000,The U.S. and some other donor countries said t...,Aid groups working in Gaza expressed dismay at...,,The New York Times
491,Inside the Pro-Palestinian Group Protesting Ac...,https://www.nytimes.com/2023/11/17/us/students...,2023-11-17T19:25:47+0000,"Students for Justice in Palestine, which was f...","After last month’s attack on Israel by Hamas, ...",,The New York Times
492,U.S. Urges Israel to Do More to Spare Civilian...,https://www.nytimes.com/2023/12/18/world/middl...,2023-12-18T23:12:48+0000,The American defense secretary reiterated “uns...,Top United States officials prodded Israel on ...,,The New York Times
493,How a 6-Second Video Turned a Campus Protest I...,https://www.nytimes.com/2023/12/18/nyregion/co...,2023-12-18T08:00:29+0000,A pro-Palestinian protest at the Cooper Union ...,"In the six-second video clip, pro-Palestinian ...",,The New York Times
...,...,...,...,...,...,...,...
1963,"With 102 Workers Killed, U.N. Agency in Gaza S...",https://www.nytimes.com/2023/12/05/us/politics...,2023-12-05T23:36:51+0000,While the president has backed Israel’s right ...,"For two months, President Biden has strongly b...",,The New York Times
1964,"Biden, Caught in Political Cross Currents, Nav...",https://www.nytimes.com/2023/10/09/world/middl...,2023-10-09T18:41:42+0000,"Israel mobilized 300,000 reservists amid signs...",Israel ordered a “complete siege” of the Gaza ...,,The New York Times
1965,Israel Softened Its Demands in Cease-Fire Nego...,https://www.nytimes.com/2023/11/19/world/middl...,2023-11-19T10:01:14+0000,"Since 2006, Gershon Baskin, an Israeli peace a...","For 17 years, on and off, two men maintained a...",,The New York Times
1966,Israel’s Army Is Ready to Invade Gaza. Its Div...,https://www.nytimes.com/2023/10/07/world/middl...,2023-10-07T22:00:21+0000,Palestinian militants from Gaza launched an ea...,Israel battled on Saturday to repel one of the...,,The New York Times


In [139]:
#drop if body text contains "We are having trouble retrieving the article content"
nyt_data = nyt_data[~nyt_data['Body Text'].str.contains("We are having trouble retrieving the article content")]

In [140]:
nyt_data

Unnamed: 0,Title,Link,Published_Date,Abstract,Lead Paragraph,Body Text,Newspaper
15,4 Israeli Soldiers Are Killed in Rafah in Appa...,https://www.nytimes.com/2024/06/11/world/middl...,2024-06-11T10:42:56+0000,Israel said the soldiers were killed when Hama...,The Israeli military said Tuesday that four Is...,Israel-HamasWar Advertisement Israel said the ...,The New York Times
21,Gantz quits Israel’s government in a dispute w...,https://www.nytimes.com/2024/06/10/world/middl...,2024-06-10T07:08:45+0000,The resignation exposed the divisions at the t...,"The Israeli politician Benny Gantz, a key memb...",Israel-HamasWar Advertisement Supported by The...,The New York Times
24,U.S. Intelligence Helped Israel Rescue Four Ho...,https://www.nytimes.com/2024/06/08/world/middl...,2024-06-08T15:56:08+0000,Intelligence collection and analysis teams fro...,The U.S. provided intelligence on the hostages...,Israel-HamasWar Advertisement Intelligence col...,The New York Times
28,Blinken to Visit Israel and Three Arab States ...,https://www.nytimes.com/2024/06/07/world/middl...,2024-06-07T18:34:11+0000,The U.S. secretary of state is expected to vis...,Secretary of State Antony J. Blinken plans to ...,Israel-HamasWar Advertisement The U.S. secreta...,The New York Times
30,"Israeli Hostage Rescue Fallout, and Far Right ...",https://www.nytimes.com/2024/06/10/podcasts/is...,2024-06-10T10:00:07+0000,"Plus, Apple’s Siri gets an A.I. makeover.",On Today’s Episode:,"For more audio journalism and storytelling,dow...",The New York Times
...,...,...,...,...,...,...,...
470,Settler Violence Against Palestinians in the W...,https://www.nytimes.com/2023/10/30/world/middl...,2023-10-30T22:17:20+0000,"More than 100 Palestinians have been killed, m...",Attacks on Palestinians in the occupied West B...,Israel-HamasWar Advertisement More than 100 Pa...,The New York Times
477,The New State of the War in Gaza,https://www.nytimes.com/2023/12/21/podcasts/th...,2023-12-21T11:00:03+0000,The accidental killing of three hostages by Is...,After the accidental killing of three hostages...,"For more audio journalism and storytelling,dow...",The New York Times
485,Protesters fill the streets in New York to sup...,https://www.nytimes.com/2023/10/28/nyregion/pa...,2023-10-29T01:55:09+0000,A large demonstration crossed the Brooklyn Bri...,Crowds of pro-Palestinian demonstrators packed...,Israel-HamasWar Advertisement A large demonstr...,The New York Times
488,"Biden’s Rating Dips on Gaza, and Marvel Drops ...",https://www.nytimes.com/2023/12/19/podcasts/bi...,2023-12-19T11:01:51+0000,Hear the news in five minutes.,The New York Times Audio app is home to journa...,"For more audio journalism and storytelling,dow...",The New York Times


There aren't that many nyt articles where the body actually exists, and even if they exist, the articles are behind a paywall. So I'm going to use the other three sources.