# Web scraping NYTimes data

In [1]:
#import library
import requests
from bs4 import BeautifulSoup

#enter rss feed URL
url = "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"

# requests applies no timeout by default, so without one a stalled
# connection would hang this cell indefinitely.
resp = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page
# as if it were the feed.
resp.raise_for_status()

# Parse the raw response bytes with the XML parser (the feed is RSS, not HTML).
soup = BeautifulSoup(resp.content, features="xml")

In [2]:
# pretty-print the parsed feed to inspect its XML structure
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:nyt="http://www.nytimes.com/namespaces/rss/2.0">
 <channel>
  <title>
   NYT &gt; Top Stories
  </title>
  <link>
   https://www.nytimes.com
  </link>
  <atom:link href="https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" rel="self" type="application/rss+xml"/>
  <description/>
  <language>
   en-us
  </language>
  <copyright>
   Copyright 2020 The New York Times Company
  </copyright>
  <lastBuildDate>
   Sun, 08 Mar 2020 05:46:30 +0000
  </lastBuildDate>
  <pubDate>
   Sun, 08 Mar 2020 05:46:30 +0000
  </pubDate>
  <image>
   <title>
    NYT &gt; Top Stories
   </title>
   <url>
    https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png
   </url>
   <link>
    https://www.nytimes.com
   </link>
  </image>
  <item>
   <title>
    Italy Locks Down Much of the Country’s North Over the

In [3]:
# Collect every <item> element from the parsed feed.
# find_all is the current Beautiful Soup 4 method name; findAll is a
# deprecated legacy alias kept only for backward compatibility.
items = soup.find_all('item')

In [4]:
# dump the matched <item> tags to inspect which child tags each one carries
print(items)

[<item>
<title>Italy Locks Down Much of the Country’s North Over the Coronavirus</title>
<link>https://www.nytimes.com/2020/03/07/world/europe/coronavirus-italy.html</link>
<guid isPermaLink="true">https://www.nytimes.com/2020/03/07/world/europe/coronavirus-italy.html</guid>
<atom:link href="https://www.nytimes.com/2020/03/07/world/europe/coronavirus-italy.html" rel="standout"/>
<description>The restrictions affect Milan and the regions that serve as Italy’s economic engine, and are the most sweeping measures outside China.</description>
<dc:creator>Jason Horowitz</dc:creator>
<pubDate>Sun, 08 Mar 2020 05:36:10 +0000</pubDate>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">Italy</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">Lombardy (Italy)</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">Rome (Italy)</category>
<category domain="http://www.nytimes.com/namespaces/keywords/des">Coronavirus (2019-nC

In [5]:
# number of <item> elements found in the feed
len(items)

52

In [6]:
# take the second <item> (index 1) as a sample to examine
item = items[1]

In [7]:
# display the sample <item> to see its child tags
item

<item>
<title>Italy Locks Down Northern Region, and U.S. Coronavirus Cases Pass 400</title>
<link>https://www.nytimes.com/2020/03/08/world/coronavirus-news.html</link>
<guid isPermaLink="true">https://www.nytimes.com/2020/03/08/world/coronavirus-news.html</guid>
<atom:link href="https://www.nytimes.com/2020/03/08/world/coronavirus-news.html" rel="standout"/>
<description>Italy announced a plan to restrict the movement of a fourth of its population. New York declared a state of emergency. Here’s the latest.</description>
<dc:creator>The New York Times</dc:creator>
<pubDate>Sun, 08 Mar 2020 05:19:54 +0000</pubDate>
<media:content height="151" medium="image" url="https://static01.nyt.com/images/2020/03/07/world/07virus-briefing-italy24-promo/07virus-briefing-italy24-moth.jpg" width="151"/>
<media:credit>Matteo Corner/EPA, via Shutterstock</media:credit>
<media:description>A store employee on Friday in Milan, the largest city in the Lombardy area of Italy. </media:description>
</item>

In [8]:
#list that will hold one dict per news item
news_items = []

In [9]:
#scraping RSS (XML) tags: Title, Description, Link and publication date
# Build the list from scratch here rather than appending to a list created
# in another cell: re-running this cell then yields the same rows instead of
# adding a duplicate copy of every item.
news_items = [
    {
        'title': entry.title.text,
        'description': entry.description.text,
        'link': entry.link.text,
        'pubDate': entry.pubDate.text,
    }
    for entry in items
]

In [10]:
#build a pandas DataFrame from the scraped records, with a fixed column order
import pandas as pd
df = pd.DataFrame(
    data=news_items,
    columns=['title', 'description', 'link', 'pubDate'],
)

In [11]:
# preview the first five rows of the DataFrame
df.head()

Unnamed: 0,title,description,link,pubDate
0,Italy Locks Down Much of the Country’s North O...,The restrictions affect Milan and the regions ...,https://www.nytimes.com/2020/03/07/world/europ...,"Sun, 08 Mar 2020 05:36:10 +0000"
1,"Italy Locks Down Northern Region, and U.S. Cor...",Italy announced a plan to restrict the movemen...,https://www.nytimes.com/2020/03/08/world/coron...,"Sun, 08 Mar 2020 05:19:54 +0000"
2,"Inside Trump Administration, Debate Raged Over...",The administration’s response to the coronavir...,https://www.nytimes.com/2020/03/07/us/politics...,"Sun, 08 Mar 2020 05:36:04 +0000"
3,"China May Be Beating the Coronavirus, at a Pai...",Beijing says its heavy-handed measures are wor...,https://www.nytimes.com/2020/03/07/world/asia/...,"Sun, 08 Mar 2020 05:39:01 +0000"
4,Anyone Who Wants a Coronavirus Test Can Have O...,Key officials in recent days have corrected or...,https://www.nytimes.com/2020/03/07/us/politics...,"Sun, 08 Mar 2020 01:20:12 +0000"


In [12]:
#write the DataFrame to NYtimes.csv in the current working directory,
#UTF-8 encoded and without the index column
df.to_csv(
    'NYtimes.csv',
    encoding='utf-8',
    index=False,
)