In [2]:
import pandas as pd
import bs4
#import selenium
import requests
import re

In [35]:
url = "http://localhost:8000/students.html"
page = requests.get(url).text
soup = bs4.BeautifulSoup(page)
soup

<html>
<head>
<title>Web scraping</title>
</head>
<body>
<div class="student-details">
<p class="gl-jan" data-gender="f" id="student-10"> ABC-1 </p>
<p class="gl-jan" data-gender="m" id="student-11"> ABC-2 </p>
<p class="gl-jan" data-gender="f" id="student-12"> ABC-3 </p>
<p class="gl-jan" data-gender="m" id="student-13"> ABC-4 </p>
</div>
</body>
</html>

In [36]:
print(soup.title)

<title>Web scraping</title>


In [37]:
soup.title.text

'Web scraping'

In [39]:
soup.findAll('p')

[<p class="gl-jan" data-gender="f" id="student-10"> ABC-1 </p>,
 <p class="gl-jan" data-gender="m" id="student-11"> ABC-2 </p>,
 <p class="gl-jan" data-gender="f" id="student-12"> ABC-3 </p>,
 <p class="gl-jan" data-gender="m" id="student-13"> ABC-4 </p>]

In [41]:
tags = soup.findAll('p')
df = pd.DataFrame()
for tag in tags:
    name = tag.text
    student_id = tag.get('id')
    gender = tag.get('data-gender')
    temp = {
        'name': name,
        'student_id': student_id,
        'gender': gender,
    }
    df = df.append(temp, ignore_index=True)
df

Unnamed: 0,gender,name,student_id
0,f,ABC-1,student-10
1,m,ABC-2,student-11
2,f,ABC-3,student-12
3,m,ABC-4,student-13


In [44]:
soup.findAll('p', {'class': 'gl-jan'})

[<p class="gl-jan" data-gender="f" id="student-10"> ABC-1 </p>,
 <p class="gl-jan" data-gender="m" id="student-11"> ABC-2 </p>,
 <p class="gl-jan" data-gender="f" id="student-12"> ABC-3 </p>,
 <p class="gl-jan" data-gender="m" id="student-13"> ABC-4 </p>]

## Scraping wikipedia paragraphs

In [3]:
url = 'https://en.wikipedia.org/wiki/Data_science'
page = requests.get(url).text
soup = bs4.BeautifulSoup(page)

In [49]:
soup.title.text

'Data science - Wikipedia'

In [58]:
para_tags = soup.findAll('p')

para_texts = []
for tag in para_tags:
    para_texts.append(tag.text)
article = ' '.join(para_texts)
article = article.replace('\n','')
len(article)

6864

In [65]:
tables = soup.findAll('table')

In [68]:
df_tables = pd.read_html(url)
df_tables[1]

Unnamed: 0,vteData,vteData.1
0,Augmentation Analysis Archaeology Cleansing Co...,Augmentation Analysis Archaeology Cleansing Co...


In [4]:
!pip install html5lib



In [5]:
url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
tables = pd.read_html(url, flavor='html5lib')

In [77]:
df_cases = tables[6]

In [78]:
df_cases.to_excel('corona-cases.xlsx', sheet_name='country-cases')

In [None]:
#!pip install xlsxwriter
#writer = pd.ExcelWriter('corona-cases.xlsx', engine='xlswriter')
#df_cases.to_excel(writer, sheet_name="country-cases")

### Amazon webpages

In [13]:
url = "https://www.amazon.in/s?k=headphones&rh=n%3A1389401031&ref=nb_sb_noss_2"
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})

page = requests.get(url, headers=HEADERS).text
soup = bs4.BeautifulSoup(page)

In [14]:
soup.title

<title>Amazon.in : headphones</title>

### Columwise approach
Avoid this approach

In [21]:
title_tags = soup.findAll('span', {'class': 'a-size-medium a-color-base a-text-normal'})
titles = []
for tag in title_tags:
    titles.append(tag.text)
len(titles)

30

In [24]:
price_tags = soup.findAll('span', {'class': 'a-price-whole'})
prices = []
for tag in price_tags:
    prices.append(tag.text)
len(prices)

29

In [25]:
parent_tags = soup.findAll('div', {'class': 's-result-item'})
len(parent_tags)

35

### Row-wise approach

In [37]:
df_products = pd.DataFrame()
for parent in parent_tags:
    try:
        name = parent.find('span', {'class': 'a-size-medium a-color-base a-text-normal'}).text
    except:
        name = None
    try:
        price = parent.find('span', {'class': 'a-price-whole'}).text
    except:
        price = None
    
    try:
        image_src = parent.find('img', {'class': 's-image'}).get('src')
    except:
        image_src = None
    temp = {
        'name': name,
        'price': price,
        'image_src': image_src
    }
    df_products = df_products.append(temp, ignore_index=True)
df_products[['name', 'price']].dropna(how='all').head()

Unnamed: 0,name,price
1,"Infinity Glide 500 by Harman (JBL, HK, Infinit...",1349
2,Boult Audio ProBass Flex Over-Ear Wireless Blu...,1199
3,boAt Bassheads 900 On Ear Wired Headphones(Car...,799
4,boAt Rockerz 450 Bluetooth On-Ear Headphone wi...,1499
5,Zebronics Zeb-Thunder Wireless BT Headphone Co...,749


### Scraping information from an API

In [38]:
pip install newsapi-python

Note: you may need to restart the kernel to use updated packages.


In [39]:
from newsapi import NewsApiClient

In [61]:
key = '50ee14aa955d4930857e4e69477175bc'

### Query API directly

In [47]:
url = "https://newsapi.org/v2/top-headlines?country=in&apiKey=%s" % key
page = requests.get(url).json()

In [49]:
page.keys()

dict_keys(['status', 'totalResults', 'articles'])

In [60]:
df_articles = pd.DataFrame(page['articles'])
df_articles['source'] = df_articles['source'].apply(lambda v: v['name'])
df_articles.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,Livemint,Sneha,Compensation for Covid victims: SC directs Cen...,The Centre had earlier the apex court that it ...,https://www.livemint.com/news/india/compensati...,https://images.livemint.com/img/2021/06/30/600...,2021-06-30T05:28:04Z,The Supreme Court of India on Wednesday direct...
1,DNA India,DNA Web Team,DNA Explainer: How likely is an Asteroid-relat...,"June 30, 2021 marks the 113th anniversary of t...",https://www.dnaindia.com/explainer/report-dna-...,https://cdn.dnaindia.com/sites/default/files/s...,2021-06-30T05:25:00Z,
2,Google News,,"Market LIVE: Nifty reclaims 15,800, Sensex jum...",,https://news.google.com/__i/rss/rd/articles/CB...,,2021-06-30T05:20:37Z,
3,Hindustan Times,HT Entertainment Desk,Kangana Ranaut reacts to Taapsee Pannu calling...,Kangana Ranaut reacted to Taapsee Pannu callin...,https://www.hindustantimes.com/entertainment/b...,https://images.hindustantimes.com/img/2021/06/...,2021-06-30T05:13:08Z,Kangana Ranaut has reacted to Taapsee Pannus r...
4,The Times of India,PTI,It's official: Srihari Nataraj qualifies for T...,Tokyo Olympics News: Indian swimmer Srihari Na...,https://timesofindia.indiatimes.com/sports/tok...,"https://static.toiimg.com/thumb/msid-83976871,...",2021-06-30T05:02:00Z,.@srihari3529 becomes the 2nd Indian #swimmer ...


### Query API using external packages

In [65]:
newsapi = NewsApiClient(api_key=key)
news_data = newsapi.get_top_headlines(country='us')
df_articles = pd.DataFrame(news_data['articles'])
df_articles['source'] = df_articles['source'].apply(lambda v: v['name'])
df_articles.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,ESPN,Tim Bontemps,"Atlanta Hawks revive series hopes, cruise to G...",The Hawks' series outlook looked completely di...,https://www.espn.com/nba/story/_/id/31736225/a...,https://a3.espncdn.com/combiner/i?img=%2Fphoto...,2021-06-30T05:45:20Z,ATLANTA -- When Hawks superstar Trae Young was...
1,KOMO News,"Michelle Esteban, KOMO News Reporter",Some confused as King County health officials ...,Tuesday was a big milestone for King County. P...,https://komonews.com/news/local/some-confused-...,https://static-17.sinclairstoryline.com/resour...,2021-06-30T04:20:17Z,
2,Yahoo Entertainment,Reuters,UPDATE 1-Berkshire's Munger says China right t...,Berkshire Hathaway Inc Vice Chairman Charlie M...,https://finance.yahoo.com/news/1-berkshires-mu...,https://s.yimg.com/cv/apiv2/social/images/yaho...,2021-06-30T04:16:06Z,"(Recasts with quotes about Jack Ma, adds detai..."
3,Politico,JOE ANUTA,Election officials void latest New York mayora...,"Shortly after the results were released, repor...",https://www.politico.com/states/new-york/alban...,https://static.politico.com/53/71/a02cbdd346d1...,2021-06-30T03:31:43Z,Todays mistake by the Board of Elections was u...
4,Pitchfork,Madison Bloom,Watch Olivia Rodrigo’s Sour Prom Concert Film ...,"The film, featuring songs from Rodrigo’s new a...",https://pitchfork.com/news/watch-olivia-rodrig...,https://media.pitchfork.com/photos/60db53e71df...,2021-06-30T02:55:28Z,Olivia Rodrigos Sour Prom concert film debuted...


In [68]:
sources = newsapi.get_sources()['sources']
df_sources = pd.DataFrame(sources)
df_sources.head()

Unnamed: 0,id,name,description,url,category,language,country
0,abc-news,ABC News,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,general,en,us
1,abc-news-au,ABC News (AU),"Australia's most trusted source of local, nati...",http://www.abc.net.au/news,general,en,au
2,aftenposten,Aftenposten,Norges ledende nettavis med alltid oppdaterte ...,https://www.aftenposten.no,general,no,no
3,al-jazeera-english,Al Jazeera English,"News, analysis from the Middle East and worldw...",http://www.aljazeera.com,general,en,us
4,ansa,ANSA.it,"Agenzia ANSA: ultime notizie, foto, video e ap...",http://www.ansa.it,general,it,it


In [72]:
df_sources[df_sources['country']=='us']['category'].unique()

array(['general', 'technology', 'sports', 'business', 'entertainment',
       'health', 'science'], dtype=object)

In [74]:
news_data = newsapi.get_top_headlines(country='us',
                                     category='business')
news_data['articles'][0]

{'source': {'id': None, 'name': 'Yahoo Entertainment'},
 'author': 'Reuters',
 'title': "UPDATE 1-Berkshire's Munger says China right to clip Ma's wings - Yahoo Finance",
 'description': "Berkshire Hathaway Inc Vice Chairman Charlie Munger praised China's move to impose a sweeping restructuring on Jack Ma’s Ant Group, the fintech giant whose...",
 'url': 'https://finance.yahoo.com/news/1-berkshires-munger-says-china-041023295.html',
 'urlToImage': 'https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo-1200x1200.png',
 'publishedAt': '2021-06-30T04:16:06Z',
 'content': "(Recasts with quotes about Jack Ma, adds details from interview, changes date in dateline)\r\nJune 30 (Reuters) - Berkshire Hathaway Inc Vice Chairman Charlie Munger praised China's move to impose a sw… [+1958 chars]"}

In [75]:
df_articles = pd.DataFrame()

for page in range(1,4):
    all_articles = newsapi.get_everything(q='bitcoin',
                                      #sources='bbc-news,the-verge',
                                      #domains='bbc.co.uk,techcrunch.com',
                                      #from_param='2017-12-01',
                                      #to='2017-12-12',
                                      language='en',
                                      sort_by='relevancy',
                                      page=page)
    df_articles = df_articles.append(all_articles['articles'], ignore_index=True)
df_articles.shape

(60, 8)

In [76]:
df_articles

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'engadget', 'name': 'Engadget'}",https://www.engadget.com/about/editors/saqib-shah,El Salvador becomes the first country to appro...,El Salvador has voted to adopt Bitcoin as lega...,https://www.engadget.com/el-salvador-bitcoin-l...,https://s.yimg.com/os/creatr-uploaded-images/2...,2021-06-09T12:04:40Z,El Salvador's President Nayib Bukele has made ...
1,"{'id': None, 'name': 'Gizmodo.com'}",Matt Novak,El Salvador Becomes First Country to Recognize...,El Salvador has become the first country in th...,https://gizmodo.com/el-salvador-becomes-first-...,https://i.kinja-img.com/gawker-media/image/upl...,2021-06-09T10:00:00Z,El Salvador has become the first country in th...
2,"{'id': 'mashable', 'name': 'Mashable'}",Stan Schroeder,Elon Musk says Tesla will resume Bitcoin purch...,"It's all about clean energy, it seems. \nElon ...",https://mashable.com/article/tesla-bitcoin-pur...,https://mondrian.mashable.com/2021%252F06%252F...,2021-06-14T07:15:49Z,"It's all about clean energy, it seems. \r\nElo..."
3,"{'id': 'bbc-news', 'name': 'BBC News'}",https://www.facebook.com/bbcnews,Bitcoin: El Salvador makes cryptocurrency lega...,It is the first country in the world to make t...,https://www.bbc.co.uk/news/world-latin-america...,https://ichef.bbci.co.uk/news/1024/branded_new...,2021-06-09T08:27:58Z,image captionThe move means bitcoin will be ac...
4,"{'id': None, 'name': 'Gizmodo.com'}",Alyse Stanley,Miami's Bitcoin Conference May Be the Latest C...,"Several crypto fans that descended on Miami, F...",https://gizmodo.com/miamis-bitcoin-conference-...,https://i.kinja-img.com/gawker-media/image/upl...,2021-06-11T00:45:00Z,"Several crypto fans that descended on Miami, F..."
5,"{'id': 'techcrunch', 'name': 'TechCrunch'}",Lucas Matney,In search of a new crypto deity,"Hello friends, and welcome back to Week in Rev...",http://techcrunch.com/2021/06/05/in-search-of-...,https://techcrunch.com/wp-content/uploads/2019...,2021-06-05T19:20:23Z,"Hello friends, and welcome back to Week in Rev..."
6,"{'id': 'bbc-news', 'name': 'BBC News'}",https://www.facebook.com/bbcnews,Donald Trump calls Bitcoin 'a scam against the...,The controversial former US president thinks t...,https://www.bbc.co.uk/news/business-57392734,https://ichef.bbci.co.uk/news/1024/branded_new...,2021-06-08T01:12:59Z,"By Mary-Ann RussonBusiness reporter, BBC News\..."
7,"{'id': 'reuters', 'name': 'Reuters'}",Reuters,"El Salvador to keep dollar as legal tender, se...",El Salvador will not replace the U.S. dollar w...,https://www.reuters.com/business/el-salvador-k...,https://www.reuters.com/resizer/8_ZpjJnVb-UkP6...,2021-06-16T17:34:00Z,"SAN SALVADOR, June 16 (Reuters) - El Salvador ..."
8,"{'id': 'reuters', 'name': 'Reuters'}","Tom Arnold,Karin Strohecker","El Salvador bitcoin plan ""bulletproof"", presid...",El Salvador is determined to push ahead with m...,https://www.reuters.com/business/el-salvador-b...,https://www.reuters.com/resizer/-NVJNngENeVucF...,2021-06-23T13:58:00Z,"LONDON, June 23 (Reuters) - El Salvador is det..."
9,"{'id': 'reuters', 'name': 'Reuters'}","Tom Arnold, Karin Strohecker","El Salvador bitcoin plan 'bulletproof', presid...",El Salvador is determined to push ahead with m...,https://www.reuters.com/article/el-salvador-bi...,https://s1.reutersmedia.net/resources_v2/image...,2021-06-23T13:46:00Z,"LONDON, June 23 (Reuters) - El Salvador is det..."


### Scraping Twitter Data using Tweepy library

In [78]:
import tweepy

consumer_key = ''
consumer_secret = ''

access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

#api = tweepy.API(auth, wait_on_rate_limit=True)
api = tweepy.API(auth, wait_on_rate_limit=False)

### Types of end points or functions
- api.user_timeline()
- api.search()

In [84]:
modi_tweets = api.user_timeline('narendramodi', page=1, count=200)
len(modi_tweets)

21

In [87]:
#modi_tweets[1]._json

In [92]:
tweets = pd.DataFrame()
for tweet in modi_tweets:
    temp = {
        'created_at': tweet.created_at,
        'text': tweet.text,
        'likes': tweet.favorite_count,
        'retweets': tweet.retweet_count,
        'source': tweet.source
    }
    tweets = tweets.append(temp, ignore_index=True)
tweets.head()

Unnamed: 0,created_at,likes,retweets,source,text
0,2021-06-30 08:41:02,1944.0,455.0,Twitter for iPhone,GST has been a milestone in the economic lands...
1,2021-06-29 13:06:19,22846.0,3941.0,Twitter Web App,The 'Bharat Ratna Dr. Bhimrao Memorial and Cul...
2,2021-06-29 08:50:49,47912.0,6665.0,Twitter Web App,The last few days have witnessed stupendous pe...
3,2021-06-28 13:21:02,8718.0,2542.0,Twitter Web App,The measures will help to stimulate economic a...
4,2021-06-28 13:21:02,9468.0,2626.0,Twitter Web App,Further support has been announced for our sma...


In [93]:
tweets = pd.DataFrame()

for page in range(1,6):
    modi_tweets = api.user_timeline('narendramodi', page=page, count=200)
    
    for tweet in modi_tweets:
        temp = {
            'created_at': tweet.created_at,
            'text': tweet.text,
            'likes': tweet.favorite_count,
            'retweets': tweet.retweet_count,
            'source': tweet.source
        }
        tweets = tweets.append(temp, ignore_index=True)
tweets.shape

(1000, 5)

In [94]:
tweets.head()

Unnamed: 0,created_at,likes,retweets,source,text
0,2021-06-30 08:41:02,4532.0,984.0,Twitter for iPhone,GST has been a milestone in the economic lands...
1,2021-06-29 13:06:19,22915.0,3956.0,Twitter Web App,The 'Bharat Ratna Dr. Bhimrao Memorial and Cul...
2,2021-06-29 08:50:49,47956.0,6671.0,Twitter Web App,The last few days have witnessed stupendous pe...
3,2021-06-28 13:21:02,8726.0,2545.0,Twitter Web App,The measures will help to stimulate economic a...
4,2021-06-28 13:21:02,9476.0,2629.0,Twitter Web App,Further support has been announced for our sma...


### Searching for a specific hashtag

In [95]:
ds_tweets = api.search('#datascience', count=200)

In [99]:
#ds_tweets[0]._json

In [106]:
df_ds_tweets = pd.DataFrame()
for tweet in ds_tweets:
    temp = {
        'created_at': tweet.created_at,
        'text': tweet.text,
        'likes': tweet.favorite_count,
        'retweets': tweet.retweet_count,
        'user_name': tweet.user.name,
        'screen_name': tweet.user.screen_name,
        'user_description': tweet.user.description,
        'user_location': tweet.user.location,
    }
    df_ds_tweets = df_ds_tweets.append(temp, ignore_index=True)
df_ds_tweets.head()

Unnamed: 0,created_at,likes,retweets,screen_name,text,user_description,user_location,user_name
0,2021-06-30 08:55:21,0.0,0.0,machinelearnTec,p3rceive Announces Integration of Natural Lang...,Sharing all #latest #news about #ArtificialInt...,"New York, USA",Machine Learning
1,2021-06-30 08:55:20,0.0,3.0,Nocodepediaa,RT @ABlogiX: Conseil à tous les utilisateurs d...,Sharing your journey to no-code with the world...,,NoCodepedia
2,2021-06-30 08:55:20,0.0,10.0,ricardo_ik_ahau,RT @BlackSATANArmy: APK🗜️📰 \n\nAn APK is a com...,Entender la realidad social para tranformarla,Benito Juárez,Ricardo Vázquez
3,2021-06-30 08:55:16,0.0,5.0,ricardo_ik_ahau,RT @BorokinniQ: Which of this analytic categor...,Entender la realidad social para tranformarla,Benito Juárez,Ricardo Vázquez
4,2021-06-30 08:55:15,0.0,3.0,CoderNotesBot,RT @IainLJBrown: How this Hamilton County cour...,Promotes new developers to get support and lov...,Amazon Web Services,CodersNotes


In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer().fit(df_ds_tweets['text'])
dtm = vectorizer.transform(df_ds_tweets['text'])
vocab = vectorizer.get_feature_names()
df_dtm = pd.DataFrame(dtm.toarray(), columns=vocab )

In [115]:
#df_dtm['datascience']

In [118]:
df_ds_tweets = pd.DataFrame()


for tweet in tweepy.Cursor(api.search, '#datascience').items():
    temp = {
        'created_at': tweet.created_at,
        'text': tweet.text,
        'likes': tweet.favorite_count,
        'retweets': tweet.retweet_count,
        'user_name': tweet.user.name,
        'screen_name': tweet.user.screen_name,
        'user_description': tweet.user.description,
        'user_location': tweet.user.location,
    }
    df_ds_tweets = df_ds_tweets.append(temp, ignore_index=True) 

KeyboardInterrupt: 

In [119]:
df_ds_tweets.shape

(190, 8)