## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [27]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [28]:
#!pip install --upgrade pip

In [29]:
#Better
!pip install requests BeautifulSoup4 fire




In [30]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

In [31]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

In [32]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag='h2')

In [43]:
for i in res[100:]:
    res.remove(i)
    #print(res)
names_infl = []
handle_infl = []
for r in res:
    split_data = r.split('.',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].split(')',maxsplit=1)[0]
    names_infl.append(name)
    handle_infl.append(handle)
    



In [44]:
df_influencer_handle = pd.DataFrame(handle_infl, columns=["100 influencers handles"])
#df_influencer_handle.to_csv('C:/Users/HP/Desktop/CV, P.Statement and others/10 Academy/100_handles.csv', index=False, header=True)
#print(df_influencer_handle)

In [46]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)

In [47]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
re_gov = get_elements(response, tag='blockquote')
names = []
handles = []
for r in re_gov:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    names.append(name)
    handles.append(handle)

nam_handle = f'{name}:{handle}'

df_gov_handle = pd.DataFrame(handles, columns=["Gov influencers handles"])
#df_gov_handle.to_csv('C:/Users/HP/Desktop/CV, P.Statement and others/10 Academy/gov_handles.csv', index=False, header=True)
#print(df_gov_handle)

In [48]:
fl_handles = handles + handle_infl
#final_handle = pd.DataFrame(fl_handles, columns=["combined_handles"])
#final_handle.to_csv (r'C:\Users\HP\Desktop\CV, P.Statement and others\10 Academy\final_handle.csv', index = False, header=True)
#print(final_handle)


In [49]:
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import matplotlib.dates as mdates
import seaborn as sns
#sns.set()


# to view all columns
#pd.set_option("display.max.columns", None)

In [51]:
#!pip install tweepy
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

In [52]:
consumer_key = "secret_key"
consumer_secret = "secret_key"
access_token = "secret_key"
access_token_secret = "secret_key"

In [53]:
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [54]:
#getting tweets
tweets = []
tweetCount=5
for i in fl_handles:
    try:
        results=api.user_timeline(id=i, count=tweetCount)
    except tweepy.TweepError as e:
                continue  
    for tweet in results:
        tweets.append(tweet.text)
#print(tweets)




In [55]:
# getting followers for 100 influencers
# Calling the get_user function with our parameters
followers = []
for i in fl_handles:
    try:
        results = api.get_user(id=i)
    except tweepy.TweepError as e:
        continue 
    followers.append(results.followers_count)
    #print(results.followers_count)
print(followers)
# followers dataframe
total_followers = pd.DataFrame(followers,columns=["Number of followers"])
#total_followers.to_csv('C:/Users/HP/Desktop/CV, P.Statement and others/10 Academy/gov_follower.csv')
print(total_followers.head())




[11328, 39326, 192513, 126, 1599745, 7176, 546690, 2934, 66270, 1104272, 1984608, 424524, 2560, 371220, 4028, 1814142, 3272, 107410, 18662, 341079, 173650, 30174, 256402, 3283, 833187, 10776, 1508892, 738, 8482, 229001, 31814, 116331, 3272769, 1376009, 14937, 1132, 25704, 31288, 101296, 68076, 21964, 31293, 22798, 8, 50994, 28464, 439436, 52895, 46778, 11927, 24867, 49701, 28482, 117, 17443, 93967, 18239, 21364, 62318, 18498, 19996, 7489, 6, 215400, 232830, 98725, 106165, 70751, 17121, 249971, 48966, 105248, 58235, 80355, 487, 81639, 217290, 192561, 24702, 20448, 191425, 26435, 165783, 1002824, 48253, 50102, 18647, 240623, 105656, 88513, 84540, 1078900, 56242, 127369, 143193, 285562, 131, 215985, 220934, 59206, 29950, 51840, 151986, 114524, 673511, 56879, 541627, 49251, 69, 35231, 938216, 572314, 105685, 183464, 7, 1755677, 1042509, 1160270, 1416158, 200273, 1085310, 1164432, 1060365, 1443409, 3130248, 3578583, 18, 1974678, 10809317]
   Number of followers
0                11328
1     

In [57]:
# getting no of likes for gov influencers
likes = []
for i in fl_handles:
    try:
        results = api.get_user(id=i)
    except tweepy.TweepError as e:
        continue
    likes.append(results.favourites_count)
    #print(results.followers_count)
#print(likes)
total_like = pd.DataFrame(likes,columns=["Number of likes"])
#total_like.to_csv('C:/Users/HP/Desktop/CV, P.Statement and others/10 Academy/gov_likes.csv')
print(total_like.head())



   Number of likes
0               61
1              915
2              268
3               12
4               63


In [59]:
# getting no of following for 100 influencers
following = []
for i in fl_handles:
    try:
        results = api.get_user(id=i)
    except tweepy.TweepError as e:
        continue
    following.append(results.friends_count)

    
#print(following)
gov_following = pd.DataFrame(following,columns=["Number of following"])
#gov_following.to_csv('C:/Users/HP/Desktop/CV, P.Statement and others/10 Academy/gov_followings.csv')
print(gov_following.head())



   Number of following
0                   82
1                   26
2                   55
3                  224
4                   14


In [60]:
#getting retweets
no_of_retweets = []
for id in fl_handles:
    try:
        tweets = tweepy.Cursor(api.user_timeline, id=i).items()
        for tweet in tweets:
            no_of_retweets.append(tweet.retweet_count)
    except tweepy.TweepError as e:
        continue
#print(no_of_retweets)

#retweets dataframe
gov_retweets = pd.DataFrame(no_of_retweets, columns=["No of retweets"])
#gov_retweets.to_csv (r'C:\Users\HP\Desktop\CV, P.Statement and others\10 Academy\gov_retweet.csv', index = False, header=True)
print(gov_retweets.head())


   No of retweets
0            3202
1            2772
2           93174
3            6036
4            1658


In [61]:
#getting no of tweets shared
no_tweets_shared = []
for i in fl_handles:
    try:
        results = api.get_user(id=i)
    except tweepy.TweepError as e:
        continue
    no_tweets_shared.append(results.statuses_count)
    
print(no_tweets_shared)

#tweet shared dataframe
gov_statuses = pd.DataFrame(no_tweets_shared, columns=["No of statuses"])
#gov_statuses.to_csv (r'C:\Users\HP\Desktop\CV, P.Statement and others\10 Academy\gov_statuses.csv', index = False, header=True)
print(gov_statuses.head())

[1676, 4030, 1087, 125, 18881, 838, 628, 1065, 4720, 9052, 2862, 599, 209, 654, 836, 6645, 732, 40682, 753, 1650, 1738, 5649, 4496, 715, 7139, 1442, 7098, 21, 828, 11168, 30, 349, 4734, 2796, 83, 44, 3770, 16869, 142234, 27325, 19439, 6864, 1638, 1, 21549, 3892, 81209, 16890, 23988, 2166, 2523, 2814, 35271, 23, 4983, 13508, 88730, 42971, 26785, 15627, 9575, 15410, 0, 22996, 7579, 17748, 101356, 4986, 12766, 15755, 8771, 45455, 7187, 35783, 35, 18373, 33461, 14516, 8767, 3906, 48601, 32731, 15893, 265302, 16251, 11100, 8562, 91479, 30378, 23204, 137304, 35978, 10665, 8895, 38395, 53788, 117, 9294, 23704, 13353, 3118, 14757, 3243, 11275, 27253, 14684, 9266, 6476, 1, 21728, 50980, 23587, 27798, 15681, 6, 4555, 20000, 88551, 56044, 45134, 5688, 62843, 144320, 72335, 37220, 322930, 19, 31626, 11186]
   No of statuses
0            1676
1            4030
2            1087
3             125
4           18881


In [62]:
#mention for gov influencers
count = []
for x in range(0, len(fl_handles)):
    name = fl_handles[x]
    mentions_count = []
    try:
       for status in tweepy.Cursor(api.user_timeline, id=name).items():
         entities = status.entities
         if "user_mentions" in entities:
            for ent in entities["user_mentions"]:
              if ent is not None:
                if "screen_name" in ent:
                  name = ent["screen_name"]
                  if name is not None:
                    mentions_count.append(name)
    except tweepy.TweepError as e:
        continue
    count.append(len(mentions_count))
infl_mention = pd.DataFrame(count, columns=["gov mentions"])
#infl_mention.to_csv (r'C:\Users\HP\Desktop\CV, P.Statement and others\10 Academy\infl_mentions.csv', index = False, header=True)
print(infl_mention.head())

   gov mentions
0           117
1          1695
2           251
3            32
4          1089


In [None]:
def get_hashtags(fl_handles):
    cols = ['id', 'name', 'screen_name', 'hashtags']
    # dataframe that would be returned at the end
    df = pd.DataFrame(columns=cols)
    handle_data = []
    if len(fl_handle) > 0: 
        for handle in fl_handles:
            value_list = {}
            print("Getting hashtags for " + handle)
            # this helps avoid Tweepy errors like suspended users or user not found errors
            try:
                item = api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            value_list['id'] = item.id_str
            value_list['name'] = item.name
            value_list['screen_name'] = item.screen_name
            #get average daily tweets
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
            hashtags = set()
            hash_dic = {}
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            for status in tweepy.Cursor(api.user_timeline, id=twitter_handle).items():
                if hasattr(status, "entities"):
                    entities = status.entities
                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    if hashtag in hashtags:
                                        hash_dic[hashtag]+=1
                                    else:
                                        hashtags.add(hashtag)
                                        hash_dic[hashtag] = 1
                value_list['hashtags'] = hash_dic
            try:
                #escape handles with no hashtags
                df = df.append(pd.DataFrame(value_list))
                # the code snippet below before the return is used to save to file due to constant connections loss
                new_df = df.reset_index().rename(columns={'hashtags':'hashtags_count','index':'hashtags'})
            except tweepy.TweepError as e:
                continue
    return df.reset_index().rename(columns={'hashtags':'hashtags_count','index':'hashtags'})

### Popularity reach
### Reach Score
### Relevance Score

In [65]:
#popularity reach = retweets + likes
popularity = pd.concat([gov_retweets,total_like], axis=1)
#print(popularity)
popularity["Popularity_score"] = popularity["No of retweets"]+popularity["Number of likes"]
print(popularity)





       No of retweets  Number of likes  Popularity_score
0                3202             61.0            3263.0
1                2772            915.0            3687.0
2               93174            268.0           93442.0
3                6036             12.0            6048.0
4                1658             63.0            1721.0
...               ...              ...               ...
14939              73              NaN               NaN
14940               3              NaN               NaN
14941             670              NaN               NaN
14942              70              NaN               NaN
14943             112              NaN               NaN

[14944 rows x 3 columns]


In [63]:
#Reach Score = followers - following
reach = pd.concat([total_followers,gov_following], axis=1)
print(reach.head())
reach['reach_score']= reach["Number of followers"] - reach["Number of following"]
print(reach.head()) 


   Number of followers  Number of following
0                11328                   82
1                39326                   26
2               192513                   55
3                  126                  224
4              1599745                   14
   Number of followers  Number of following  reach_score
0                11328                   82        11246
1                39326                   26        39300
2               192513                   55       192458
3                  126                  224          -98
4              1599745                   14      1599731


## Web scrapping using bash script
If the web site has a quite simple HTML, you can easily use curl to perform the request and then extract the needed values using bash commands grep, cut , sed, ..

This tutorial is adapted from [this](https://medium.com/@LiliSousa/web-scraping-with-bash-690e4ee7f98d) medium article

In [None]:
%%bash 

# curl the page and save content to tmp_file
#url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
#curl -X GET $url -o tmp_file


#!/bin/bash

# write headers to CSV file
echo "Name, twitter_id" >> extractData.csv
n="1"
while [ $n -lt 2 ]
do
  
  #get title
  title=$(cat tmp_file | grep "class=\"twitter-tweet\"" | cut -d ';' -f1 )
  echo $title
  #get author
  #twitter_id=$(cat tmp_file |grep -A1 "class=\"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0\"" | tail -1)

  #echo "$title, $twitter_id" >> extractData.csv
  #echo "$title, $twitter_id"
    
  n=$[$n+1]

done