# Web Scraping from Altmetric

This code scrapes the latest information on all News, Blogs, and Tweets, plus the summary page, from Altmetric, and detects the language of all headlines and subtitles. It then extracts the detailed content for five of the top 10 News media sources. The specific steps are:

1) News and Blogs: Get all required information of news and blogs, detect their language and save them in separate data sets "Altmetric_Blogs.xlsx" and "Altmetric_News.xlsx". 

2) Twitter: Get the account handles (under a column called medialink) and headlines of all tweets, and delete our own Twitter posts. Since tweets never have subtitles, the subtitle column is left empty. The final exported dataset is "Altmetric_Tweets.xlsx".

3) Fetch the Altmetric scores, citations, readers, and demographic information, and merge them with the three existing data sets. After concatenating all data frames, this code exports the final dataset as "Altmetric_scrapedall.xlsx".

4) Scrape the detailed content from five of the top 10 news media sources.

There is no raw data set. After the above processing, this code exports 4 new Google Sheets: "Altmetric_Blogs.xlsx", "Altmetric_News.xlsx", "Altmetric_Tweets.xlsx", and "Altmetric_scrapedall.xlsx", which are also located in the "Altmetric" folder.

In [None]:
pip install langdetect

In [None]:
pip install pycountry

In [None]:
# Import needed packages and connect to google drive
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from langdetect import detect
from langdetect import DetectorFactory
import pycountry
#from google.colab import drive
#drive.mount('/content/drive')
#%cd /content/drive/My Drive/Research/_Fiverr/Altmetric

## Blogs
### Step 1: Extract blog links, headlines and subtitles

In [None]:
# One record per tracked paper:
# (Altmetric details page, article label, first-author label).
_paper_records = (
    ('https://oxfordjournals.altmetric.com/details/72683542', 'Article 2', 'Gangwisch et al.'),
    ('https://science.altmetric.com/details/60552876', 'Article 3', 'Lee et al.'),
    ('https://jamanetwork.altmetric.com/details/64368646', 'Article 4', 'Kim et al.'),
    ('https://science.altmetric.com/details/69584866', 'Article 5', 'Mina et al.'),
    ('https://annals.altmetric.com/details/56459321', 'Article 6', 'Hviid et al.'),
    ('https://scienceadvances.altmetric.com/details/69530897', 'Article 7', 'Maxwell et al.'),
    ('https://nature.altmetric.com/details/63584063', 'Article 8', 'Berzaghi et al.'),
)

# Details pages to scrape, in article order.
urls = [page for page, _label, _author in _paper_records]

# Details page -> first-author label.
author_map = {page: author for page, _label, author in _paper_records}

# Details page -> article label.
article_map = {page: label for page, label, _author in _paper_records}

In [None]:
def fetch_blogs_and_mediasource(url, timeout=30):
    """Scrape the blog mentions listed on one Altmetric "blogs" page.

    Args:
        url: Address of the Altmetric blogs tab to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per ``<article class="post blogs">`` element.
        Every dict has ``'url'`` (an empty string when the post has no
        ``block_link`` anchor) and, when the matching tag exists,
        ``'title'``, ``'mediasource'``, ``'subtitle'`` and ``'date'``.
        ``None`` is returned when the page cannot be downloaded.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A dropped connection or timeout on one page should not abort the
        # whole run; report it the same way as a bad status code.
        print(f"Failed to get URL {url}")
        return None

    if response.status_code != 200:
        print(f"Failed to get URL {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    blogs_and_mediasource = []

    for article in soup.find_all('article', class_='post blogs'):
        blog_info = {}
        h3_tag = article.find('h3')
        h4_tag = article.find('h4')
        p_tag = article.find('p', class_='summary')
        article_link = article.find('a', class_='block_link')
        time_tag = article.find('time', datetime=True)

        if h3_tag:
            blog_info['title'] = h3_tag.text.strip()

        if h4_tag:
            # Only the text before the first comma of the <h4> is kept.
            blog_info['mediasource'] = h4_tag.text.strip().split(",")[0]

        if p_tag:
            blog_info['subtitle'] = p_tag.text.strip()

        if time_tag:
            blog_info['date'] = time_tag.text.strip()

        if article_link:
            blog_info['url'] = article_link.get('href', 'N/A')
        else:
            blog_info['url'] = ''

        # 'url' is always set above, so every post yields a record.
        blogs_and_mediasource.append(blog_info)

    return blogs_and_mediasource

if __name__ == "__main__":
    # Collect one flat record per blog post across every tracked article.
    blog_data = []

    for url in urls:
        print(f"Fetching data for URL: {url}")
        posts = fetch_blogs_and_mediasource(url + '/blogs')

        if posts:
            blog_data.extend(
                {
                    'altmetric': url,
                    'article': article_map.get(url, 'N/A'),
                    'author': author_map.get(url, 'N/A'),
                    'mediatype': 'Blogs',
                    'medialink': post.get('url', 'N/A'),
                    'mediasource': post.get('mediasource', 'N/A'),
                    'mediaheadline': post.get('title', 'N/A'),
                    'mediasubtitle': post.get('subtitle', 'N/A'),
                    'date': post.get('date', 'N/A'),
                }
                for post in posts
            )
            print("Data fetched.")
        print("----------------------------")

    # One row per blog post, ready for language detection.
    df_blog = pd.DataFrame(blog_data)


### Step 2: Detect language

In [None]:
# langdetect is randomized unless its seed is pinned; fix it so the same
# headline is assigned the same language on every run.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Args:
        text: The headline to classify.

    Returns:
        The code returned by ``langdetect.detect``, or ``'unknown'`` when
        detection raises (for example on text it cannot classify).
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt/SystemExit can still stop a long ``apply``.
        return 'unknown'


df_blog['language'] = df_blog['mediaheadline'].apply(detect_language)


def get_language_name(lang_code):
    """Translate a detected language code into its English language name.

    Args:
        lang_code: Code produced by ``detect_language`` (for example
            ``'en'``, ``'zh-cn'`` or the ``'unknown'`` placeholder).

    Returns:
        The ``name`` of the matching pycountry language, or ``"Unknown"``
        when no language matches.
    """
    try:
        # langdetect reports some languages with a region suffix
        # ('zh-cn', 'zh-tw'); only the part before '-' is an alpha-2 code.
        primary_code = lang_code.split('-')[0]
        return pycountry.languages.get(alpha_2=primary_code).name
    except (AttributeError, LookupError):
        # AttributeError: no match (``get`` returned None) or a non-string
        # code; LookupError: pycountry releases that raise on a miss.
        return "Unknown"


df_blog['language'] = df_blog['language'].apply(get_language_name)


In [None]:
df_blog

### Step 3: Export blogs dataset

In [None]:
#df_blog.to_excel('Altmetric_blogs.xlsx', index=False)

## News
### Step 1: Find all news articles with an active link, headlines and subtitles

In [None]:
def fetch_news_and_mediasource(base_url, page_number=1, timeout=30):
    """Scrape one page of news mentions from an Altmetric "news" tab.

    Args:
        base_url: Address of the news tab, without the page suffix.
        page_number: Page to request; appended as ``/page:<n>``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(posts, next_page_number)`` tuple.  ``posts`` holds one dict per
        ``<article class="post msm">`` element, each with ``'url'`` (empty
        string when there is no ``block_link`` anchor) and, when the
        matching tag exists, ``'title'``, ``'mediasource'``, ``'subtitle'``
        and ``'date'``.  ``next_page_number`` is the text after the last
        ``:`` in the ``next_page`` link's href, or ``None`` when there is no
        such link.  ``(None, None)`` is returned when the page cannot be
        downloaded.
    """
    url = f"{base_url}/page:{page_number}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like a bad status so the caller's paging
        # loop ends cleanly for this article instead of crashing the run.
        print(f"Failed to get URL {url}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to get URL {url}")
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    news_and_mediasource = []

    for article in soup.find_all('article', class_='post msm'):
        news_info = {}
        h3_tag = article.find('h3')
        h4_tag = article.find('h4')
        p_tag = article.find('p', class_='summary')
        article_link = article.find('a', class_='block_link')
        time_tag = article.find('time', datetime=True)

        if h3_tag:
            news_info['title'] = h3_tag.text.strip()

        if h4_tag:
            # Only the text before the first comma of the <h4> is kept.
            news_info['mediasource'] = h4_tag.text.strip().split(",")[0]

        if p_tag:
            news_info['subtitle'] = p_tag.text.strip()

        if article_link:
            news_info['url'] = article_link.get('href', 'N/A')
        else:
            news_info['url'] = ''

        if time_tag:
            news_info['date'] = time_tag.text.strip()

        # 'url' is always set above, so every post yields a record.
        news_and_mediasource.append(news_info)

    next_page_number = None
    next_page_tag = soup.find('a', class_='next_page')
    if next_page_tag:
        next_page_url = next_page_tag.get('href')
        # An anchor without an href carries no page number to follow.
        if next_page_url:
            next_page_number = next_page_url.split(":")[-1]

    return news_and_mediasource, next_page_number

if __name__ == "__main__":
    # Collect one flat record per news story across every tracked article.
    news_data = []

    for base_url in urls:
        news_tab = base_url + '/news'
        current_page = 1
        # Follow the pagination until the scraper reports no further page.
        while current_page:
            print(f"Fetching data for page: {current_page} from URL: {base_url}")
            stories, next_page = fetch_news_and_mediasource(news_tab, current_page)

            if stories:
                news_data.extend(
                    {
                        'altmetric': base_url,
                        'article': article_map.get(base_url, 'N/A'),
                        'author': author_map.get(base_url, 'N/A'),
                        'mediatype': 'News',
                        'medialink': story.get('url', 'N/A'),
                        'mediasource': story.get('mediasource', 'N/A'),
                        'mediaheadline': story.get('title', 'N/A'),
                        'mediasubtitle': story.get('subtitle', 'N/A'),
                        'date': story.get('date', 'N/A'),
                    }
                    for story in stories
                )
                print(f"Scraped {len(stories)} news articles from page {current_page}")

            print("----------------------------")
            current_page = next_page
            # Pause between requests.
            time.sleep(1)

    # One row per news story, ready for language detection.
    df_news = pd.DataFrame(news_data)

### Step 2: Detect language 

In [None]:
# langdetect is randomized unless its seed is pinned; fix it so the same
# headline is assigned the same language on every run.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Args:
        text: The headline to classify.

    Returns:
        The code returned by ``langdetect.detect``, or ``'unknown'`` when
        detection raises (for example on text it cannot classify).
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt/SystemExit can still stop a long ``apply``.
        return 'unknown'


df_news['language'] = df_news['mediaheadline'].apply(detect_language)


def get_language_name(lang_code):
    """Translate a detected language code into its English language name.

    Args:
        lang_code: Code produced by ``detect_language`` (for example
            ``'en'``, ``'zh-cn'`` or the ``'unknown'`` placeholder).

    Returns:
        The ``name`` of the matching pycountry language, or ``"Unknown"``
        when no language matches.
    """
    try:
        # langdetect reports some languages with a region suffix
        # ('zh-cn', 'zh-tw'); only the part before '-' is an alpha-2 code.
        primary_code = lang_code.split('-')[0]
        return pycountry.languages.get(alpha_2=primary_code).name
    except (AttributeError, LookupError):
        # AttributeError: no match (``get`` returned None) or a non-string
        # code; LookupError: pycountry releases that raise on a miss.
        return "Unknown"


df_news['language'] = df_news['language'].apply(get_language_name)


In [None]:
df_news

### Step 3: Export news dataset

In [None]:
#df_news.to_excel('Altmetric_News.xlsx', index=False)

## Tweets
### Step 1: Extract all tweets and delete our posts

In [None]:
def fetch_tweets(base_url, page_number=1, timeout=30):
    """Scrape one page of tweets from an Altmetric "twitter" tab.

    Args:
        base_url: Address of the twitter tab, without the page suffix.
        page_number: Page to request; appended as ``/page:<n>``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(tweets, next_page_number)`` tuple.  ``tweets`` holds one dict
        per ``<article class="post twitter">`` element that yielded any
        data, with ``'title'`` (text of the ``handle`` div), ``'headline'``
        (text of the ``summary`` paragraph) and ``'date'`` when the matching
        tag exists.  ``next_page_number`` is the text after the last ``:``
        in the ``next_page`` link's href, or ``None`` when there is no such
        link.  ``(None, None)`` is returned when the page cannot be
        downloaded.
    """
    url = f"{base_url}/page:{page_number}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like a bad status so the caller's paging
        # loop ends cleanly for this article instead of crashing the run.
        print(f"Failed to get URL {url}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to get URL {url}")
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = []

    for article in soup.find_all('article', class_='post twitter'):
        tweets_info = {}
        handle_tag = article.find('div', class_='handle')
        p_tag = article.find('p', class_='summary')
        time_tag = article.find('time', datetime=True)

        if handle_tag:
            tweets_info['title'] = handle_tag.text.strip()

        if p_tag:
            tweets_info['headline'] = p_tag.text.strip()

        if time_tag:
            tweets_info['date'] = time_tag.text.strip()

        # Posts in which none of the three tags was found are dropped.
        if tweets_info:
            tweets.append(tweets_info)

    next_page_number = None
    next_page_tag = soup.find('a', class_='next_page')
    if next_page_tag:
        next_page_url = next_page_tag.get('href')
        # An anchor without an href carries no page number to follow.
        if next_page_url:
            next_page_number = next_page_url.split(":")[-1]

    return tweets, next_page_number

if __name__ == "__main__":
    # Collect one flat record per tweet across every tracked article.
    tweets_data = []

    for base_url in urls:
        twitter_tab = base_url + '/twitter'
        current_page = 1
        # Follow the pagination until the scraper reports no further page.
        while current_page:
            print(f"Fetching data for page: {current_page} from URL: {base_url}")
            posts, next_page = fetch_tweets(twitter_tab, current_page)

            if posts:
                tweets_data.extend(
                    {
                        'altmetric': base_url,
                        'article': article_map.get(base_url, 'N/A'),
                        'author': author_map.get(base_url, 'N/A'),
                        'mediatype': 'Tweet',
                        # The account handle is stored in the link column.
                        'medialink': post.get('title', 'N/A'),
                        'mediasource': 'Twitter',
                        'mediaheadline': post.get('headline', 'N/A'),
                        'mediasubtitle': '',
                        'date': post.get('date', 'N/A'),
                    }
                    for post in posts
                )
                print(f"Scraped {len(posts)} tweets from page {current_page}")

            print("----------------------------")
            current_page = next_page
            # Pause between requests.
            time.sleep(1)

    # One row per tweet, before our own posts are filtered out.
    df_tweets = pd.DataFrame(tweets_data)

In [None]:
# Delete our own Twitter posts: drop every tweet whose lower-cased account
# handle contains "find" or "research", then renumber the remaining rows.
handles = df_tweets['medialink'].str.lower()
condition = handles.str.contains('find|research')
df_tweets_c = df_tweets.loc[~condition].reset_index(drop=True)
df_tweets_c

### Step 2: Detect language

In [None]:
# langdetect is randomized unless its seed is pinned; fix it so the same
# tweet is assigned the same language on every run.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Args:
        text: The tweet text to classify.

    Returns:
        The code returned by ``langdetect.detect``, or ``'unknown'`` when
        detection raises (for example on text it cannot classify).
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt/SystemExit can still stop a long ``apply``.
        return 'unknown'


df_tweets_c['language'] = df_tweets_c['mediaheadline'].apply(detect_language)


def get_language_name(lang_code):
    """Translate a detected language code into its English language name.

    Args:
        lang_code: Code produced by ``detect_language`` (for example
            ``'en'``, ``'zh-cn'`` or the ``'unknown'`` placeholder).

    Returns:
        The ``name`` of the matching pycountry language, or ``"Unknown"``
        when no language matches.
    """
    try:
        # langdetect reports some languages with a region suffix
        # ('zh-cn', 'zh-tw'); only the part before '-' is an alpha-2 code.
        primary_code = lang_code.split('-')[0]
        return pycountry.languages.get(alpha_2=primary_code).name
    except (AttributeError, LookupError):
        # AttributeError: no match (``get`` returned None) or a non-string
        # code; LookupError: pycountry releases that raise on a miss.
        return "Unknown"


df_tweets_c['language'] = df_tweets_c['language'].apply(get_language_name)


### Step 3: Export tweets dataset

In [None]:
#df_tweets_c.to_excel('Altmetric_Tweets.xlsx', index=False)

## Demographic Information
### Step 1: Scrape all demographic information for each article

In [None]:
# Altmetric details pages to summarise, in article order.
links = [
    'https://oxfordjournals.altmetric.com/details/72683542',  # Article 2 - Gangwisch
    'https://science.altmetric.com/details/60552876',  # Article 3 - Lee
    'https://jamanetwork.altmetric.com/details/64368646',  # Article 4 - Kim
    'https://science.altmetric.com/details/69584866',  # Article 5 - Mina
    'https://annals.altmetric.com/details/56459321',  # Article 6 - Hviid
    'https://scienceadvances.altmetric.com/details/69530897',  # Article 7 - Maxwell
    'https://nature.altmetric.com/details/63584063',  # Article 8 - Berzaghi
]

# Accumulators filled by the scraping loop, one entry (or one count/percentage
# pair for the demo_* lists) per article, in the same order as `links`.
scores, citations, readers, nums = [], [], [], []
demo_public, demo_scien, demo_pract, demo_sciencom, demo_unknown = (
    [] for _ in range(5)
)

def get_demo(type_name, table_rows=None):
    """Return the ``[count, percentage]`` cell texts for one audience type.

    Args:
        type_name: Label compared against the stripped text of each row's
            first ``<td>``.
        table_rows: Table rows to search.  When omitted, the module-level
            ``rows`` variable set by the scraping loop is used, which keeps
            existing one-argument calls working.

    Returns:
        A list of exactly two strings: the text of the first two
        ``<td class="num">`` cells of the first matching row, padded with
        ``'0'`` if the row has fewer, or ``['0', '0']`` when no row matches.
        Always returning two items keeps the pairwise count/percentage
        slicing of the accumulated lists aligned across articles.
    """
    if table_rows is None:
        table_rows = rows

    for row in table_rows:
        cell = row.find('td')
        if cell and cell.get_text(strip=True) == type_name:
            values = [num.text for num in row.find_all('td', class_='num')][:2]
            return values + ['0'] * (2 - len(values))

    return ['0', '0']

# Scrape the summary page of each article and append its figures to the
# accumulator lists, keeping them in the same order as `links`.
for link in links:
    # A timeout keeps one unresponsive page from hanging the run, and an
    # HTTP error is raised here rather than surfacing later as a confusing
    # parsing failure on an error page.
    page = requests.get(link, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Get the Altmetric score: the text between 'score=' and the next '&'
    # in the badge's style attribute.  partition() also handles a score
    # that is the last parameter (no trailing '&'), where slicing on
    # str.find() would cut off the final character.
    score = soup.find('div', class_='altmetric-badge')
    s = score['style']
    scores.append(s.partition('score=')[2].partition('&')[0])

    # Get citations
    citation = soup.find('div', class_='scholarly-citation-counts-wrapper')
    citations.append(citation.find('strong').text)

    # Get readers
    reader = soup.find('div', class_='reader-counts-wrapper')
    readers.append(reader.find('strong').text)

    # Get the demographics information.  `rows` is deliberately a
    # module-level name: get_demo() reads it when called with one argument.
    table = soup.find('div', class_='table-wrapper users')
    rows = table.select('.table-wrapper.users table tr')

    demo_public += get_demo("Members of the public")
    demo_scien += get_demo("Scientists")
    demo_pract += get_demo("Practitioners (doctors, other healthcare professionals)")
    demo_sciencom += get_demo("Science communicators (journalists, bloggers, editors)")
    demo_unknown += get_demo("Unknown")

# One summary row per article.  The demo_* lists hold alternating
# count/percentage entries, so even positions are counts and odd positions
# are percentages.  The score, citation and reader figures collected by the
# scraping loop are included here so that they reach the merged datasets
# instead of being discarded.
df_s = pd.DataFrame({
    'altmetric': links,
    'score': scores,
    'citations': citations,
    'readers': readers,
    'demo_public_count': demo_public[0::2],
    'demo_public_perc': demo_public[1::2],
    'demo_scientist_count': demo_scien[0::2],
    'demo_scientist_perc': demo_scien[1::2],
    'demo_scientistCommunicator_count': demo_sciencom[0::2],
    'demo_scientistCommunicator_perc': demo_sciencom[1::2],
    'demo_practitioners_count': demo_pract[0::2],
    'demo_practitioners_perc': demo_pract[1::2],
    'demo_unkown_count': demo_unknown[0::2],
    'demo_unkown_perc': demo_unknown[1::2],
})

In [None]:
df_s

### Step 2: Merge the demographic dataset with blogs, news and tweets datasets

In [None]:
def plus_demo(dataset):
    """Attach the per-article summary columns of ``df_s`` to ``dataset``.

    Both ``altmetric`` columns are compared as strings and the frames are
    left-joined on that column, so every row of ``dataset`` is kept and
    rows without a match in ``df_s`` get missing values.

    Args:
        dataset: DataFrame with an ``altmetric`` column.

    Returns:
        A new merged DataFrame.  Neither ``dataset`` nor the module-level
        ``df_s`` is modified.
    """
    # Cast on copies (assign) rather than overwriting the callers' columns.
    left = dataset.assign(altmetric=dataset['altmetric'].astype(str))
    right = df_s.assign(altmetric=df_s['altmetric'].astype(str))
    return pd.merge(left, right, on='altmetric', how='left')

# Enrich the blog, news and filtered tweet tables, in that order.
df_blog_demo, df_news_demo, df_tweets_demo = (
    plus_demo(frame) for frame in (df_blog, df_news, df_tweets_c)
)

### Step 3: Concatenate three new datasets

In [None]:
# Stack the enriched blog, news and tweet tables into one dataset.
frames = [df_blog_demo, df_news_demo, df_tweets_demo]
df_all = pd.concat(frames)
df_all

### Step 4: Export the final dataset

In [None]:
# df_all.to_excel('Altmetric_scrapedall.xlsx', index=False)

## Scrape five of the top 10 news media sources for detailed content

In [None]:
# Scrape article paragraphs for Yahoo! News
links = df_news.loc[df_news['mediasource'] == 'Yahoo! News']['medialink']

for link in links:
    fetch = None
    # Stories scraped without a link are stored as '' and cannot be
    # requested; they are reported as missing like any failed download.
    if isinstance(link, str) and link:
        try:
            fetch = requests.get(link, timeout=30)
        except requests.RequestException:
            fetch = None
    if fetch is None or fetch.status_code != 200:
        print('This link is missing')
        print('\n----------------------------------------------\n')
        continue
    soup = BeautifulSoup(fetch.text, 'html.parser')
    article = soup.find('div', class_='caas-body')
    # find() returns None when the page has no such container; print no
    # paragraphs in that case instead of failing on None.find_all().
    content = article.find_all('p') if article is not None else []
    for paragraph in content:
        print(paragraph.text)
    print('\n----------------------------------------------\n')

In [None]:
# Scrape article paragraphs for New York Times
links = df_news.loc[df_news['mediasource'] == 'New York Times', 'medialink']

# Request headers sent with every download in this cell.
head = dict([
    ('User-Agent', 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ('Accept-Encoding', 'none'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Connection', 'keep-alive'),
])

def fetch_link(link, head, timeout=30):
    """Open ``link`` with the given request headers.

    Args:
        link: URL to open.
        head: Mapping of HTTP header names to values sent with the request.
        timeout: Seconds to wait before giving up.

    Returns:
        The response object returned by ``urllib.request.urlopen``; the
        caller is responsible for reading and closing it.
    """
    # Imported here because this cell runs before the notebook's later
    # ``from urllib.request import ...`` line.
    from urllib.request import Request, urlopen

    req = Request(link, headers=head)
    return urlopen(req, timeout=timeout)

def fetch_content(link):
    """Download ``link`` with the module-level ``head`` headers.

    Args:
        link: URL to download.

    Returns:
        The response body decoded as UTF-8 text.
    """
    # The context manager closes the response once the body has been read.
    with fetch_link(link, head) as response:
        raw = response.read()
    return str(raw, encoding='utf8')

for link in links:
    # Skip anything that is not a non-empty string: stories scraped without
    # a link are stored as '', and NaN floats cannot be opened either.
    if not isinstance(link, str) or not link:
        continue
    try:
        text = fetch_content(link)
    except (OSError, ValueError):
        # urllib's URLError/HTTPError derive from OSError; ValueError covers
        # malformed URLs and bodies that are not valid UTF-8.  One bad link
        # should not stop the remaining articles from being printed.
        print('This link is missing')
        print('\n----------------------------------------------\n')
        continue
    soup = BeautifulSoup(text, 'html.parser')
    article = soup.find('div', class_="css-s99gbd StoryBodyCompanionColumn")
    if article:
        for paragraph in article.find_all('p'):
            print(paragraph.text)
        print('\n----------------------------------------------\n')

In [None]:
# Scrape article paragraphs for The Conversation
from urllib.request import Request, urlopen

links = df_news.loc[df_news['mediasource'] == 'The Conversation', 'medialink']

# Request headers sent with every download in this cell.
head = dict([
    ('User-Agent', 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ('Accept-Encoding', 'none'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Connection', 'keep-alive'),
])

def fetch_link(link, head, timeout=30):
    """Open ``link`` with the given request headers.

    Args:
        link: URL to open.
        head: Mapping of HTTP header names to values sent with the request.
        timeout: Seconds to wait before giving up.

    Returns:
        The response object returned by ``urllib.request.urlopen``; the
        caller is responsible for reading and closing it.
    """
    # Local import keeps the helper usable regardless of cell order.
    from urllib.request import Request, urlopen

    req = Request(link, headers=head)
    return urlopen(req, timeout=timeout)

def fetch_content(link):
    """Download ``link`` with the module-level ``head`` headers.

    Args:
        link: URL to download.

    Returns:
        The response body decoded as UTF-8 text.
    """
    # The context manager closes the response once the body has been read.
    with fetch_link(link, head) as response:
        raw = response.read()
    return str(raw, encoding='utf8')

# The article body container appears with either of these class strings;
# they are tried in this order.
body_classes = (
    "grid-ten large-grid-nine grid-last content-body content entry-content instapaper_body inline-promos",
    "grid-ten large-grid-nine grid-last content-body content entry-content instapaper_body",
)

for link in links:
    # Skip anything that is not a non-empty string: stories scraped without
    # a link are stored as '', and NaN floats cannot be opened either.
    if not isinstance(link, str) or not link:
        continue
    try:
        text = fetch_content(link)
    except (OSError, ValueError):
        # urllib's URLError/HTTPError derive from OSError; ValueError covers
        # malformed URLs and bodies that are not valid UTF-8.  One bad link
        # should not stop the remaining articles from being printed.
        print('This link is missing')
        print('\n----------------------------------------------\n')
        continue
    soup = BeautifulSoup(text, 'html.parser')

    article = None
    for body_class in body_classes:
        article = soup.find('div', class_=body_class)
        if article is not None:
            break

    # Pages matching neither container are skipped rather than failing on
    # None.find_all().
    if article is not None:
        for paragraph in article.find_all('p'):
            print(paragraph.text)
        print('\n----------------------------------------------\n')

In [None]:
# The two media sources below share the same website layout, so one function
# handles both of them.
def contents(name, timeout=30):
    """Print the paragraphs of every news story from one media source.

    For each ``medialink`` in ``df_news`` whose ``mediasource`` equals
    ``name``, the page is downloaded and the text of every ``<p>`` inside
    its first ``<div class="content">`` is printed, followed by a separator
    line.  Links that are empty, cannot be downloaded, or do not answer with
    status 200 are reported as missing.

    Args:
        name: Value of the ``mediasource`` column to select.
        timeout: Seconds to wait for each page before giving up.
    """
    links = df_news.loc[df_news['mediasource'] == name]['medialink']

    for link in links:
        fetch = None
        # Stories scraped without a link are stored as '' and cannot be
        # requested.
        if isinstance(link, str) and link:
            try:
                fetch = requests.get(link, timeout=timeout)
            except requests.RequestException:
                fetch = None
        if fetch is None or fetch.status_code != 200:
            print('This link is missing')
            print('\n----------------------------------------------\n')
            continue
        soup = BeautifulSoup(fetch.text, 'html.parser')
        article = soup.find('div', class_='content')
        # find() returns None when the page has no such container; print no
        # paragraphs in that case instead of failing on None.find_all().
        content = article.find_all('p') if article is not None else []
        for paragraph in content:
            print(paragraph.text)
        print('\n----------------------------------------------\n')

In [None]:
# Scrape contents for The Medical News
contents('The Medical News')

In [None]:
# Scrape contents for Newsbreak
contents('Newsbreak')