### PLOS One search results (BeautifulSoup)

In [None]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests
import time, os
import random

In [None]:
# Search query
# Terms:  "fMRI study" and "language", excluding "china" and "korea"
# Dates:  2012-01-01 through 2021-12-31 (10 years)

# Complete URL, for reference:
# 'https://journals.plos.org/plosone/search/feed/atom?filterStartDate=2012-01-01&q=%22fmri+study%22+%22language%22+-china+-korea&filterSubjects=Functional+magnetic+resonance+imaging&filterArticleTypes=Research+Article&sortOrder=DATE_NEWEST_FIRST&filterJournals=PLoSONE&page=1&filterEndDate=2021-12-31'
# url_base stops right after 'filterEndDate=' so that an end date can be
# appended later. One query parameter per line for readability.
url_base = (
    'https://journals.plos.org/plosone/search/feed/atom'
    '?filterStartDate=2012-01-01'
    '&q=%22fmri+study%22+%22language%22+-china+-korea'
    '&filterSubjects=Functional+magnetic+resonance+imaging'
    '&filterArticleTypes=Research+Article'
    '&sortOrder=DATE_NEWEST_FIRST'
    '&filterJournals=PLoSONE'
    '&page=1'
    '&filterEndDate='
)

In [None]:
# Appending '&page=2', '&page=3', etc. doesn't shorten the query results,
# so instead we cut off the most recent results by stepping "filterEndDate"
# back two months at a time.

years = [str(x) for x in range(2021, 2011, -1)]
# Zero-pad the month so every end date is a well-formed YYYY-MM-DD string,
# the same format url_base uses for filterStartDate (e.g. '02', not '2').
months = [f'{x:02d}' for x in range(12, 0, -2)]
day = '28'

monthdays = [f'{month}-{day}' for month in months]

# Newest first: '2021-12-28', '2021-10-28', ..., '2012-02-28'
enddates = [f'{year}-{monthday}' for year in years for monthday in monthdays]

# Now for the complete list of URLs
search_url_list = [url_base + enddate for enddate in enddates]

# Set up 'Subject Area' labels for later
SA_keys = [f'SA{x}' for x in range(1, 9)]  # 'SA1', 'SA2', ..., 'SA8'

In [None]:
def _get_subject_areas(article_url):
    """Return the text of every 'taxo-term' link found on an article page."""
    response = requests.get(article_url, timeout=30)
    article_soup = BeautifulSoup(response.text, "lxml")
    return [link.text for link in article_soup.find_all('a', class_='taxo-term')]


def get_article_info(res_url):
    """Collect title, URL and subject areas for each entry of a results feed.

    Args:
        res_url: URL of a search-results feed. Every <entry> element in the
            response is treated as one article.

    Returns:
        A DataFrame with one row per entry and the columns SA1..SA8 (from
        the module-level ``SA_keys``), 'Title' and 'URL'. Articles with
        fewer subject areas than SA columns leave the remainder as NaN;
        subject areas beyond the last SA column are dropped. Rows are
        numbered 0..n-1.

    Raises:
        requests.exceptions.RequestException: if a request fails or takes
            longer than the timeout.
    """
    response = requests.get(res_url, timeout=30)
    feed_soup = BeautifulSoup(response.text, "lxml")

    rows = []
    for entry in feed_soup.find_all('entry'):
        # Read the link target from its attribute instead of slicing the
        # tag's string form at fixed offsets, which silently breaks when the
        # URL length or the attribute order changes.
        url = entry.link['href']

        info = {'Title': entry.title.text, 'URL': url}

        # 'url' is one specific article; 'res_url' is the results feed.
        # zip() pairs the subject areas with SA1, SA2, ... in page order.
        info.update(zip(SA_keys, _get_subject_areas(url)))
        rows.append(info)

    # Build the table once rather than concatenating row by row.
    return pd.DataFrame(rows, columns=SA_keys + ['Title', 'URL'])

In [None]:
# Scrape every search URL in turn, accumulating the articles found

df = pd.DataFrame()

for i, search_url in enumerate(search_url_list):
    page_articles = get_article_info(search_url)
    # Successive queries can return the same article again; keep only the
    # first copy of each title.
    df = pd.concat([df, page_articles]).drop_duplicates(subset=['Title'], keep='first')

    # Pause for a random 0.5-10.5 seconds before the next query
    wait = 10 * random.random() + .5
    time.sleep(wait)
    print(f'{enddates[i]} - waited {wait:0.4} sec.')

# Put Title and URL in front of the subject-area columns
df_en = df[['Title', 'URL'] + SA_keys]
df_en

In [None]:
# For JP articles: the titles and URLs come from refs/jprefs.csv, and the
# subject areas are read from each article's page.

jp_refs = pd.read_csv('refs/jprefs.csv')

jp_titles = list(jp_refs['Title'])
jp_urls = list(jp_refs['URL'])

# Defined once, outside the loop (it does not change between articles)
SA_keys = ['SA'+str(x) for x in range(1,9)]  #'SA1', 'SA2', etc.

jp_rows = []

for title, url in zip(jp_titles, jp_urls):

    INFO = {'Title': title, 'URL': url}

    # A timeout keeps one unresponsive page from hanging the whole run
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    SA_vals = [link.text for link in soup.find_all('a', class_='taxo-term')]

    # zip() stops at the shorter input: extra terms beyond SA8 are dropped,
    # and articles with fewer terms simply leave the later SA keys unset
    INFO.update(zip(SA_keys, SA_vals))

    jp_rows.append(INFO)

# Build the table in one step. Naming the columns guarantees that every SA
# column exists (filled with NaN where unused) and rows are numbered 0..n-1.
df_jp = pd.DataFrame(jp_rows, columns=['Title', 'URL'] + SA_keys)

df_jp

In [None]:
# save your work
# Create the output folder first so that the scraped tables are not lost to
# a missing-directory error.
os.makedirs('savefiles', exist_ok=True)
df_en.to_pickle('savefiles/df_en.pkl')
df_jp.to_pickle('savefiles/df_jp.pkl')

### Determining EN articles closest to JP articles by keyword (SA) similarity

In [None]:
# load your work
# Restore the EN and JP tables from their pickle files
df_en, df_jp = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('savefiles/df_en.pkl', 'savefiles/df_jp.pkl')
)

In [None]:
# Set up: tag every row as 'EN' and move 'ID' to the front
df_en['ID'] = 'EN'
# 'ID' is already among the columns at this point, so leave it out of the
# remainder; otherwise it would be selected twice.
df_en = df_en[['ID'] + [col for col in df_en.columns if col != 'ID']]
df_en = df_en.reset_index(drop=True)
df_en

In [None]:
# Set up: tag every row as 'JP' and move 'ID' to the front
df_jp['ID'] = 'JP'
# 'ID' is already among the columns at this point, so leave it out of the
# remainder; otherwise it would be selected twice.
df_jp = df_jp[['ID'] + [col for col in df_jp.columns if col != 'ID']]
df_jp = df_jp.reset_index(drop=True)
df_jp

In [None]:
# Each JP article will be searched in turn
# Every EN article will need to be 'scored' with respect to each JP row
# Example: the keyword set of the first JP article. dropna() keeps empty
# subject-area slots (NaN) from being treated as keywords.
keywords_jp = set(df_jp.loc[0, SA_keys].dropna())
keywords_jp

In [None]:
def score_df_en_by_jp_article(keywords_jp, df=None, sa_keys=None):
    """Score EN articles by the subject areas they share with one JP article.

    An article's score is the number of its subject-area cells whose value
    appears in ``keywords_jp``. Missing values never count as a match, on
    either side.

    Args:
        keywords_jp: Iterable of subject-area strings for a single JP
            article. NaN/None entries are ignored.
        df: Table of articles to score. Defaults to the module-level
            ``df_en``. Its 'SCORE' column is (over)written in place.
        sa_keys: Names of the subject-area columns. Defaults to the
            module-level ``SA_keys``.

    Returns:
        A sorted copy of the table, highest 'SCORE' first. Rows with equal
        scores keep their original relative order.
    """
    if df is None:
        df = df_en
    if sa_keys is None:
        sa_keys = SA_keys

    # Drop missing values: a NaN in the keyword set could otherwise match the
    # NaN padding of articles that have fewer subject areas than SA columns.
    keywords = [word for word in set(keywords_jp) if pd.notna(word)]

    # One vectorised membership test instead of a Python loop over rows
    df['SCORE'] = df[list(sa_keys)].isin(keywords).sum(axis=1)

    # A stable sort makes the order of tied articles reproducible
    return df.sort_values(by='SCORE', ascending=False, kind='stable')

In [None]:
# For each JP article, print the five EN articles with the most shared
# subject areas.
for index, row in df_jp.iterrows():
    # Use the row already in hand, and drop empty subject-area slots (NaN)
    # so that they cannot be counted as shared keywords.
    keywords_jp = set(row[SA_keys].dropna())
    df_score = score_df_en_by_jp_article(keywords_jp)
    print(f"{index}. {row['Title']}")
    print(df_score[['Title', 'URL', 'SCORE']].iloc[:5, :].values)

# Partner papers decided based on this output!