In [11]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
from pybliometrics import scopus
from BibAnalysis.main import Corpus
from BibAnalysis import Bibliography_Parse
import matplotlib.pyplot as plt
from geopy.geocoders import Photon
from collections import Counter
from matplotlib_venn_wordcloud import venn3_wordcloud, venn2_wordcloud
import os
# Discipline codes used to label and group bibliography entries.
# A module-level assignment is already global, so no `global` statement is needed.
disciplines = ['AB', 'BA', 'SBE']

# Step 1: Load Data

In [2]:
def pull_scopus_data(update=False):
    """Return the combined bibliography table read from the cached CSV.

    Parameters
    ----------
    update : bool, default False
        When True, the query strings stored in ``Data/Text/<code>_query.txt``
        are re-run against Scopus, each result set is labelled with its
        discipline code, and the combined table overwrites the cached CSV
        before it is read back. When False, only the cached CSV is read.

    Returns
    -------
    pandas.DataFrame
        The contents of ``Data/Tabular/bioscience_bibliography.csv``.
    """
    bibliography_path = 'Data/Tabular/bioscience_bibliography.csv'

    if update:
        # Search order (SBE, AB, BA) matches the row order of the cached table.
        search_order = ('SBE', 'AB', 'BA')

        # Read every query up front so a missing query file fails before any
        # Scopus request is made; `with` guarantees the handles are closed.
        queries = {}
        for discipline in search_order:
            with open(f'Data/Text/{discipline}_query.txt', 'r') as query_file:
                queries[discipline] = query_file.read()

        labelled_entries = []
        for discipline in search_order:
            search = scopus.ScopusSearch(queries[discipline], subscriber=True, view='COMPLETE')
            entries = pd.DataFrame(search.results)
            # Label data by discipline before merging tables
            entries['Discipline'] = discipline
            labelled_entries.append(entries)

        # NOTE(review): the index is written too, which is why the reloaded
        # table carries an 'Unnamed: 0' column. Left unchanged so the cached
        # file keeps its existing schema.
        pd.concat(labelled_entries).to_csv(bibliography_path)

    return pd.read_csv(bibliography_path)

# Read the cached bibliography; pass update=True to re-query Scopus first
bioscience_bibliography = pull_scopus_data(update=False)

# Standard hex code per discipline, exposed in the global namespace
color_dict = {
    'Astrobiology': '#2FD03D',
    'Bioastronautics': '#3D2FD0',
    'Space Bioprocess Engineering': '#D03D2F',
}
color_list = list(color_dict.values())
print(bioscience_bibliography.columns)

Index(['Unnamed: 0', 'eid', 'doi', 'pii', 'pubmed_id', 'title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'publicationName', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'Discipline'],
      dtype='object')


# Step 2: Clean Keywords

In [None]:
# Build a long-format keyword table: one row per (bibliography entry, cleaned author keyword)
def clean_bibliography(bioscience_bibliography, explode_phrases=False):
    """Expand the bibliography into one row per cleaned author keyword.

    The input frame is not modified. Entries without author keywords are
    dropped. Each remaining entry's ``authkeywords`` string is split on
    ``|``, and every keyword is lower-cased and stripped of surrounding
    whitespace. Keywords that end up empty (for example from a trailing
    separator) are discarded so they cannot be counted later.

    Parameters
    ----------
    bioscience_bibliography : pandas.DataFrame
        Table with an ``authkeywords`` column of ``|``-separated keywords.
    explode_phrases : bool, default False
        When True, multi-word keywords are further split on whitespace into
        single words and all non-word characters are removed from each word.

    Returns
    -------
    pandas.DataFrame
        Copy of the input rows, repeated once per keyword, with the keyword
        in a new ``Cleaned Author Keywords`` column. The original index
        labels are kept, so they repeat for entries with several keywords.
    """
    keyword_column = 'Cleaned Author Keywords'

    # Drop keyword-less entries before touching `.str`: a column that is
    # entirely NaN is read back as float and has no string accessor.
    kw_df = bioscience_bibliography.dropna(subset=['authkeywords']).copy()

    # Split and clean author keywords, one row per keyword
    kw_df[keyword_column] = kw_df['authkeywords'].astype(str).str.split('|')
    kw_df = kw_df.explode(keyword_column)
    kw_df[keyword_column] = kw_df[keyword_column].str.lower().str.strip()

    if explode_phrases:
        # Split on any run of whitespace so repeated spaces do not create
        # empty tokens, then explode again to get one row per word
        kw_df[keyword_column] = kw_df[keyword_column].str.split()
        kw_df = kw_df.explode(keyword_column)

        # Remove non-alphanumeric characters from each word
        kw_df[keyword_column] = kw_df[keyword_column].str.replace(r'\W+', '', regex=True)

    # Discard keywords that cleaning reduced to nothing ('' or missing)
    has_keyword = kw_df[keyword_column].notna() & kw_df[keyword_column].ne('')
    return kw_df[has_keyword]

# Keep multi-word keywords intact (explode_phrases=False), then preview the result
kw_df = clean_bibliography(bioscience_bibliography, explode_phrases=False)
kw_df[['Discipline', 'Cleaned Author Keywords']].head()

# Step 3: Create Keyword Table (optionally, save to CSV)

In [17]:
def generate_keyword_table(kw_df, save_to_csv=True, path='./Space Biosciences Author Keyword Table', nrows: int=None, rounding: int=2, discipline_labels=None):
    """Tabulate how often each cleaned author keyword occurs in every discipline.

    For each discipline the table holds the keyword's count and its
    prevalence, i.e. the count as a percentage of all keyword occurrences in
    that discipline. ``Scaled Prevalence (%)`` is the unweighted mean of the
    per-discipline prevalences, so every discipline contributes equally
    regardless of its corpus size. Rows are sorted by scaled prevalence,
    highest first. The corpus size of each discipline is printed.

    Parameters
    ----------
    kw_df : pandas.DataFrame
        One row per keyword occurrence, with ``Discipline``, ``eid`` and
        ``Cleaned Author Keywords`` columns.
    save_to_csv : bool, default True
        Write the full (untruncated, unrounded) table to ``path``.
    path : str
        Destination used when ``save_to_csv`` is True.
    nrows : int, optional
        When given, only the top ``nrows`` rows are returned.
    rounding : int, default 2
        Decimal places for the returned table. Applied only when ``nrows``
        is given; otherwise the table is returned unrounded.
    discipline_labels : sequence of str, optional
        Discipline codes to tabulate, in column order. Defaults to the
        notebook-level ``disciplines`` list.

    Returns
    -------
    pandas.DataFrame
        Indexed by keyword, with ``Count in <d>`` and ``Prevalence in <d> (%)``
        columns per discipline, followed by ``Scaled Prevalence (%)``.

    Raises
    ------
    ValueError
        If no discipline labels are available.
    """
    if discipline_labels is None:
        discipline_labels = disciplines
    discipline_labels = list(discipline_labels)
    if not discipline_labels:
        raise ValueError('At least one discipline label is required')

    keyword_column = 'Cleaned Author Keywords'

    # One (count, prevalence) table per discipline, indexed by keyword
    per_discipline_tables = []
    for discipline in discipline_labels:
        in_discipline = kw_df.loc[kw_df['Discipline'] == discipline]
        counts = in_discipline.groupby(keyword_column)['eid'].count()
        corpus_size = counts.sum()
        print(f'{discipline} Corpus Size: {corpus_size}')

        table = counts.to_frame(f'Count in {discipline}')
        table[f'Prevalence in {discipline} (%)'] = table[f'Count in {discipline}'] / corpus_size * 100
        per_discipline_tables.append(table)

    # Outer-merge on the keyword index so keywords missing from a discipline
    # are kept; their gaps become zero counts and zero prevalence below
    final_df = per_discipline_tables[0]
    for table in per_discipline_tables[1:]:
        final_df = pd.merge(final_df, table, how='outer', on=keyword_column)
    final_df = final_df.fillna(0)

    # Equal weight per discipline, independent of corpus size
    weight = len(discipline_labels)
    final_df['Scaled Prevalence (%)'] = sum(
        final_df[f'Prevalence in {discipline} (%)'] / weight for discipline in discipline_labels
    )
    final_df = final_df.sort_values('Scaled Prevalence (%)', ascending=False)

    # The outer merge turned counts into floats (NaN gaps); restore integers
    for discipline in discipline_labels:
        final_df[f'Count in {discipline}'] = final_df[f'Count in {discipline}'].astype(int)

    if save_to_csv:
        final_df.to_csv(path)
    if nrows is not None:
        final_df = final_df.head(nrows).round(rounding)
    return final_df

# Save the full keyword table to the default path, then display the top 20 rows rounded to 2 decimals
generate_keyword_table(kw_df, save_to_csv=True, nrows=20, rounding=2)

AB Corpus Size: 49792
BA Corpus Size: 27650
SBE Corpus Size: 1071


Unnamed: 0_level_0,Count in AB,Prevalence in AB (%),Count in BA,Prevalence in BA (%),Count in SBE,Prevalence in SBE (%),Scaled Prevalence (%)
Cleaned Author Keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
astrobiology,2234,4.49,26,0.09,10,0.93,1.84
microgravity,19,0.04,1260,4.56,4,0.37,1.66
mars,717,1.44,85,0.31,18,1.68,1.14
origin of life,1497,3.01,0,0.0,1,0.09,1.03
spaceflight,12,0.02,660,2.39,1,0.09,0.83
prebiotic chemistry,657,1.32,0,0.0,0,0.0,0.44
in situ resource utilization,2,0.0,8,0.03,13,1.21,0.42
international space station,31,0.06,166,0.6,5,0.47,0.38
moon,62,0.12,71,0.26,7,0.65,0.34
space exploration,29,0.06,62,0.22,8,0.75,0.34
