<h1> Python Fundamentals <h1>

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pylab as plt
from dataprep.clean import clean_country
import pygal
import folium

<h3>Merge Two CSV Files <h3>

In [7]:
# merge the two csv files together into a data frame
articleInfo = pd.read_csv("articleInfo.csv")
authorInfo = pd.read_csv("authorInfo.csv")

mergedInfo = pd.merge(articleInfo, authorInfo, 
                   on="Article No.", 
                   how="inner")
mergedInfo.fillna(0)


Unnamed: 0,Article No.,Title,Year,Author Number,Key Words,Citation,Source,Abstract,Type,Author Name,Author Affiliation,Country,h-index
0,1,Study of Virtual Reality Immersive Technology ...,2022,3.0,"virtual reality technologies, mathematics lear...",7.0,FRONTIERS IN PSYCHOLOGY,Mathematics is an important foundation for the...,Journal,Yu-Sheng Su,National Taiwan Ocean University,Taiwan,0.0
1,1,Study of Virtual Reality Immersive Technology ...,2022,3.0,"virtual reality technologies, mathematics lear...",7.0,FRONTIERS IN PSYCHOLOGY,Mathematics is an important foundation for the...,Journal,Hung-Wei Cheng,National Taiwan Ocean University,Taiwan,0.0
2,1,Study of Virtual Reality Immersive Technology ...,2022,3.0,"virtual reality technologies, mathematics lear...",7.0,FRONTIERS IN PSYCHOLOGY,Mathematics is an important foundation for the...,Journal,Chin-Feng Lai,National Cheng Kung University,Taiwan,0.0
3,2,Factors Influencing Nursing Students' Immersiv...,2021,2.0,virtual reality; learning; immersive media tec...,3.0,SENSORS,Background/objectives: This study aims to iden...,Journal,Youngju Kim,Daejeon Health Institute of Technology,Korea,0.0
4,2,Factors Influencing Nursing Students' Immersiv...,2021,2.0,virtual reality; learning; immersive media tec...,3.0,SENSORS,Background/objectives: This study aims to iden...,Journal,Sung Yun Ahn,Pai Chai University,Korea,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,81,Teaching Enzyme Catalysis Using Interactive Mo...,2019,7.0,"Graduate Education/Research, Biochemistry, Com...",85.0,JOURNAL OF CHEMICAL EDUCATION,The reemergence of virtual reality (VR) in the...,Journal,Kenny Miller,Boise State University,USA,0.0
384,81,Teaching Enzyme Catalysis Using Interactive Mo...,2019,7.0,"Graduate Education/Research, Biochemistry, Com...",85.0,JOURNAL OF CHEMICAL EDUCATION,The reemergence of virtual reality (VR) in the...,Journal,Shalini Ramachandran,Chapman University,USA,2.0
385,81,Teaching Enzyme Catalysis Using Interactive Mo...,2019,7.0,"Graduate Education/Research, Biochemistry, Com...",85.0,JOURNAL OF CHEMICAL EDUCATION,The reemergence of virtual reality (VR) in the...,Journal,Sheree Fu,California State University Los Angeles,USA,0.0
386,81,Teaching Enzyme Catalysis Using Interactive Mo...,2019,7.0,"Graduate Education/Research, Biochemistry, Com...",85.0,JOURNAL OF CHEMICAL EDUCATION,The reemergence of virtual reality (VR) in the...,Journal,Karen Howell,University of Southern California,USA,0.0


<h3> Plot the <i>yearly_publication<i> figure <h3>

In [None]:
# plot the yearly publication figure: x - axis = year, y -axis = # articles published during that year
yearPub = mergedInfo.loc[:,"Year"].to_list() 
numArticles = {i:yearPub.count(i) for i in yearPub}

lists = sorted(numArticles.items()) # sorted by key, return a list of tuples

x, y = zip(*lists) # unpack a list of pairs into two tuples

plt.xlabel('Year')
plt.ylabel('Number of Articles Published')
plt.title("yearly_publication")

plt.plot(x, y)
plt.show()

<h3> Plot the <i>yearly_citation<i> figure <h3>

In [None]:
# plot the yearly citation figure: x axis is the year and the y axis is the total # of citations in that year
yearPub = mergedInfo.loc[:,"Year"].to_list()
citations = mergedInfo.loc[:,"Citation"].to_list()

# create new data frame with information we want
temp = {"Year": yearPub, "Citation" : citations}
newDF = pd.DataFrame(temp)

# get the sum of the citations for each year and convert to dictionary
citations = newDF.groupby('Year')['Citation'].sum().to_dict()
x = citations.keys()
y = citations.values()

# plot the data
plt.plot(x, y)
plt.xlabel("Year")
plt.ylabel("Total No. of Citations")
plt.title("yearly_citation")
plt.show()


<h3> Plot the number of publications across countries<h3>

In [None]:
# get the number of publications per country
countriesPB = mergedInfo.loc[:,"Country"].to_list()
numCountries = {i:countriesPB.count(i) for i in countriesPB}

keys = numCountries.keys()
value = numCountries.values()

# convert the names to iso standard
df = pd.DataFrame({"Countries" : keys, "Number of Publications" : value})
df = clean_country(df, "Countries",  input_format=("name", "official", "alpha-3"))
# get the alpha code for the country in order to make into map
df = clean_country(df, "Countries_clean", output_format="alpha-3")
# if nan -> remove
df = df.dropna()

# create a new dataframe and use to plot the map
publicationData = df[['Countries_clean_clean', 'Number of Publications']].copy()
publicationData.columns = ['CODE', "Num of Publications"]


world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.columns = ['pop_est', 'continent',
                 'name', 'CODE', 'gdp_md_est', 'geometry']
merge = pd.merge(world, publicationData, on='CODE', how = 'inner')
worldData = gpd.GeoDataFrame(merge)

m = worldData.explore(
    column="Num of Publications",
     scheme="naturalbreaks",
     legend=False, 
     k=10, 
     legend_kwds=dict(colorbar=False),
     name="countries"
)

folium.TileLayer('Stamen Toner', control=True).add_to(m)  # use folium to add alternative tiles
folium.LayerControl().add_to(m)  # use folium to add layer control

m  # show map
  


<h3> Top 5 Institutions and Researchers <h3>

In [11]:
# top 5 institutions and top 5 researchers
# collect institution information and make dataframe
institution = mergedInfo.loc[:, "Author Affiliation"].to_list()
numInstitutions = {i: institution.count(i) for i in institution}
institutionKeys = numInstitutions.keys()
institutionValue = numInstitutions.values()
institutionDF = pd.DataFrame({"Institution": institutionKeys, "Number of Publications": institutionValue})
institutionDF = institutionDF.sort_values(by = 'Number of Publications', ascending=False)

# collect researcher information and make dataframe
researcher = mergedInfo.loc[:, "Author Name"].to_list()
numResearcher = {i: researcher.count(i) for i in researcher}
researcherKeys = numResearcher.keys()
researcherValue = numResearcher.values()
researcherDF = pd.DataFrame(
    {"Researcher": researcherKeys, "Number of Publications": researcherValue})
researcherDF = researcherDF.sort_values(by='Number of Publications', ascending= False)

display(institutionDF)
display(researcherDF)


Unnamed: 0,Institution,Number of Publications
4,University of the Western Cape,17
69,Masaryk University,12
32,University College Cork,11
96,Intel Corporation,11
59,Madigan Army Medical Center,8
...,...,...
30,Tangshan Normal University,1
95,Huanghuai University,1
31,Xingtai University,1
91,International Ataturk Alatoo University,1


Unnamed: 0,Researcher,Number of Publications
251,Mian Usman Sattar,2
252,Sellappan Palaniappan,2
122,Guido Makransky,2
304,Chen-Wei Chang,2
135,Emilia Biffi,2
...,...,...
125,Mary Kynn,1
124,Ann L. Parkinson,1
123,Lau Lilleholt,1
121,Richard Go ̈llner,1
