### Assignment 3

##1

In [1]:
# import modules 
import requests
import xml.dom.minidom as m
import xml.etree.ElementTree as et
import json
import time

In [2]:
def get_id(disease):
    """Return the PubMed IDs of up to 1000 papers from 2022 matching ``disease``.

    Sends one request to the NCBI E-utilities ``esearch`` endpoint and collects
    the text of every ``<Id>`` element in the XML response.

    Parameters
    ----------
    disease : str
        Search term, e.g. ``"Alzheimers"``.

    Returns
    -------
    list of str
        PubMed IDs in the order they appear in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Passing the query through ``params`` lets requests URL-encode the term,
    # so multi-word terms or terms with special characters cannot corrupt the URL.
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": f"{disease} AND 2022[pdat]",
            "retmax": 1000,
            "retmode": "xml",
        },
        timeout=30,
    )
    # Fail loudly here rather than with a confusing XML parse error below.
    r.raise_for_status()
    # Pause between calls to stay polite towards the E-utilities servers.
    time.sleep(1)
    doc = m.parseString(r.text)
    # Skip empty <Id/> elements, whose firstChild would be None.
    return [
        node.firstChild.data
        for node in doc.getElementsByTagName('Id')
        if node.firstChild is not None
    ]

In [3]:
len(get_id("Alzheimers") + get_id("Cancer"))

2000

Finding an overlap between two sets of papers

In [4]:
def overlap_papers(disease1,disease2):
    """Report and return the PubMed IDs that appear under both search terms.

    Parameters
    ----------
    disease1, disease2 : str
        Search terms passed to ``get_id``.

    Returns
    -------
    None, str or list of str
        ``None`` when the two result sets share no paper, the single shared ID
        when there is exactly one, otherwise the shared IDs as a sorted list.
    """
    # Sorting makes the reported order reproducible; raw set order is arbitrary.
    overlap = sorted(set(get_id(disease1)) & set(get_id(disease2)))
    if not overlap:
        print("There is no overlap in the two sets of papers")
        return None
    if len(overlap) == 1:
        print(f"There is an overlap in the two sets of papers, the Pubmed Id is {overlap[0]}")
        return overlap[0]
    print(f"There are overlaps in the two sets of papers, the Pubmed Ids are {overlap}")
    return overlap

In [5]:
overlap_papers('Alzheimers','cancer')

There are overlaps in the two sets of papers, the Pubmed Ids are['36321363', '36321615']


['36321363', '36321615']

Finding the Metadata of the papers in Alzheimers and Cancer sets

In [6]:
def _node_text(node):
    """Return all text inside a minidom node, descending through nested markup."""
    if node.nodeType in (m.Node.TEXT_NODE, m.Node.CDATA_SECTION_NODE):
        return node.data
    return "".join(_node_text(child) for child in node.childNodes)


def _joined_text(doc, tag):
    """Concatenate, in document order, the text of every ``tag`` element in ``doc``."""
    return "".join(_node_text(elm) for elm in doc.getElementsByTagName(tag))


def find_metadata(disease):
    """Fetch title, abstract and MeSH terms for every paper returned by ``get_id``.

    One ``efetch`` request is sent per PubMed ID, with a one-second pause
    before each request.

    Parameters
    ----------
    disease : str
        Search term; also stored under the ``'Query'`` key of every record.

    Returns
    -------
    dict
        Maps each PubMed ID (str) to a dict with the keys ``'ArticleTitle'``
        (str), ``'ArticleAbstract'`` (str), ``'Query'`` (str) and ``'Mesh'``
        (list of str).

    Raises
    ------
    requests.HTTPError
        If an ``efetch`` request answers with an error status.
    """
    PubmedIdList = get_id(disease)
    disease_dictionary = {}
    for PubmedId in PubmedIdList:
        time.sleep(1)
        r = requests.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
            params={"db": "pubmed", "retmode": "xml", "id": int(PubmedId)},
            timeout=30,
        )
        # Surface HTTP errors instead of trying to parse an error page as XML.
        r.raise_for_status()
        doc = m.parseString(r.text)

        # Titles and abstracts may contain inline markup (<i>, <sub>, ...).
        # The recursive helper collects text at any nesting depth; the previous
        # version called et.tostring() on a plain string and only worked
        # because the resulting exception was swallowed.
        Title = _joined_text(doc, 'ArticleTitle')
        Abstract = _joined_text(doc, 'AbstractText')

        # Look the descriptor up by tag name so that one malformed heading
        # (or leading whitespace inside it) cannot drop the remaining terms.
        ArticleMeshTerms = []
        for heading in doc.getElementsByTagName('MeshHeading'):
            descriptors = heading.getElementsByTagName('DescriptorName')
            if descriptors:
                ArticleMeshTerms.append(_node_text(descriptors[0]))

        disease_dictionary[PubmedId] = {
            'ArticleTitle': Title,
            'ArticleAbstract': Abstract,
            'Query': disease,
            'Mesh': ArticleMeshTerms
        }

    return  disease_dictionary

Saving separate JSON files for Alzheimers & Cancer

In [None]:
# Download the metadata for each query term.
alz_data = find_metadata('Alzheimers')
cancer_data = find_metadata('cancer')

# Write each result set to its own JSON file (alzheimers.json, cancer.json).
for json_name, metadata in (('alzheimers.json', alz_data), ('cancer.json', cancer_data)):
    with open(json_name, 'w') as f:
        json.dump(metadata, f)

In [None]:
alz_data

In [None]:
cancer_data

In [None]:
# Merge the two result sets already held in alz_data and cancer_data instead
# of downloading every record a second time (one request per paper, one
# second apart). Papers found under both queries keep their cancer entry,
# because later keys win -- the same outcome dict.update() produced here.
both_papers_data = {**alz_data, **cancer_data}

with open('combined.json','w') as f:
    json.dump(both_papers_data,f)

In [None]:
len(both_papers_data)

##2

In [None]:
conda install pytorch torchvision -c pytorch

In [None]:
pip install transformers

In [7]:
from transformers import AutoTokenizer, AutoModel
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

In [None]:
import numpy as np
import json
import pandas as pd

In [None]:
with open("alzheimers.json") as f:
    alz_meta = json.load(f)

alz_data_format = pd.DataFrame.from_dict(alz_meta, orient = 'index') 
alz_data_format.head()

In [None]:
with open("cancer.json") as f:
    cancer_meta = json.load(f)

cancer_data_frame = pd.DataFrame.from_dict(cancer_meta, orient = 'index') 
cancer_data_frame.head()

In [None]:
both_papers = pd.concat([alz_data_format,cancer_data_frame])

In [None]:
both_papers

In [None]:
# read JSON file using the open function.
with open('combined.json') as f:
    both_papers_data = json.load(f)

In [None]:
# One input string per paper: title and abstract joined by the separator token.
data = [paper["ArticleTitle"] + tokenizer.sep_token + paper["ArticleAbstract"] for paper in both_papers_data.values()]

# Embed the papers one at a time and collect the rows in a list. Stacking once
# at the end avoids re-copying the growing array on every iteration, and the
# first paper no longer needs its own copy of the loop body.
embeddings = []
for text in data:
    inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=512)
    result = model(**inputs)
    # Take the vector at position 0 of the last hidden state. copy() detaches
    # the small row from the tensor's storage so the full hidden state of
    # every paper is not kept alive by the list.
    embeddings.append(result.last_hidden_state[:, 0, :].detach().numpy().copy())

embed_total = np.concatenate(embeddings, axis=0)
    

In [None]:
import pandas as pd
from sklearn import decomposition

In [None]:
# first 3 principal components. 
pca = decomposition.PCA(n_components=3)
embed_pca = pd.DataFrame(
    pca.fit_transform(embed_total),
    columns=['PC0', 'PC1', 'PC2']
)
embed_pca["Query"] = [paper["Query"] for paper in both_papers_data.values()]

In [None]:
embed_pca

In [None]:
import plotnine as p9

In [None]:
#PC0 vs PC1
(p9.ggplot(data = embed_pca, mapping = p9.aes(x='PC0', y='PC1'))
+ p9.geom_point(p9.aes(x = 'PC0', y = 'PC1', color = 'Query'))
+ p9.labs(title = "PC0 vs PC1"))

In [None]:
# PC0 vs PC2
(p9.ggplot(data = embed_pca, mapping = p9.aes(x='PC0', y='PC2'))
+ p9.geom_point(p9.aes(x = 'PC0', y = 'PC2', color = 'Query'))
+ p9.labs(title = "PC0 vs PC2"))

In [None]:
#PC1 vs PC2
(p9.ggplot(data = embed_pca, mapping = p9.aes(x='PC1', y='PC2'))
+ p9.geom_point(p9.aes(x = 'PC1', y = 'PC2', color = 'Query'))
+ p9.labs(title = "PC1 vs PC2"))

##3

In [None]:
import matplotlib.pyplot as plt
def plot_with_explicit_Eulers(s0,i0,r0,B,g, tMax, plot=True):
    """Integrate the S and I equations with explicit Euler and locate the infection peak.

    The update rule is ``s' = -B/N*s*i`` and ``i' = B/N*s*i - g*i`` with
    ``N = s0 + i0 + r0``, using 1000 equal steps over ``[0, tMax]``.

    Parameters
    ----------
    s0, i0, r0 : number
        Initial susceptible, infected and recovered counts (``r0`` only
        contributes to ``N``).
    B, g : number
        Coefficients of the infection and removal terms above.
    tMax : number
        End of the simulated time range.
    plot : bool, optional
        When true (the default) the infected curve is drawn with ``plt.plot``.
        Pass ``False`` to compute the peak without touching matplotlib.

    Returns
    -------
    tuple
        ``(peak_day, peak_infections)``: the time just before the infected
        count first decreases, and the infected count at that time rounded to
        an integer. If the count never decreases within the simulated range,
        the final time point and its rounded infected count are returned.
    """
    #Initialize parameters and arrays
    numSteps=1000
    tStep=tMax/numSteps
    N=s0+i0+r0
    t=[0]*(numSteps+1)
    s=[0]*(numSteps+1)
    i=[0]*(numSteps+1)
    s[0]=s0
    i[0]=i0
    peak_found=False

    #Calculate i over time range
    for j in range(1,numSteps+1):
        t[j]=t[j-1]+tStep
        s[j]=s[j-1]+(tStep)*(-B/N*s[j-1]*i[j-1])
        i[j]=i[j-1]+(tStep)*(B/N*s[j-1]*i[j-1]-g*i[j-1])
        #The first step at which i decreases marks the previous point as the peak
        if not peak_found and i[j]<i[j-1]:
            peak_day=t[j-1]
            peak_infections=round(i[j-1])
            peak_found=True

    #i never decreased: the largest value seen is the last one, so report it
    #instead of failing on unassigned result variables
    if not peak_found:
        peak_day=t[-1]
        peak_infections=round(i[-1])

    if plot:
        plt.plot(t,i,label="Infected People")
    return peak_day,peak_infections


In [None]:
[peak_day,peak_infections]=plot_with_explicit_Eulers(13399,1,0,2,1,30)
print("Peak day: ", peak_day)
print("Peak infections: ", peak_infections)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
Bs=[]
gs=[]
peak_days=[]
peak_infections=[]

# np.arange with a fractional step decides whether the end value is included
# by floating-point rounding and yields values such as 1.9500000000000002.
# linspace states the grid explicitly (both ends included), and rounding keeps
# the values clean for use as heatmap row/column labels.
B_values=np.linspace(1.8,2.2,9).round(2)
g_values=np.linspace(0.9,1.1,11).round(2)

for B in B_values:
    for g in g_values:
        peak_day,peak_infection=plot_with_explicit_Eulers(13399,1,0,B,g,100)
        Bs.append(B)
        gs.append(g)
        peak_days.append(peak_day)
        peak_infections.append(peak_infection)

In [None]:
dicts={"B":Bs,"g":gs,"peak_days":peak_days,"peak_infections":peak_infections}
data=pd.DataFrame(dicts)

In [None]:
#infection time in days
sns.set()
# pivot() takes index/columns/values as keywords; recent pandas rejects positional use
sns.heatmap(data.pivot(index="B",columns="g",values="peak_days"))
plt.title("Heatmap of peak infection time in days as a function of Beta and gamma")

In [None]:
#peak number of infected people
sns.set()
# pivot() takes index/columns/values as keywords; recent pandas rejects positional use
sns.heatmap(data.pivot(index="B",columns="g",values="peak_infections"))
# This map shows a head count, not a time, so the title no longer says "in days"
plt.title("Heatmap of peak number of infected people as a function of Beta and gamma")

##4

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import datetime as dt
import warnings

In [None]:
# Load the life-expectancy CSV into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists -- consider a path relative to the notebook.
df = pd.read_csv('/Users/polina/Desktop/Life Expectancy Data.csv')
df

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
# Looking for null value in the data
df.isnull().sum()

In [None]:
# Replacing the Null Values with mean values of the data
from sklearn.impute import SimpleImputer
#reference: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
imputer=SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=None)

# Columns whose missing entries are filled with that column's own mean.
# The names are copied verbatim, including their stray leading/trailing spaces.
columns_to_impute=[
    'Life expectancy ',
    'Adult Mortality',
    'Alcohol',
    'Hepatitis B',
    ' BMI ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
]

# The imputer is refitted for every column, so each one gets its own mean.
for column in columns_to_impute:
    df[column]=imputer.fit_transform(df[[column]])

In [None]:
# Looking for null value in the data after fitting
df.isnull().sum()

In [None]:
# Changing/Renaming the columns for easy access.
df = df.rename(columns={'Country': 'country', 'Year': 'year', 'Status': 'status', 'Life expectancy ': 'life_expectancy', 'Adult Mortality': 'adult_mortality',
       'infant deaths':'infant_death', 'Alcohol':'alcohol', 'percentage expenditure': 'percentage_expenditure', 'Hepatitis B':'Hepatitis_b',
       'Measles ':'measles', ' BMI ':'bmi', 'under-five deaths ':'under_five_deaths', 'Polio':'polio', 'Total expenditure': 'total_expenditure','Diphtheria ':'diphtheria', ' HIV/AIDS':'hiv_Aids', 'GDP':'gdp', 'Population':'population',
       ' thinness  1-19 years':'thinness_1_to_19', ' thinness 5-9 years':'thinness_5_to_9',
       'Income composition of resources':'income_composition_of_resources', 'Schooling': 'schooling'})

In [None]:
# Looking for columns after rename
df.columns

In [None]:
#Distribution of Life Expectancy according to the age
fig = px.histogram(df, x = 'life_expectancy')
fig.show()

In [None]:
#Comparing the life expectancy of Developing and Developed Countries
fig = px.violin(df, x= 'status', y= 'life_expectancy',
                color = 'status',box = True,title='Life Expectancy on the Basis of Country Status')
fig.show()

In [None]:
#Country Wise Life Expectancy over the years
fig = px.line((df.sort_values(by = 'year')), x = 'year', y = 'life_expectancy',
    animation_frame= 'country',animation_group='year',color='country',
    markers=True,title='<b>Country Wise Life Expectancy over the years')
fig.show()

In [None]:
country_df = px.data.gapminder()
country_df.tail()

In [None]:
#Life Expectancy over the World Map
map_fig = px.scatter_geo(country_df,locations = 'iso_alpha', projection = 'orthographic', 
                         opacity = 0.8, color = 'country', hover_name = 'country', 
                         hover_data = ['lifeExp', 'year'],template = 'plotly_dark',title = '<b>Life Expectancy over the World Map')
map_fig.show()

References
Kumar R. (2017). Life Expectancy (WHO). Retrieved from https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who