In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
from bs4 import BeautifulSoup
import time
import pandas as pd

In [2]:
def extract_page_data(driver):
    results = []
    page_source = driver.page_source
    
    soup = BeautifulSoup(page_source, 'html.parser')
    res = soup.find_all("div", class_= "docsum-content")
    for result in res:
        title = result.find("a", class_="docsum-title")
        authors = result.find("span", class_="docsum-authors full-authors")
        abstract = result.find("div", class_="full-view-snippet")
        pmid = result.find("span", class_="docsum-pmid")
        citation = result.find("span", class_ = "docsum-journal-citation short-journal-citation")

        if title and authors:
            title_text = title.text.strip()
            authors_text = authors.text.strip()
            abstract_text = abstract.text.strip() if abstract else "Abstract not available"
            pmid = pmid.text.strip()
            journal = citation.text.strip()
            year, journal_ = int(journal.split(" ")[-1].replace('.', '')), " ".join(journal.split(" ")[:-1])
            
            results.append({
                "Title": title_text,
                "Authors": authors_text,
                "Abstract": abstract_text,
                "PMID": pmid,
                "Journal": journal_,
                "Year":year
            })
    return results


In [4]:
driver = webdriver.Chrome()

if __name__ == "__main__":
    final_results = []
    search_term = ["Network science", "cryptography", "quantum physics", "neural network", "machine learning", "information retrieval", "compression"]
    driver.get('https://pubmed.ncbi.nlm.nih.gov/')

    for term in search_term:
        search_input = driver.find_element(By.ID, "id_term")
        search_input.send_keys(Keys.CONTROL + "a")  # Select all text in the input field
        search_input.send_keys(Keys.DELETE)  
        search_input.send_keys(term)
        driver.find_element(By.CLASS_NAME, "search-btn").click()
        for i in range(10):
            page_output = extract_page_data(driver)
            final_results.extend(page_output)
            
            next_page = driver.find_element(By.CLASS_NAME, "next-page-btn")
            if "disabled" in next_page.get_attribute("class"):
                break
            
            next_page.click()
            time.sleep(2)
    driver.quit()

In [5]:
df = pd.DataFrame(final_results)
df

Unnamed: 0,Title,Authors,Abstract,PMID,Journal,Year
0,Network neuroscience.,"Bassett DS, Sporns O.",Two parallel trends drive the approach: the av...,28230844,Nat Neurosci.,2017
1,The network science of collective intelligence.,Centola D.,"In the last few years, breakthroughs in comput...",36180361,Trends Cogn Sci.,2022
2,The Role of Network Science in Glioblastoma.,"Lopes MB, Martins EP, Vinga S, Costa BM.",Network science has long been recognized as a ...,33801334,Cancers (Basel).,2021
3,Cognitive Network Science for Understanding On...,Stella M.,Since social media data come from users' minds...,34118113,Top Cogn Sci.,2022
4,Network science.,Barabási AL.,Professor Barabasi's talk described how the to...,23419844,Philos Trans A Math Phys Eng Sci.,2013
...,...,...,...,...,...,...
695,Fixed and dynamic urethral compression for the...,"Madjar S, Raz S, Gousse AE.",Therapeutic fixed and dynamic urethral compres...,11458038,J Urol.,2001
696,Highly compressible glass-like supramolecular ...,"Huang Z, Chen X, O'Neill SJK, Wu G, Whitaker D...",Previous studies have focused on optimizing su...,34819661,Nat Mater.,2022
697,On-Demand Indexing for Referential Compression...,"Alves F, Cogo V, Wandelt S, Leser U, Bessani A.",The general idea is to store in the compressed...,26146838,PLoS One.,2015
698,Effect of Emergency Department Mattress Compre...,"Cheng A, Belanger C, Wan B, Davidson J, Lin Y.",INTRODUCTION: Cardiopulmonary resuscitation (C...,28697056,Simul Healthc.,2017


In [95]:
if df.iloc[27]['Abstract'] == '':
    print(True)

True


In [6]:
df = df[df['Abstract'].str.strip() != '']
df

Unnamed: 0,Title,Authors,Abstract,PMID,Journal,Year
0,Network neuroscience.,"Bassett DS, Sporns O.",Two parallel trends drive the approach: the av...,28230844,Nat Neurosci.,2017
1,The network science of collective intelligence.,Centola D.,"In the last few years, breakthroughs in comput...",36180361,Trends Cogn Sci.,2022
2,The Role of Network Science in Glioblastoma.,"Lopes MB, Martins EP, Vinga S, Costa BM.",Network science has long been recognized as a ...,33801334,Cancers (Basel).,2021
3,Cognitive Network Science for Understanding On...,Stella M.,Since social media data come from users' minds...,34118113,Top Cogn Sci.,2022
4,Network science.,Barabási AL.,Professor Barabasi's talk described how the to...,23419844,Philos Trans A Math Phys Eng Sci.,2013
...,...,...,...,...,...,...
695,Fixed and dynamic urethral compression for the...,"Madjar S, Raz S, Gousse AE.",Therapeutic fixed and dynamic urethral compres...,11458038,J Urol.,2001
696,Highly compressible glass-like supramolecular ...,"Huang Z, Chen X, O'Neill SJK, Wu G, Whitaker D...",Previous studies have focused on optimizing su...,34819661,Nat Mater.,2022
697,On-Demand Indexing for Referential Compression...,"Alves F, Cogo V, Wandelt S, Leser U, Bessani A.",The general idea is to store in the compressed...,26146838,PLoS One.,2015
698,Effect of Emergency Department Mattress Compre...,"Cheng A, Belanger C, Wan B, Davidson J, Lin Y.",INTRODUCTION: Cardiopulmonary resuscitation (C...,28697056,Simul Healthc.,2017


In [7]:
df.to_csv("F://IR/Data700.csv")

In [3]:
csv_file = 'F://IR/Data/Data700.csv'
df = pd.read_csv(csv_file)

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Title,Authors,Abstract,PMID,Journal,Year
0,0,Network neuroscience.,"Bassett DS, Sporns O.",Two parallel trends drive the approach: the av...,28230844,Nat Neurosci.,2017
1,1,The network science of collective intelligence.,Centola D.,"In the last few years, breakthroughs in comput...",36180361,Trends Cogn Sci.,2022
2,2,The Role of Network Science in Glioblastoma.,"Lopes MB, Martins EP, Vinga S, Costa BM.",Network science has long been recognized as a ...,33801334,Cancers (Basel).,2021
3,3,Cognitive Network Science for Understanding On...,Stella M.,Since social media data come from users' minds...,34118113,Top Cogn Sci.,2022
4,4,Network science.,Barabási AL.,Professor Barabasi's talk described how the to...,23419844,Philos Trans A Math Phys Eng Sci.,2013
...,...,...,...,...,...,...,...
571,695,Fixed and dynamic urethral compression for the...,"Madjar S, Raz S, Gousse AE.",Therapeutic fixed and dynamic urethral compres...,11458038,J Urol.,2001
572,696,Highly compressible glass-like supramolecular ...,"Huang Z, Chen X, O'Neill SJK, Wu G, Whitaker D...",Previous studies have focused on optimizing su...,34819661,Nat Mater.,2022
573,697,On-Demand Indexing for Referential Compression...,"Alves F, Cogo V, Wandelt S, Leser U, Bessani A.",The general idea is to store in the compressed...,26146838,PLoS One.,2015
574,698,Effect of Emergency Department Mattress Compre...,"Cheng A, Belanger C, Wan B, Davidson J, Lin Y.",INTRODUCTION: Cardiopulmonary resuscitation (C...,28697056,Simul Healthc.,2017


In [5]:
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
import re
import pickle
tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [6]:
def remove_stopwords(text, is_lower_case=False):
    pattern = r'[^a-zA-z0-9s]'
    text = re.sub(pattern," ",''.join(text))
    tokens = tokenizer.tokenize(text)
    tokens = [tok.strip() for tok in tokens]
    if is_lower_case:
        cleaned_token = [tok for tok in tokens if tok not in stopword_list]
    else:
        cleaned_tokens = [tok for tok in tokens if tok.lower() not in stopword_list]
    filtered_text = ' '.join(cleaned_tokens)
    return filtered_text

In [7]:
glove_vectors = {}
glove_file = 'F://IR/glove6B/glove.6B.300d.txt'
file = open(glove_file, encoding = 'utf-8')
for line in file:
    values = line.split()
    word = values[0]
    vectors = np.asarray(values[1:])
    glove_vectors[word] = vectors
file.close()

In [5]:
glove_vectors["machine"]    

array(['-0.20598', '0.84199', '0.32655', '-0.5508', '-0.71978', '0.4463',
       '0.65963', '-0.083757', '0.31733', '-1.6928', '0.18901',
       '-0.56237', '0.6096', '-0.10732', '0.36826', '-0.053698',
       '0.041387', '-0.28029', '-0.43242', '0.36917', '0.56726',
       '0.39637', '0.15227', '-0.11159', '0.20497', '0.25431', '0.57174',
       '-0.24691', '0.26312', '-0.076046', '-0.26779', '0.048289',
       '0.31213', '0.51489', '-0.13457', '0.53894', '0.15186', '-0.21444',
       '-0.2392', '0.29854', '-0.073615', '0.44198', '-0.13692',
       '-0.22565', '0.46039', '0.1428', '0.70316', '-0.23107', '0.63721',
       '0.17203', '0.21067', '0.32497', '-0.02218', '0.5431', '-0.88348',
       '-0.53835', '-0.044051', '-0.12978', '0.31842', '0.31618',
       '0.22351', '0.021186', '-0.05058', '0.0056521', '-0.021808',
       '0.61592', '-0.71762', '0.20032', '0.2207', '0.32781', '0.27289',
       '0.37679', '0.46449', '0.12419', '0.014284', '0.36029', '-0.11143',
       '0.037234', '0

In [6]:
with open('F://IR/glove_vectors.pkl', 'wb') as file:
    pickle.dump(glove_vectors, file)

In [3]:
with open('F://IR/glove_vectors.pkl', 'rb') as file:
    glove_vectors1 = pickle.load(file)

In [4]:
glove_vectors1["machine"]    

array(['-0.20598', '0.84199', '0.32655', '-0.5508', '-0.71978', '0.4463',
       '0.65963', '-0.083757', '0.31733', '-1.6928', '0.18901',
       '-0.56237', '0.6096', '-0.10732', '0.36826', '-0.053698',
       '0.041387', '-0.28029', '-0.43242', '0.36917', '0.56726',
       '0.39637', '0.15227', '-0.11159', '0.20497', '0.25431', '0.57174',
       '-0.24691', '0.26312', '-0.076046', '-0.26779', '0.048289',
       '0.31213', '0.51489', '-0.13457', '0.53894', '0.15186', '-0.21444',
       '-0.2392', '0.29854', '-0.073615', '0.44198', '-0.13692',
       '-0.22565', '0.46039', '0.1428', '0.70316', '-0.23107', '0.63721',
       '0.17203', '0.21067', '0.32497', '-0.02218', '0.5431', '-0.88348',
       '-0.53835', '-0.044051', '-0.12978', '0.31842', '0.31618',
       '0.22351', '0.021186', '-0.05058', '0.0056521', '-0.021808',
       '0.61592', '-0.71762', '0.20032', '0.2207', '0.32781', '0.27289',
       '0.37679', '0.46449', '0.12419', '0.014284', '0.36029', '-0.11143',
       '0.037234', '0

In [8]:
vec_dimension = 300
def get_embedding(x):
    arr  = np.zeros(vec_dimension)
    text = str(x).split()
    for t in text:
        try:
            vec = glove_vectors.get(t).astype(float)
            arr = arr + vec
        except:
            pass
    arr = arr.reshape(1,-1)[0]
    return (arr/len(text))

In [9]:
out_dict = {}
for sen in zip(df["PMID"], df["Abstract"]):
    if sen[1]:
        average_vector = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen[1]))]), axis=0))
        dict = { sen[0] : (average_vector) }
        out_dict.update(dict)

In [None]:
out_dict

{28230844: array([-2.63372241e-01,  2.54100690e-02,  5.15984483e-02, -1.01535310e-01,
         1.29430483e-01,  1.19839017e-01, -1.16698152e-01, -1.78261724e-02,
         3.75707759e-02, -1.47193238e+00,  1.25243883e-01,  1.10571193e-01,
        -2.72796897e-02,  4.16539172e-02,  2.02280793e-01, -9.50255172e-02,
        -1.00703800e-01, -8.81110345e-03, -8.29048276e-02,  8.75359655e-02,
        -4.38403724e-02,  1.29298090e-01,  2.23379310e-02,  1.65557138e-01,
         3.39613793e-02,  8.96747069e-02,  9.28308621e-02,  3.38675138e-02,
        -3.47292414e-02,  6.56852172e-02,  1.17505276e-01,  2.65830100e-01,
        -1.71032345e-01,  1.66558866e-01, -1.47739266e-01,  2.24333172e-01,
         1.23808276e-01,  2.86523307e-02, -5.20578414e-02,  3.13862414e-02,
        -5.51391379e-02,  9.37113317e-02,  3.65857931e-02, -7.38290000e-02,
        -1.02152414e-01,  1.25301724e-01,  8.10065862e-02,  1.21732345e-01,
        -1.05193793e-02,  2.48276414e-02,  1.33180745e-01, -9.80706369e-02,
  

In [10]:
out_dict = {key: value.tolist() for key, value in out_dict.items()}

In [None]:
out_dict

{28230844: [-0.2633722413793103,
  0.025410068965517237,
  0.05159844827586208,
  -0.10153531034482759,
  0.12943048275862068,
  0.11983901724137935,
  -0.11669815172413793,
  -0.017826172413793095,
  0.037570775862068966,
  -1.471932379310345,
  0.1252438827586207,
  0.11057119310344829,
  -0.027279689655172423,
  0.04165391724137932,
  0.2022807931034483,
  -0.09502551724137931,
  -0.10070379999999998,
  -0.008811103448275855,
  -0.08290482758620692,
  0.0875359655172414,
  -0.04384037241379311,
  0.12929808965517242,
  0.02233793103448276,
  0.1655571379310345,
  0.03396137931034481,
  0.0896747068965517,
  0.09283086206896553,
  0.03386751379310344,
  -0.03472924137931033,
  0.0656852172413793,
  0.11750527586206896,
  0.2658301,
  -0.17103234482758622,
  0.1665588655172414,
  -0.14773926551724134,
  0.22433317241379316,
  0.12380827586206898,
  0.028652330689655173,
  -0.05205784137931034,
  0.03138624137931035,
  -0.055139137931034476,
  0.09371133172413791,
  0.03658579310344828

In [None]:
df = pd.read_csv("F://IR/Data1.csv")
df

Unnamed: 0.1,Unnamed: 0,Title,Authors,Abstract,PMID,Journal,Year
0,0,Network neuroscience.,"Bassett DS, Sporns O.",Two parallel trends drive the approach: the av...,28230844,Nat Neurosci.,2017
1,1,The network science of collective intelligence.,Centola D.,"In the last few years, breakthroughs in comput...",36180361,Trends Cogn Sci.,2022
2,2,The Role of Network Science in Glioblastoma.,"Lopes MB, Martins EP, Vinga S, Costa BM.",Network science has long been recognized as a ...,33801334,Cancers (Basel).,2021
3,3,Cognitive Network Science for Understanding On...,Stella M.,Since social media data come from users' minds...,34118113,Top Cogn Sci.,2022
4,4,Network science.,Barabási AL.,Professor Barabasi's talk described how the to...,23419844,Philos Trans A Math Phys Eng Sci.,2013
...,...,...,...,...,...,...,...
95,114,Advances in metabolome information retrieval: ...,"Tebani A, Afonso C, Bekri S.",It particularly deals with inherent advantages...,28842777,J Inherit Metab Dis.,2018
96,116,Information retrieval in medicine: state of th...,"Hersh WR, Greenes RA.",Conventional information retrieval systems usu...,2243546,MD Comput.,1990
97,117,"Ethics, genomics, and information retrieval.",Goodman KW.,The union of genomics and computational inform...,8725773,Comput Biol Med.,1996
98,118,Information retrieval in digital libraries: br...,Schatz BR.,A digital library enables users to interact ef...,8994022,Science.,1997


In [11]:
df.set_index('PMID', inplace=True)
df['Values'] = df.index.map(out_dict)
df.to_csv("F://IR/Data/EMbedded_Data.csv")

In [None]:
df = pd.read_csv("F://IR/Data2.csv")
df['Values'] = df['Values'].apply(np.array)

In [None]:
df

Unnamed: 0.1,PMID,Unnamed: 0,Title,Authors,Abstract,Journal,Year,Values
0,28230844,0,Network neuroscience.,"Bassett DS, Sporns O.",Two parallel trends drive the approach: the av...,Nat Neurosci.,2017,"[-0.2633722413793103, 0.025410068965517237, 0...."
1,36180361,1,The network science of collective intelligence.,Centola D.,"In the last few years, breakthroughs in comput...",Trends Cogn Sci.,2022,"[-0.032688040000000015, 0.15937595999999996, 0..."
2,33801334,2,The Role of Network Science in Glioblastoma.,"Lopes MB, Martins EP, Vinga S, Costa BM.",Network science has long been recognized as a ...,Cancers (Basel).,2021,"[-0.16727454545454548, 0.10191690909090907, 0...."
3,34118113,3,Cognitive Network Science for Understanding On...,Stella M.,Since social media data come from users' minds...,Top Cogn Sci.,2022,"[-0.24739657142857144, 0.02699521785714287, 0...."
4,23419844,4,Network science.,Barabási AL.,Professor Barabasi's talk described how the to...,Philos Trans A Math Phys Eng Sci.,2013,"[-0.17991345, 0.11808837500000001, 0.051023541..."
...,...,...,...,...,...,...,...,...
95,28842777,114,Advances in metabolome information retrieval: ...,"Tebani A, Afonso C, Bekri S.",It particularly deals with inherent advantages...,J Inherit Metab Dis.,2018,"[-0.177324892, 0.061545816, 0.1370696400000000..."
96,2243546,116,Information retrieval in medicine: state of th...,"Hersh WR, Greenes RA.",Conventional information retrieval systems usu...,MD Comput.,1990,"[-0.3326925454545455, -0.102388, 0.15886195454..."
97,8725773,117,"Ethics, genomics, and information retrieval.",Goodman KW.,The union of genomics and computational inform...,Comput Biol Med.,1996,"[-0.2806944736842105, 0.052907231578947354, 0...."
98,8994022,118,Information retrieval in digital libraries: br...,Schatz BR.,A digital library enables users to interact ef...,Science.,1997,"[-0.34927260000000004, 0.07640575999999999, 0...."


In [None]:
def get_sim(query_embedding, average_vector_doc):
    sim = [(1 - scipy.spatial.distance.cosine(query_embedding, 
    average_vector_doc))]
    return sim

In [None]:
def Ranked_documents(query):
    query_word_vectors = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(query.lower())],dtype=float), axis=0))
    rank = []
    for k,v in out_dict.items():
        rank.append((k, get_sim(query_word_vectors, v)))
        rank = sorted(rank,key=lambda t: t[1], reverse=True)
    return rank

In [None]:
import ast
df = pd.read_csv("F://IR/Data2.csv")
df['Values'] = df['Values'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df.set_index('PMID', inplace=True)

In [None]:
out_dict = {}
for index, row in df.iterrows():
    PMID = index
    values = row['Values']
    out_dict[PMID] = values

In [None]:
out_dict

{28230844: [-0.2633722413793103,
  0.025410068965517237,
  0.05159844827586208,
  -0.10153531034482759,
  0.12943048275862068,
  0.11983901724137935,
  -0.11669815172413793,
  -0.017826172413793095,
  0.037570775862068966,
  -1.471932379310345,
  0.1252438827586207,
  0.11057119310344829,
  -0.027279689655172423,
  0.04165391724137932,
  0.2022807931034483,
  -0.09502551724137931,
  -0.10070379999999998,
  -0.008811103448275855,
  -0.08290482758620692,
  0.0875359655172414,
  -0.04384037241379311,
  0.12929808965517242,
  0.02233793103448276,
  0.1655571379310345,
  0.03396137931034481,
  0.0896747068965517,
  0.09283086206896553,
  0.03386751379310344,
  -0.03472924137931033,
  0.0656852172413793,
  0.11750527586206896,
  0.2658301,
  -0.17103234482758622,
  0.1665588655172414,
  -0.14773926551724134,
  0.22433317241379316,
  0.12380827586206898,
  0.028652330689655173,
  -0.05205784137931034,
  0.03138624137931035,
  -0.055139137931034476,
  0.09371133172413791,
  0.03658579310344828

In [None]:
ranks = Ranked_documents("machine learning")

In [None]:
ranks

[(32011262, [0.7284841430005419]),
 (32800297, [0.7272496103673078]),
 (32704420, [0.7271385021321072]),
 (34537858, [0.7157824820475406]),
 (29194052, [0.7124171039210307]),
 (31818379, [0.7029761563074565]),
 (32245523, [0.7005206045917988]),
 (34338485, [0.6948627317284952]),
 (35924105, [0.6801225351403792]),
 (30939301, [0.6699874456711173]),
 (31601480, [0.6643915059991786]),
 (33290932, [0.6573880958180807]),
 (35830864, [0.6557381827038056]),
 (29278737, [0.6490958139545022]),
 (26572668, [0.6415407470268787]),
 (36069404, [0.6324581184033297]),
 (34392886, [0.6244041955756803]),
 (34518686, [0.6116878473808032]),
 (32645448, [0.6071375919774414]),
 (32449232, [0.60453025805439]),
 (37612141, [0.5970054842484803]),
 (28230844, [0.5859940251277622]),
 (12222090, [0.5815598549238582]),
 (35731335, [0.5776070338785501]),
 (25186238, [0.5768714119876043]),
 (9099122, [0.5715011746587141]),
 (34118113, [0.5698537681905262]),
 (36203727, [0.5690253391465109]),
 (30102808, [0.56101223

In [None]:
for i in ranks:
    print(df.loc[i[0]]['Title'])


A Review on Machine Learning for EEG Signal Processing in Bioengineering.
Supervised Machine Learning: A Brief Primer.
Introduction to Machine Learning, Neural Networks, and Deep Learning.
Radiomics, machine learning, and artificial intelligence-what the neuroradiologist needs to know.
Machine learning in heart failure: ready for prime time.
Machine Learning Principles for Radiology Investigators.
Machine learning and clinical epigenetics: a review of challenges for diagnosis and classification.
Machine learning for cardiology.
3D Convolutional Neural Network Framework with Deep Learning for Nuclear Medicine.
Neural network models and deep learning.
Machine Learning and Deep Learning in Medical Imaging: Intelligent Imaging.
Machine learning model for predicting malaria using clinical information.
Machine-designed biotherapeutics: opportunities, feasibility and advantages of deep learning in computational antibody discovery.
Network science in clinical trials: A patient-centered approac