In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
import re
import math
from IPython.display import display

#Importing Pubmed Files
file_list = os.listdir(r"pmc_articles\train_files")
file_list[:10]

['PMC10491276.xml',
 'PMC10705249.xml',
 'PMC10705266.xml',
 'PMC10705274.xml',
 'PMC10705278.xml',
 'PMC10705288.xml',
 'PMC10705364.xml',
 'PMC10705395.xml',
 'PMC10705415.xml',
 'PMC10705441.xml']

In [2]:
def clean_doc(doc):
    #Converting doc to lowercase
    doc = doc.lower()
    
    doc = re.sub(r"\W+|_|\\|/"," ",doc)
    #Removing punctuation
    # doc = "".join([char for char in doc if char not in string.punctuation])
    
    #Creating list of english stopwords
    stop_words = set(stopwords.words('english'))
    
    #Tokenizing the docs into words
    word_tokens = word_tokenize(doc)
    
    #Removing stopwords
    filtered_doc = [w for w in word_tokens if not w in stop_words]
    
    #Joining all words into a string with spaces
    cleaned_doc = ' '.join(filtered_doc)

    #Return cleaned document
    return cleaned_doc
    

In [3]:
# Creating a dictionary output for the XML files to be transformed
dict_output = {"pmcid": [], "journal": [], "title": [], "authors": [], "abstract": [], "body": []}

# Parsing through the XML files using BeautifulSoup to check the tags which have text information stored
for f in file_list:
    with open(r"pmc_articles\train_files\\" + f, "r") as f_in:
        soup = BeautifulSoup(f_in.read(), features = "xml")
        try:
            journal = soup.find("journal-title").get_text()
        except:
            continue
        article = soup.find("article")
        try:
            title = article.find("article-title").get_text().replace("\n", " ")
        except:
             continue

        article_id = article.find_all("article-id")
        pmcid = ""
        for aid in article_id:
            try:
                if aid["pub-id-type"] == "pmc":
                    pmcid = aid.get_text()
            except:
                continue
        
        authors = []
        contrib = article.find("contrib-group")
        if contrib:
            for con in contrib.find_all("name"):
                try:
                    name = con.find("given-names").get_text().replace("\n", " ")
                except:
                    name = ""
                try:
                    surname = con.find("surname").get_text().replace("\n", " ")
                except:
                    surname = ""
                authors.append(name + " " + surname)
        try:
            abstract = article.find("abstract")
            if (abstract.find("p")) is not None:
                abstract = article.find("abstract")
                abstract_list = abstract.find_all("p")
                abstract_text = " ".join([paragraph.get_text().replace("\n", " ") for paragraph in abstract_list])
            else:
                abstract = article.find("abstract").get_text().replace("\n", " ")
        except:
            continue
        
        body_text = ""
        body = article.find("body")
        sec_title = " "
        counter = 1
        
        if body:
                paragraphs = body.find_all("p")
                sec_text = " ".join([paragraph.get_text().replace("\n", " ") for paragraph in paragraphs])
                body_text += sec_text + " "

        dict_output["pmcid"].append(pmcid)
        dict_output["journal"].append(journal)
        dict_output["title"].append(title)
        dict_output["authors"].append(", ".join(authors))
        dict_output["abstract"].append(abstract_text)
        dict_output["body"].append(body_text)

# Storing the output to a dataframe

df = pd.DataFrame.from_dict(dict_output, orient= "index").transpose().sort_values(by = "journal", ignore_index = True)
df

Unnamed: 0,pmcid,journal,title,authors,abstract,body
0,PMC10732281,ACS Sustainable Chemistry & Engineering,Upcycling of Poly(lactic acid) Waste: A Valuab...,"Giovanna Raia, Salvatore Marullo, Giuseppe Laz...",With the aim to investigate new strategies fo...,Plastic polymers represent a class of material...
1,PMC10732696,APL Bioengineering,Altered blood flow due to larger aortic diamet...,"Silje Ekroll Jahren, Caglayan Demirel, Karolin...",The etiology of transcatheter heart valve thro...,The incidence of clinically apparent transcath...
2,PMC524183,Annals of Clinical Microbiology and Antimicrob...,Emphysematous cystitis: An unusual disease of ...,"Ravi K Bobba, Edward L Arsura, Pawanjit S Sarn...",Emphysematous cystitis is a rare disease entit...,"Emphysematous cystitis is an uncommon, but sev..."
3,PMC524182,Annals of Clinical Microbiology and Antimicrob...,Utilization of a ts-sacB selection system for ...,"Vida R Irani, Sun-Hwa Lee, Torsten M Eckstein,...",Mycobacterium avium are ubiquitous environment...,Mycobacterium avium is a frequent cause of dis...
4,PMC544836,Annals of Clinical Microbiology and Antimicrob...,In vitro activity of antiamoebic drugs against...,"Devendra Bansal, Rakesh Sehgal, Yogesh Chawla,...",Amoebiasis is a major public health problem in...,"Entamoeba histolytica, is the etiological agen..."
...,...,...,...,...,...,...
995,PMC10705266,bioRxiv,Tertiary folds of the SL5 RNA from the 5′ prox...,"Rachael C. Kretsch, Lily Xu, Ivan N. Zheludev,...",Coronavirus genomes sequester their start codo...,"In the Coronaviridae family, only seven specie..."
996,PMC10733102,eBioMedicine,Direct therapeutic effect of sulfadoxine-pyrim...,"Seongmin Kim, Arash Naziripour, Pranav Prabhal...",Sulfadoxine-pyrimethamine (SP) antimalarial th...,Research in contextEvidence before this study...
997,PMC10732572,eLife,The neuronal calcium sensor NCS-1 regulates th...,"Daniel Muñoz-Reyes, Levi J McClelland, Sandra ...","The neuronal calcium sensor 1 (NCS-1), an EF-h...",Ca2+ is a key signal that regulates multiple b...
998,PMC10732571,eLife,Regulation of pDC fate determination by histon...,"Yijun Zhang, Tao Wu, Zhimin He, Wenlong Lai, X...","Dendritic cells (DCs), the key antigen-present...",Dendritic cells (DCs) are essential regulators...


In [4]:
# Drop any NA rows
df.replace(to_replace='',value=None, inplace=True)
df.dropna(inplace=True)
df

Unnamed: 0,pmcid,journal,title,authors,abstract,body
0,PMC10732281,ACS Sustainable Chemistry & Engineering,Upcycling of Poly(lactic acid) Waste: A Valuab...,"Giovanna Raia, Salvatore Marullo, Giuseppe Laz...",With the aim to investigate new strategies fo...,Plastic polymers represent a class of material...
1,PMC10732696,APL Bioengineering,Altered blood flow due to larger aortic diamet...,"Silje Ekroll Jahren, Caglayan Demirel, Karolin...",The etiology of transcatheter heart valve thro...,The incidence of clinically apparent transcath...
2,PMC524183,Annals of Clinical Microbiology and Antimicrob...,Emphysematous cystitis: An unusual disease of ...,"Ravi K Bobba, Edward L Arsura, Pawanjit S Sarn...",Emphysematous cystitis is a rare disease entit...,"Emphysematous cystitis is an uncommon, but sev..."
3,PMC524182,Annals of Clinical Microbiology and Antimicrob...,Utilization of a ts-sacB selection system for ...,"Vida R Irani, Sun-Hwa Lee, Torsten M Eckstein,...",Mycobacterium avium are ubiquitous environment...,Mycobacterium avium is a frequent cause of dis...
4,PMC544836,Annals of Clinical Microbiology and Antimicrob...,In vitro activity of antiamoebic drugs against...,"Devendra Bansal, Rakesh Sehgal, Yogesh Chawla,...",Amoebiasis is a major public health problem in...,"Entamoeba histolytica, is the etiological agen..."
...,...,...,...,...,...,...
995,PMC10705266,bioRxiv,Tertiary folds of the SL5 RNA from the 5′ prox...,"Rachael C. Kretsch, Lily Xu, Ivan N. Zheludev,...",Coronavirus genomes sequester their start codo...,"In the Coronaviridae family, only seven specie..."
996,PMC10733102,eBioMedicine,Direct therapeutic effect of sulfadoxine-pyrim...,"Seongmin Kim, Arash Naziripour, Pranav Prabhal...",Sulfadoxine-pyrimethamine (SP) antimalarial th...,Research in contextEvidence before this study...
997,PMC10732572,eLife,The neuronal calcium sensor NCS-1 regulates th...,"Daniel Muñoz-Reyes, Levi J McClelland, Sandra ...","The neuronal calcium sensor 1 (NCS-1), an EF-h...",Ca2+ is a key signal that regulates multiple b...
998,PMC10732571,eLife,Regulation of pDC fate determination by histon...,"Yijun Zhang, Tao Wu, Zhimin He, Wenlong Lai, X...","Dendritic cells (DCs), the key antigen-present...",Dendritic cells (DCs) are essential regulators...


In [53]:
# Importing the test list of PMC articles
test_list = os.listdir(r"pmc_articles\test_files")
test_list

['PMC10895288.xml',
 'PMC10895289.xml',
 'PMC10895290.xml',
 'PMC10926249.xml',
 'PMC10926250.xml',
 'PMC10926251.xml',
 'PMC10994988.xml',
 'PMC10995047.xml',
 'PMC10995050.xml']

In [54]:
import langid
import pandas as pd
from bs4 import BeautifulSoup

# Similart to the train data, we are creating a dictionary output for the XML files to be transformed
test_output = {"pmcid": [], "journal": [], "title": [], "authors": [], "abstract": [], "body": [], "language": []}

# Parsing through the XML files using BeautifulSoup to check the tags which have text information stored
for f in test_list:
  with open(r"pmc_articles\test_files\\" + f, "r") as f_in:
    soup = BeautifulSoup(f_in.read(), features="xml")
    try:
      journal = soup.find("journal-title").get_text()
    except:
      continue
    article = soup.find("article")
    try:
      title = article.find("article-title").get_text().replace("\n", " ")
    except:
      continue

    article_id = article.find_all("article-id")
    pmcid = ""
    for aid in article_id:
      try:
        if aid["pub-id-type"] == "pmc":
          pmcid = aid.get_text()
      except:
        continue
    
    authors = []
    contrib = article.find("contrib-group")
    if contrib:
      for con in contrib.find_all("name"):
        try:
          name = con.find("given-names").get_text().replace("\n", " ")
        except:
          name = ""
        try:
          surname = con.find("surname").get_text().replace("\n", " ")
        except:
          surname = ""
        authors.append(name + " " + surname)
    try:
      abstract = article.find("abstract")
      if abstract.find("p") is not None:
        abstract_list = abstract.find_all("p")
        abstract_text = " ".join([paragraph.get_text().replace("\n", " ") for paragraph in abstract_list])
      else:
        abstract_text = article.find("abstract").get_text().replace("\n", " ")
    except:
      continue
    
    body_text = ""
    body = article.find("body")
    sec_title = " "
    counter = 1
    
    if body:
        
        # for sec in body.find_all("p"):
            # try:
            #   sec_title = sec.find("title").get_text().replace("\n", " ")
            #   sec_title_gen = sec_title
            #   counter = 1
            # except:
            #   sec_title_gen = sec_title + " " + str(counter)
            #   counter += 1
            paragraphs = body.find_all("p")
            sec_text = " ".join([paragraph.get_text().replace("\n", " ") for paragraph in paragraphs])
            body_text += sec_text + " "
    
    # Use TextBlob for language detection
    language = langid.classify(body_text)[0] if body_text else None

    test_output["pmcid"].append(pmcid)
    test_output["journal"].append(journal)
    test_output["title"].append(title)
    test_output["authors"].append(", ".join(authors))
    test_output["abstract"].append(abstract_text)
    test_output["body"].append(body_text)
    test_output["language"].append(language)

#Storing them in a test dataframe
test_df = pd.DataFrame.from_dict(test_output, orient= "index").transpose().sort_values(by = "pmcid", ignore_index = True)
test_df

Unnamed: 0,pmcid,journal,title,authors,abstract,body,language
0,PMC10895288,Chinese Journal of Lung Cancer,BRAF突变在非小细胞肺癌中的研究进展,"Libian DENG, Yaxian YANG, Jian HUANG",鼠类肉瘤病毒癌基因同源物（V-Raf murine sarcoma viral oncoge...,肺癌是导致癌症相关死亡的主要原因，占全球癌症死亡人数的1/3左右。在过去大约30年间，我国肺...,zh
1,PMC10895289,Chinese Journal of Lung Cancer,粒细胞样髓源性抑制细胞在非小细胞肺癌中的研究进展,"Chaodan YANG, Rui ZHU, Yuting ZHANG, Lisha YIN...",粒细胞样髓源性抑制细胞（granulocytic myeloid-derived suppr...,肺癌是一种起源于肺部气管、支气管黏膜或腺体的恶性肿瘤。美国临床肿瘤杂志发布的2020全球癌症...,zh
2,PMC10895290,Chinese Journal of Lung Cancer,21色流式检测人非小细胞肺癌组织中免疫细胞亚群方案的建立,"Tingting GUO, Hongguan XIE",背景与目的 肺癌组织的免疫微环境已成为关注的重点，随着多色流式的兴起，流式检测肺癌免疫微环境...,肺癌是全球发病率极高的癌症之一，也是癌症相关死亡的主要原因，每年约2万新发病例和1.76万死...,zh
3,PMC10926249,Problems of Endocrinology,Эктопическая акромегалия вследствие нейроэндок...,"Е. О. Мамедова, Е. Г. Пржиялковская, С. А. Бур...","Акромегалия — нейроэндокринное заболевание, во...",Акромегалия — тяжелое хроническое нейроэндокри...,ru
4,PMC10926250,Problems of Endocrinology,Фиксированные комбинации агонистов рецепторов ...,"Д. В. Куркин, Д. А. Бакулин, Е. И. Морковин, А...",Прогрессирующее течение сахарного диабета 2 ти...,Естественное течение сахарного диабета 2 типа ...,ru
5,PMC10926251,Problems of Endocrinology,Сравнительный анализ костных осложнений при МЭ...,"С. В. Пылина, А. К. Еремкина, А. Р. Елфимова, ...",ОБОСНОВАНИЕ. Синдром множественной эндокринной...,Синдром множественных эндокринных неоплазий 1 ...,ru
6,PMC10994988,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Wie sichern wir in Zukunft die Versorgung von ...,"Marcel Romanos, Gundolf Berg, Annegret Brauer,...",Kinder und Jugendliche mit psychischen Störung...,Psychische Erkrankungen beginnen früh in Kindh...,de
7,PMC10995047,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Substanzkonsum und Nutzung von sozialen Medien...,"Kirsten Lochbühler, Monika Rossa, Christopher ...",Das Ziel der vorliegenden Arbeit war die Erfas...,Die Begrenzung von Schäden durch Substanzkonsu...,de
8,PMC10995050,"Bundesgesundheitsblatt, Gesundheitsforschung, ...","Rechtliche, fachliche und strukturelle Aspekte...","Timo D. Vloet, Julia Geißler, Regina Taurines,...",In den letzten Jahren ist der Anteil der Notfa...,Der Suizid ist – nach Unfällen – die zweithäuf...,de


In [277]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Load pre-trained DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
device = "cpu"

def create_vector(text):
    # Encode the text using tokenizer
    text = clean_doc(text)
    tokenized_text = tokenizer(text, return_tensors='pt', max_length=512)
    input_ids = tokenized_text['input_ids'].to(device)
    attention_mask = tokenized_text['attention_mask'].to(device)

    # Split the input text into smaller segments if it exceeds 512 tokens
    segment_length = 512
    num_segments = (len(input_ids[0]) - 1) // segment_length + 1

    segment_embeddings = []
    for i in range(num_segments):
        start_idx = i * segment_length
        end_idx = min((i + 1) * segment_length, len(input_ids[0]))

        segment_input_ids = input_ids[:, start_idx:end_idx]
        segment_attention_mask = attention_mask[:, start_idx:end_idx]

        # Compute embeddings for the segment
        with torch.no_grad():
            outputs = model(segment_input_ids, attention_mask=segment_attention_mask)

        # Extract hidden states for the [CLS] token
        cls_embedding = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        segment_embeddings.append(cls_embedding)

    # Concatenate embeddings from all segments
    concatenated_embeddings = np.concatenate(segment_embeddings, axis=0)

    return concatenated_embeddings, max_length=512, padding='max_length', truncation=True)
    input_ids = tokenized_text['input_ids'].to(device)
    attention_mask = tokenized_text['attention_mask'].to(device)

    # Compute embeddings for the text
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Extract hidden states for the [CLS] token
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

    return cls_embedding


In [278]:
%%time
from transformers import MarianTokenizer, MarianMTModel

# Load MarianMT model and tokenizer for translating to English
def translate_to_english(row):
    # Load MarianMT model and tokenizer for translating to English
    lang = row['language']
    model_name = 'Helsinki-NLP/opus-mt-'+lang+'-en'  
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    combined_text = row['title'] + ' ' + row['abstract'] + ' ' + row['body']
    
    # Tokenize the combined_text into chunks of maximum length 512
    token_chunks = []
    for i in range(0, len(combined_text), 512):
        chunk = combined_text[i:i+512]
        token_chunks.append(chunk)
    
    # Generate translations for each token chunk
    translated_chunks = []
    for chunk in token_chunks:
        # Tokenize the chunk
        inputs = tokenizer(chunk, return_tensors='pt')
        
        # Generate translation
        translated = model.generate(**inputs)
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        translated_chunks.append(translated_text)
    
    # Combine translated chunks into a single translated text
    translated_text = ' '.join(translated_chunks)
    translated_text = clean_doc(translated_text)
    return translated_text


# Apply the translation function to each row of the dataframe and store the results in a new column
test_df['english_articles'] = test_df.apply(translate_to_english, axis=1)

test_df

CPU times: total: 4min 26s
Wall time: 1min 15s


Unnamed: 0,pmcid,journal,title,authors,abstract,body,language,english_articles,vectors,similar_article_id,similarity_score
0,PMC10895288,Chinese Journal of Lung Cancer,BRAF突变在非小细胞肺癌中的研究进展,"Libian DENG, Yaxian YANG, Jian HUANG",鼠类肉瘤病毒癌基因同源物（V-Raf murine sarcoma viral oncoge...,肺癌是导致癌症相关死亡的主要原因，占全球癌症死亡人数的1/3左右。在过去大约30年间，我国肺...,zh,BRAF mutagenic research in non-small cell lung...,"[[-0.3473824, 0.1278285, 0.023857947, -0.16154...",PMC514533,0.148161
1,PMC10895289,Chinese Journal of Lung Cancer,粒细胞样髓源性抑制细胞在非小细胞肺癌中的研究进展,"Chaodan YANG, Rui ZHU, Yuting ZHANG, Lisha YIN...",粒细胞样髓源性抑制细胞（granulocytic myeloid-derived suppr...,肺癌是一种起源于肺部气管、支气管黏膜或腺体的恶性肿瘤。美国临床肿瘤杂志发布的2020全球癌症...,zh,Research progress in the non-small cell lung c...,"[[-0.33013338, 0.13848288, -0.015225947, -0.14...",PMC544962,0.132682
2,PMC10895290,Chinese Journal of Lung Cancer,21色流式检测人非小细胞肺癌组织中免疫细胞亚群方案的建立,"Tingting GUO, Hongguan XIE",背景与目的 肺癌组织的免疫微环境已成为关注的重点，随着多色流式的兴起，流式检测肺癌免疫微环境...,肺癌是全球发病率极高的癌症之一，也是癌症相关死亡的主要原因，每年约2万新发病例和1.76万死...,zh,The research established a reliable 21-colour ...,"[[-0.4499666, -0.12148322, -0.06703161, -0.267...",PMC514533,0.132426
3,PMC10926249,Problems of Endocrinology,Эктопическая акромегалия вследствие нейроэндок...,"Е. О. Мамедова, Е. Г. Пржиялковская, С. А. Бур...","Акромегалия — нейроэндокринное заболевание, во...",Акромегалия — тяжелое хроническое нейроэндокри...,ru,The first description of three clinical cases ...,"[[-0.4695993, 0.111304544, -0.17800906, -0.228...",PMC514533,0.158091
4,PMC10926250,Problems of Endocrinology,Фиксированные комбинации агонистов рецепторов ...,"Д. В. Куркин, Д. А. Бакулин, Е. И. Морковин, А...",Прогрессирующее течение сахарного диабета 2 ти...,Естественное течение сахарного диабета 2 типа ...,ru,Fixed combinations of GPP-1 receptor agonists ...,"[[-0.3642939, 0.11431462, 0.024415087, -0.0946...",PMC514533,0.115393
5,PMC10926251,Problems of Endocrinology,Сравнительный анализ костных осложнений при МЭ...,"С. В. Пылина, А. К. Еремкина, А. Р. Елфимова, ...",ОБОСНОВАНИЕ. Синдром множественной эндокринной...,Синдром множественных эндокринных неоплазий 1 ...,ru,Comparative analysis of bone complications in ...,"[[-0.42153862, 0.08043476, -0.14690945, -0.096...",PMC538275,0.132226
6,PMC10994988,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Wie sichern wir in Zukunft die Versorgung von ...,"Marcel Romanos, Gundolf Berg, Annegret Brauer,...",Kinder und Jugendliche mit psychischen Störung...,Psychische Erkrankungen beginnen früh in Kindh...,de,"In the future, how do we ensure that children ...","[[-0.38105622, 0.15715934, 0.1829635, 0.099352...",PMC535542,0.150157
7,PMC10995047,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Substanzkonsum und Nutzung von sozialen Medien...,"Kirsten Lochbühler, Monika Rossa, Christopher ...",Das Ziel der vorliegenden Arbeit war die Erfas...,Die Begrenzung von Schäden durch Substanzkonsu...,de,"Substance consumption and use of social media,...","[[-0.22348407, 0.33980572, 0.01982236, -0.0741...",PMC535542,0.141834
8,PMC10995050,"Bundesgesundheitsblatt, Gesundheitsforschung, ...","Rechtliche, fachliche und strukturelle Aspekte...","Timo D. Vloet, Julia Geißler, Regina Taurines,...",In den letzten Jahren ist der Anteil der Notfa...,Der Suizid ist – nach Unfällen – die zweithäuf...,de,"Legal, technical and structural aspects of mod...","[[-0.37096193, 0.1889037, 0.019168876, -0.1355...",PMC535542,0.150699


In [279]:
%%time
# Define the function to convert text into vectors
def text_to_vector(row):
    combined_text = row['title'] + ' ' + row['abstract'] + ' ' + row['body']
    vector = create_vector(combined_text)
    return vector
    
# Apply the function to each row of the dataframe and store the results in a new column
df['vectors'] = df.apply(text_to_vector, axis=1)

df

CPU times: total: 33min 33s
Wall time: 8min 28s


Unnamed: 0,pmcid,journal,title,authors,abstract,body,vectors
0,PMC10732281,ACS Sustainable Chemistry & Engineering,Upcycling of Poly(lactic acid) Waste: A Valuab...,"Giovanna Raia, Salvatore Marullo, Giuseppe Laz...",With the aim to investigate new strategies fo...,Plastic polymers represent a class of material...,"[[-0.43718785, 0.071802214, -0.05778627, -0.02..."
1,PMC10732696,APL Bioengineering,Altered blood flow due to larger aortic diamet...,"Silje Ekroll Jahren, Caglayan Demirel, Karolin...",The etiology of transcatheter heart valve thro...,The incidence of clinically apparent transcath...,"[[-0.5482247, -0.033123694, -0.019976169, -0.0..."
2,PMC524183,Annals of Clinical Microbiology and Antimicrob...,Emphysematous cystitis: An unusual disease of ...,"Ravi K Bobba, Edward L Arsura, Pawanjit S Sarn...",Emphysematous cystitis is a rare disease entit...,"Emphysematous cystitis is an uncommon, but sev...","[[-0.50195366, 0.09536925, -0.12502493, -0.130..."
3,PMC524182,Annals of Clinical Microbiology and Antimicrob...,Utilization of a ts-sacB selection system for ...,"Vida R Irani, Sun-Hwa Lee, Torsten M Eckstein,...",Mycobacterium avium are ubiquitous environment...,Mycobacterium avium is a frequent cause of dis...,"[[-0.34958497, 0.18755327, -0.053207412, -0.13..."
4,PMC544836,Annals of Clinical Microbiology and Antimicrob...,In vitro activity of antiamoebic drugs against...,"Devendra Bansal, Rakesh Sehgal, Yogesh Chawla,...",Amoebiasis is a major public health problem in...,"Entamoeba histolytica, is the etiological agen...","[[-0.34221965, 0.11730466, -0.067334265, -0.02..."
...,...,...,...,...,...,...,...
995,PMC10705266,bioRxiv,Tertiary folds of the SL5 RNA from the 5′ prox...,"Rachael C. Kretsch, Lily Xu, Ivan N. Zheludev,...",Coronavirus genomes sequester their start codo...,"In the Coronaviridae family, only seven specie...","[[-0.5046941, 0.016345115, -0.2002834, -0.1471..."
996,PMC10733102,eBioMedicine,Direct therapeutic effect of sulfadoxine-pyrim...,"Seongmin Kim, Arash Naziripour, Pranav Prabhal...",Sulfadoxine-pyrimethamine (SP) antimalarial th...,Research in contextEvidence before this study...,"[[-0.39765096, 0.1399015, -0.07467778, -0.0701..."
997,PMC10732572,eLife,The neuronal calcium sensor NCS-1 regulates th...,"Daniel Muñoz-Reyes, Levi J McClelland, Sandra ...","The neuronal calcium sensor 1 (NCS-1), an EF-h...",Ca2+ is a key signal that regulates multiple b...,"[[-0.5359607, 0.13719627, 0.08093366, -0.11788..."
998,PMC10732571,eLife,Regulation of pDC fate determination by histon...,"Yijun Zhang, Tao Wu, Zhimin He, Wenlong Lai, X...","Dendritic cells (DCs), the key antigen-present...",Dendritic cells (DCs) are essential regulators...,"[[-0.400486, 0.17068304, -0.1827298, -0.057402..."


In [280]:
%%time
# Define the function to convert translated text into vectors
def translated_text_to_vector(row):
    vector = create_vector(row['english_articles'])
    return vector

# Apply the function to each row of the dataframe and store the results in a new column
test_df['vectors'] = test_df.apply(translated_text_to_vector, axis=1)

test_df

CPU times: total: 2.34 s
Wall time: 638 ms


Unnamed: 0,pmcid,journal,title,authors,abstract,body,language,english_articles,vectors,similar_article_id,similarity_score
0,PMC10895288,Chinese Journal of Lung Cancer,BRAF突变在非小细胞肺癌中的研究进展,"Libian DENG, Yaxian YANG, Jian HUANG",鼠类肉瘤病毒癌基因同源物（V-Raf murine sarcoma viral oncoge...,肺癌是导致癌症相关死亡的主要原因，占全球癌症死亡人数的1/3左右。在过去大约30年间，我国肺...,zh,BRAF mutagenic research in non-small cell lung...,"[[-0.35862926, 0.053985305, -0.14855488, -0.11...",PMC514533,0.148161
1,PMC10895289,Chinese Journal of Lung Cancer,粒细胞样髓源性抑制细胞在非小细胞肺癌中的研究进展,"Chaodan YANG, Rui ZHU, Yuting ZHANG, Lisha YIN...",粒细胞样髓源性抑制细胞（granulocytic myeloid-derived suppr...,肺癌是一种起源于肺部气管、支气管黏膜或腺体的恶性肿瘤。美国临床肿瘤杂志发布的2020全球癌症...,zh,Research progress in the non-small cell lung c...,"[[-0.433042, 0.024677152, -0.19129722, -0.0738...",PMC544962,0.132682
2,PMC10895290,Chinese Journal of Lung Cancer,21色流式检测人非小细胞肺癌组织中免疫细胞亚群方案的建立,"Tingting GUO, Hongguan XIE",背景与目的 肺癌组织的免疫微环境已成为关注的重点，随着多色流式的兴起，流式检测肺癌免疫微环境...,肺癌是全球发病率极高的癌症之一，也是癌症相关死亡的主要原因，每年约2万新发病例和1.76万死...,zh,The research established a reliable 21-colour ...,"[[-0.18076868, -0.0912368, -0.076669395, -0.09...",PMC514533,0.132426
3,PMC10926249,Problems of Endocrinology,Эктопическая акромегалия вследствие нейроэндок...,"Е. О. Мамедова, Е. Г. Пржиялковская, С. А. Бур...","Акромегалия — нейроэндокринное заболевание, во...",Акромегалия — тяжелое хроническое нейроэндокри...,ru,The first description of three clinical cases ...,"[[-0.35508454, 0.06880237, -0.23185648, -0.129...",PMC514533,0.158091
4,PMC10926250,Problems of Endocrinology,Фиксированные комбинации агонистов рецепторов ...,"Д. В. Куркин, Д. А. Бакулин, Е. И. Морковин, А...",Прогрессирующее течение сахарного диабета 2 ти...,Естественное течение сахарного диабета 2 типа ...,ru,Fixed combinations of GPP-1 receptor agonists ...,"[[-0.2642325, 0.10828398, 0.14684564, -0.06863...",PMC514533,0.115393
5,PMC10926251,Problems of Endocrinology,Сравнительный анализ костных осложнений при МЭ...,"С. В. Пылина, А. К. Еремкина, А. Р. Елфимова, ...",ОБОСНОВАНИЕ. Синдром множественной эндокринной...,Синдром множественных эндокринных неоплазий 1 ...,ru,Comparative analysis of bone complications in ...,"[[-0.3260308, -0.07061463, -0.26915473, 0.0364...",PMC538275,0.132226
6,PMC10994988,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Wie sichern wir in Zukunft die Versorgung von ...,"Marcel Romanos, Gundolf Berg, Annegret Brauer,...",Kinder und Jugendliche mit psychischen Störung...,Psychische Erkrankungen beginnen früh in Kindh...,de,"In the future, how do we ensure that children ...","[[-0.056195956, 0.1804344, -0.0030824766, 0.09...",PMC535542,0.150157
7,PMC10995047,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Substanzkonsum und Nutzung von sozialen Medien...,"Kirsten Lochbühler, Monika Rossa, Christopher ...",Das Ziel der vorliegenden Arbeit war die Erfas...,Die Begrenzung von Schäden durch Substanzkonsu...,de,"Substance consumption and use of social media,...","[[-0.14035097, 0.05508223, 0.056929976, 0.0854...",PMC535542,0.141834
8,PMC10995050,"Bundesgesundheitsblatt, Gesundheitsforschung, ...","Rechtliche, fachliche und strukturelle Aspekte...","Timo D. Vloet, Julia Geißler, Regina Taurines,...",In den letzten Jahren ist der Anteil der Notfa...,Der Suizid ist – nach Unfällen – die zweithäuf...,de,"Legal, technical and structural aspects of mod...","[[-0.10623726, 0.07956949, -0.02273462, -0.068...",PMC535542,0.150699


In [281]:
import pandas as pd
from scipy.spatial.distance import cosine

# Using scipy Cosine to calculate similarity
def similarity_test(row):
    most_similar_article_index = None
    highest_similarity_score = -(math.inf)
    
    # Get the translated vector
    translated_vector = row['vectors'].squeeze() 
    # translated_vector = translated_vector / np.linalg.norm(translated_vector)
    
    # Iterate over each row in the English DataFrame
    for english_index, english_row in df.iterrows():
        # Get the English language vector
        english_vector = english_row['vectors'].squeeze()  
        # english_vector = english_vector / np.linalg.norm(english_vector)
        
        # Compute cosine similarity between translated vector and English language vector
        similarity = cosine(translated_vector, english_vector)
        
        # Update most similar article if similarity score is higher
        if similarity > highest_similarity_score:
            highest_similarity_score = similarity
            most_similar_article_index = english_index
    
    # Get the most similar original English article
    most_similar_article_id = df.loc[most_similar_article_index, 'pmcid']

    return most_similar_article_id, highest_similarity_score

In [282]:
%%time
# Store similarity score and article ID in the test dataframe
test_df[['similar_article_id','similarity_score']]  = test_df.apply(similarity_test, axis=1, result_type='expand')
test_df

CPU times: total: 1.09 s
Wall time: 622 ms


Unnamed: 0,pmcid,journal,title,authors,abstract,body,language,english_articles,vectors,similar_article_id,similarity_score
0,PMC10895288,Chinese Journal of Lung Cancer,BRAF突变在非小细胞肺癌中的研究进展,"Libian DENG, Yaxian YANG, Jian HUANG",鼠类肉瘤病毒癌基因同源物（V-Raf murine sarcoma viral oncoge...,肺癌是导致癌症相关死亡的主要原因，占全球癌症死亡人数的1/3左右。在过去大约30年间，我国肺...,zh,BRAF mutagenic research in non-small cell lung...,"[[-0.35862926, 0.053985305, -0.14855488, -0.11...",PMC514533,0.196557
1,PMC10895289,Chinese Journal of Lung Cancer,粒细胞样髓源性抑制细胞在非小细胞肺癌中的研究进展,"Chaodan YANG, Rui ZHU, Yuting ZHANG, Lisha YIN...",粒细胞样髓源性抑制细胞（granulocytic myeloid-derived suppr...,肺癌是一种起源于肺部气管、支气管黏膜或腺体的恶性肿瘤。美国临床肿瘤杂志发布的2020全球癌症...,zh,Research progress in the non-small cell lung c...,"[[-0.433042, 0.024677152, -0.19129722, -0.0738...",PMC10732419,0.226183
2,PMC10895290,Chinese Journal of Lung Cancer,21色流式检测人非小细胞肺癌组织中免疫细胞亚群方案的建立,"Tingting GUO, Hongguan XIE",背景与目的 肺癌组织的免疫微环境已成为关注的重点，随着多色流式的兴起，流式检测肺癌免疫微环境...,肺癌是全球发病率极高的癌症之一，也是癌症相关死亡的主要原因，每年约2万新发病例和1.76万死...,zh,The research established a reliable 21-colour ...,"[[-0.18076868, -0.0912368, -0.076669395, -0.09...",PMC544960,0.172596
3,PMC10926249,Problems of Endocrinology,Эктопическая акромегалия вследствие нейроэндок...,"Е. О. Мамедова, Е. Г. Пржиялковская, С. А. Бур...","Акромегалия — нейроэндокринное заболевание, во...",Акромегалия — тяжелое хроническое нейроэндокри...,ru,The first description of three clinical cases ...,"[[-0.35508454, 0.06880237, -0.23185648, -0.129...",PMC544960,0.238816
4,PMC10926250,Problems of Endocrinology,Фиксированные комбинации агонистов рецепторов ...,"Д. В. Куркин, Д. А. Бакулин, Е. И. Морковин, А...",Прогрессирующее течение сахарного диабета 2 ти...,Естественное течение сахарного диабета 2 типа ...,ru,Fixed combinations of GPP-1 receptor agonists ...,"[[-0.2642325, 0.10828398, 0.14684564, -0.06863...",PMC514533,0.195528
5,PMC10926251,Problems of Endocrinology,Сравнительный анализ костных осложнений при МЭ...,"С. В. Пылина, А. К. Еремкина, А. Р. Елфимова, ...",ОБОСНОВАНИЕ. Синдром множественной эндокринной...,Синдром множественных эндокринных неоплазий 1 ...,ru,Comparative analysis of bone complications in ...,"[[-0.3260308, -0.07061463, -0.26915473, 0.0364...",PMC514533,0.205086
6,PMC10994988,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Wie sichern wir in Zukunft die Versorgung von ...,"Marcel Romanos, Gundolf Berg, Annegret Brauer,...",Kinder und Jugendliche mit psychischen Störung...,Psychische Erkrankungen beginnen früh in Kindh...,de,"In the future, how do we ensure that children ...","[[-0.056195956, 0.1804344, -0.0030824766, 0.09...",PMC535542,0.211153
7,PMC10995047,"Bundesgesundheitsblatt, Gesundheitsforschung, ...",Substanzkonsum und Nutzung von sozialen Medien...,"Kirsten Lochbühler, Monika Rossa, Christopher ...",Das Ziel der vorliegenden Arbeit war die Erfas...,Die Begrenzung von Schäden durch Substanzkonsu...,de,"Substance consumption and use of social media,...","[[-0.14035097, 0.05508223, 0.056929976, 0.0854...",PMC514533,0.180377
8,PMC10995050,"Bundesgesundheitsblatt, Gesundheitsforschung, ...","Rechtliche, fachliche und strukturelle Aspekte...","Timo D. Vloet, Julia Geißler, Regina Taurines,...",In den letzten Jahren ist der Anteil der Notfa...,Der Suizid ist – nach Unfällen – die zweithäuf...,de,"Legal, technical and structural aspects of mod...","[[-0.10623726, 0.07956949, -0.02273462, -0.068...",PMC535542,0.216127


**Conclusion**
- We can see that most texts have a similarity score of around 20, indicating that this level of similarity is expected in all articles
- While distilBERT is a good starter model, we can use Google AI's Transformer library or even roBERTa by HuggingFace to further improve the computational power of this project