In [1]:
import os
directory = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/extracted_files/1'
files = os.listdir(directory)
files.sort()
first_file = os.path.join(directory, files[0])

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Read PubMed Parquet Files") \
    .getOrCreate()
df = spark.read.parquet(first_file)
df.printSchema()
df.show()
# spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/05 08:08:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- pmid: string (nullable = true)
 |-- secid: long (nullable = true)
 |-- paraid: long (nullable = true)
 |-- sentid: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- has_citations: long (nullable = true)
 |-- citations: string (nullable = true)
 |-- __index_level_0__: long (nullable = true)



                                                                                

+--------+-----+------+------+--------------------+-------------+-----------------+-----------------+
|    pmid|secid|paraid|sentid|            sentence|has_citations|        citations|__index_level_0__|
+--------+-----+------+------+--------------------+-------------+-----------------+-----------------+
|33313032|    0|     0|     0|India has the sec...|            0|                 |                0|
|33313032|    0|     0|     1|More than 77 mill...|            1|         21959957|                1|
|33313032|    0|     0|     2|These people are ...|            0|                 |                2|
|33313032|    0|     0|     3|Unhealthy lifesty...|            0|                 |                3|
|33313032|    0|     0|     4|Physical activity...|            1|         24571915|                4|
|33313032|    0|     0|     5|Similarly, unheal...|            1|27501784,29459786|                5|
|33313032|    0|     0|     6|Screening of peop...|            1|         25748739

In [4]:
df.createOrReplaceTempView("pubmed_data")
query = "SELECT * FROM pubmed_data WHERE has_citations==1"
result = spark.sql(query)
# result.show()
pandas_df = result.toPandas()
pandas_df.head()

                                                                                

Unnamed: 0,pmid,secid,paraid,sentid,sentence,has_citations,citations,__index_level_0__
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,1,21959957,1
1,33313032,0,0,4,Physical activity levels are low among Indians .,1,24571915,4
2,33313032,0,0,5,"Similarly, unhealthy diets, high in fat (espec...",1,2750178429459786,5
3,33313032,0,0,6,Screening of people who are at high-risk of T2...,1,25748739,6
4,33313032,0,1,1,The prevention and management of chronic disea...,1,26649240,8


In [5]:
pandas_df.drop(columns=['__index_level_0__','has_citations'],inplace=True)
len(pandas_df)

97630

In [6]:
from copy import deepcopy
import pandas as pd
updated_data = []

for idx,row in pandas_df.iterrows():
    citations = row['citations'].split(',')
    for citation in citations:
        new_row = deepcopy(row)
        new_row['citations']=citation
        updated_data.append(new_row)
updated_df = pd.DataFrame(updated_data)
updated_df.head()

Unnamed: 0,pmid,secid,paraid,sentid,sentence,citations
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,21959957
1,33313032,0,0,4,Physical activity levels are low among Indians .,24571915
2,33313032,0,0,5,"Similarly, unhealthy diets, high in fat (espec...",27501784
2,33313032,0,0,5,"Similarly, unhealthy diets, high in fat (espec...",29459786
3,33313032,0,0,6,Screening of people who are at high-risk of T2...,25748739


In [7]:
all_curr_pmids = list(set(updated_df['pmid'].values))
len(all_curr_pmids)

4992

In [8]:
# Score 3: Perfectly relevant list  ---- 10%
perfectly_relevant_df = deepcopy(updated_df)
len(perfectly_relevant_df)

153261

In [9]:
# Generate paragraph citations pairs
from collections import defaultdict
paragraph_pairs = defaultdict(lambda: defaultdict(set))
for idx,row in perfectly_relevant_df.iterrows():
    paragraph_pairs[row['pmid']][row['paraid']].add(row['citations'])

In [10]:
paragraph_pairs['33313032']

defaultdict(set,
            {0: {'15521557',
              '21665111',
              '21959957',
              '23082079',
              '24571915',
              '24609605',
              '25748739',
              '27501784',
              '27547170',
              '29207699',
              '29459786'},
             1: {'15521557', '21665111', '21773016', '25458336', '26649240'},
             3: {'18651193', '24636547', '24661723', '29207699', '32895271'},
             2: {'19622551', '24636547', '27062957', '27370357', '31186666'}})

In [12]:
# Score 2: Somewhat relevant list   --- 20%
# Same paragraph different sentence
import random
somewhat_relevant_data = []
for idx,row in perfectly_relevant_df.iterrows():
    row1 = deepcopy(row)
    row2 = deepcopy(row)

    # Do computation
    same_paragraph_citations = list(paragraph_pairs[row['pmid']][row['paraid']])
    
    same_paragraph_citations.remove(row['citations'])


    # Shuffling the list for maximum randomazation
    random.shuffle(same_paragraph_citations)

    # Appending the list based on length
    if len(same_paragraph_citations)>=1:
        row1['citations'] = same_paragraph_citations[0]
        somewhat_relevant_data.append(row1)
        
    if len(same_paragraph_citations)>1:
        row2['citations'] = same_paragraph_citations[1]
        somewhat_relevant_data.append(row2)
    
    

somewhat_relevant_df = pd.DataFrame(somewhat_relevant_data)
somewhat_relevant_df.head()

Unnamed: 0,pmid,secid,paraid,sentid,sentence,citations
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,24609605
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,29207699
1,33313032,0,0,4,Physical activity levels are low among Indians .,15521557
1,33313032,0,0,4,Physical activity levels are low among Indians .,27547170
2,33313032,0,0,5,"Similarly, unhealthy diets, high in fat (espec...",24609605


In [13]:
# Generate other paragraphs citations
other_paragraphs_list = defaultdict(lambda: defaultdict(set))

for citation in paragraph_pairs.keys():
    all_cites = set()
    all_para_ids = paragraph_pairs[citation].keys()
    for para_id in all_para_ids:
        all_cites|= paragraph_pairs[citation][para_id]
    for para_id in all_para_ids:
        diff =  all_cites.difference(paragraph_pairs[citation][para_id])
        other_paragraphs_list[citation][para_id] = diff
        
print(len(other_paragraphs_list))

4992


In [14]:
# Score 1: Slightly relevant list   --- 30%
# Different paragraph
slightly_relevant_data = []
for idx,row in perfectly_relevant_df.iterrows():
    row1 = deepcopy(row)
    row2 = deepcopy(row)
    row3 = deepcopy(row)
    # Do computation
    other_paragraph_citations = list(other_paragraphs_list[row['pmid']][row['paraid']])

    
    # Shuffling the list for maximum randomazation
    random.shuffle(other_paragraph_citations)

    # Appending the list based on length
    if len(other_paragraph_citations)>=1:
        row1['citations'] = other_paragraph_citations[0]
        slightly_relevant_data.append(row1)
        
    if len(other_paragraph_citations)>1:
        row2['citations'] = other_paragraph_citations[1]
        slightly_relevant_data.append(row2)

    if len(other_paragraph_citations)>2:
        row3['citations'] = other_paragraph_citations[2]
        slightly_relevant_data.append(row3)
    

slightly_relevant_df = pd.DataFrame(slightly_relevant_data)
slightly_relevant_df.head()

Unnamed: 0,pmid,secid,paraid,sentid,sentence,citations
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,18651193
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,32895271
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,19622551
1,33313032,0,0,4,Physical activity levels are low among Indians .,24661723
1,33313032,0,0,4,Physical activity levels are low among Indians .,21773016


In [15]:
len(slightly_relevant_df)

447886

In [16]:
# Score 0: Irrelevant list --- 40%
import random
irrelevant_data = []
for idx,row in perfectly_relevant_df.iterrows():
    irrelevant_cites = []
    for _ in range(4):
        while True:
            pmid = random.choice(all_curr_pmids)
            if pmid not in irrelevant_cites:
                break
        irrelevant_cites.append(pmid)
        new_row = deepcopy(row)
        new_row['citations']=pmid
        irrelevant_data.append(new_row)

irrelevant_df = pd.DataFrame(irrelevant_data)
irrelevant_df.head()

Unnamed: 0,pmid,secid,paraid,sentid,sentence,citations
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,24363967
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,28056765
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,24829583
0,33313032,0,0,1,More than 77 million Indians are at high-risk ...,33976755
1,33313032,0,0,4,Physical activity levels are low among Indians .,29849850


In [17]:
len(irrelevant_df)

613044

In [18]:
# Adding relevance_score to all the dataframes
perfectly_relevant_df['relevance_score']= 3
somewhat_relevant_df['relevance_score']= 2
slightly_relevant_df['relevance_score']= 1
irrelevant_df['relevance_score']= 0

In [19]:
final_df_with_relevance_scores = pd.concat([perfectly_relevant_df, somewhat_relevant_df, slightly_relevant_df,irrelevant_df], ignore_index=True)

In [20]:
len(final_df_with_relevance_scores)

1501254

In [22]:
relevance_score_path = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/final_df_with_relevance_scores.parquet'
final_df_with_relevance_scores.to_parquet(relevance_score_path)

In [23]:
from_ids = set(final_df_with_relevance_scores['pmid'].unique())
to_ids = set(final_df_with_relevance_scores['citations'].unique())
total_pmids = list(from_ids|to_ids)
len(total_pmids)

111822

In [27]:
import csv
metadata_pmids_path = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/metadata_pmids.csv'
metadata_pmids_df = pd.DataFrame(total_pmids, columns=['pmids'])
metadata_pmids_df.to_csv(metadata_pmids_path,index=False)