### Combine pubmed and metadata file

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .config('spark.driver.memory', '50g')\
    .config('spark.executor.memory', '20g')\
    .config('spark.local.dir', '/home/ubuntu/tmp') \
    .config('spark.driver.maxResultSize', '100g')\
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024") \
    .config("spark.sql.parquet.enableVectorizedReader", "true") \
    .config('spark.driver.extraJavaOptions', '-Djava.io.tmpdir=/home/ubuntu/tmp') \
    .config('spark.executor.extraJavaOptions', '-Djava.io.tmpdir=/home/ubuntu/tmp') \
    .config('hive.exec.scratchdir', '/home/ubuntu/tmp/hive') \
    .enableHiveSupport() \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/15 16:03:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/15 16:03:32 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [2]:
from pyspark.sql.types import IntegerType, StringType
# Input the relevant metada file path here
metadata_path = "final_metadata_combined.parquet"
metadata = spark.read.parquet(metadata_path)
metadata = metadata.withColumn("PubMedID", metadata["PubMedID"].cast(StringType()))
# metadata.to_parquet('final_metadata_combined_modified.parquet')
metadata.count()

                                                                                

92603

In [3]:
relevance_score_path = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/final_df_with_relevance_scores.parquet'
relevance_score = spark.read.parquet(relevance_score_path)
relevance_score.count()

1501254

In [4]:
metadata.first()

Row(PubMedID='12414282', doi='https://doi.org/10.1006/nimg.2002.1267', title='Dysmyelination Revealed through MRI as Increased Radial (but Unchanged Axial) Diffusion of Water', abstract_inverted_index='Myelin loss and axonal damage are both observed in white matter injuries. Each may have significant impact on the long-term disability of patients. Currently, there does not exist a noninvasive biological marker that enables differentiation between myelin and axonal injury. We describe herein the use of magnetic resonance diffusion tensor imaging (DTI) to quantify the effect of dysmyelination on water directional diffusivities in brains of shiverer mice in vivo. The principal diffusion eigenvalues of eight axonal fiber tracts that can be identified with certainty on DTI maps were measured. The water diffusivity perpendicular to axonal fiber tracts, lambda(perpendicular), was significantly higher in shiverer mice compared with age-matched controls, reflecting the lack of myelin and the in

In [5]:
relevance_score.first()

                                                                                

Row(pmid='33313032', secid=0, paraid=0, sentid=1, sentence='More than 77 million Indians are at high-risk of T2DM—their blood glucose levels are higher than normal but lower than the established threshold for T2DM itself .', citations='21959957', relevance_score=3)

In [6]:
combined_df = relevance_score.join(metadata, relevance_score.pmid == metadata.PubMedID, how='left')

In [7]:
combined_df.first()

                                                                                

Row(pmid='33313032', secid=0, paraid=0, sentid=1, sentence='More than 77 million Indians are at high-risk of T2DM—their blood glucose levels are higher than normal but lower than the established threshold for T2DM itself .', citations='21959957', relevance_score=3, PubMedID='33313032', doi='https://doi.org/10.3389/fpubh.2020.548674', title='Development of a Yoga Program for Type-2 Diabetes Prevention (YOGA-DP) Among High-Risk People in India', abstract_inverted_index='Introduction: Many Indians are at high-risk of type-2 diabetes mellitus (T2DM). Yoga is an ancient Indian mind-body discipline, that has been associated with improved glucose levels and can help to prevent T2DM. The study aimed to systematically develop a Yoga program for T2DM prevention (YOGA-DP) among high-risk people in India using a complex intervention development approach. Materials and Methods: As part of the intervention, we developed a booklet and a high-definition video for participants and a manual for YOGA-DP 

In [10]:
combined_df.columns
combined_df = combined_df.withColumnRenamed('PubMedID', 'q_pmid') \
                         .withColumnRenamed('doi', 'q_doi') \
                         .withColumnRenamed('title', 'q_title') \
                         .withColumnRenamed('abstract_inverted_index', 'q_abstract') \
                         .withColumnRenamed('publication_year', 'q_publication_year') \
                         .withColumnRenamed('cited_by_count', 'q_cited_by_count')

In [11]:
combined_df.columns

['pmid',
 'secid',
 'paraid',
 'sentid',
 'sentence',
 'citations',
 'relevance_score',
 'q_pmid',
 'q_doi',
 'q_title',
 'q_abstract',
 'q_publication_year',
 'q_cited_by_count']

In [12]:
combined_df_with_cited_data = combined_df.join(metadata, combined_df.citations == metadata.PubMedID, how='left')

In [14]:
combined_df_with_cited_data.first()

                                                                                

Row(pmid='33313032', secid=0, paraid=0, sentid=1, sentence='More than 77 million Indians are at high-risk of T2DM—their blood glucose levels are higher than normal but lower than the established threshold for T2DM itself .', citations='21959957', relevance_score=3, q_pmid='33313032', q_doi='https://doi.org/10.3389/fpubh.2020.548674', q_title='Development of a Yoga Program for Type-2 Diabetes Prevention (YOGA-DP) Among High-Risk People in India', q_abstract='Introduction: Many Indians are at high-risk of type-2 diabetes mellitus (T2DM). Yoga is an ancient Indian mind-body discipline, that has been associated with improved glucose levels and can help to prevent T2DM. The study aimed to systematically develop a Yoga program for T2DM prevention (YOGA-DP) among high-risk people in India using a complex intervention development approach. Materials and Methods: As part of the intervention, we developed a booklet and a high-definition video for participants and a manual for YOGA-DP instructors

In [17]:
combined_df_with_cited_data.columns
combined_df_with_cited_data = combined_df_with_cited_data.withColumnRenamed('PubMedID', 'c_pmid') \
                                                         .withColumnRenamed('doi', 'c_doi') \
                                                         .withColumnRenamed('title', 'c_title') \
                                                         .withColumnRenamed('abstract_inverted_index', 'c_abstract') \
                                                         .withColumnRenamed('publication_year', 'c_publication_year') \
                                                         .withColumnRenamed('cited_by_count', 'c_cited_by_count')

In [19]:
combined_df_with_cited_data.count()

1501454

In [20]:
combined_df_with_q_c_metadata = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/combined_pubmed.parquet'
combined_df_with_cited_data.write.parquet(combined_df_with_q_c_metadata)

                                                                                

In [21]:
combined_df_with_q_c_metadata = '/home/ubuntu/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/combined_pubmed.parquet'
combined_df_read = spark.read.parquet(combined_df_with_q_c_metadata)
combined_df_read.count()

1501454

In [25]:
entry = combined_df_read.first()
entry.q_abstract

'Abstract Proteins, made up of either single or multiple chains, are designed to carry out specific biological functions. We found an interesting example of a two-chain protein where administration of one of its chains leads to a diametrically opposite outcome than that reported for the full-length protein. Clusterin is a highly glycosylated protein consisting of two chains, α- and β-clusterin. We have investigated the conformational features, cellular localization, lipid accumulation, in vivo effects and histological changes upon administration of recombinant individual chains of clusterin. We demonstrate that recombinant α- and β-chains exhibit structural and functional differences and differ in their sub-cellular localization. Full-length clusterin is known to lower lipid levels. In contrast, we find that β-chain-treated cells accumulate 2-fold more lipid than controls. Interestingly, α-chain-treated cells do not show such increase. Rabbits injected with β-chain, but not α-chain, sh