In [4]:
from pyspark.sql import SparkSession
# Base directory under which Spark, the JVM and Hive are told to put scratch files.
spark_path = "/home/shsa3327"

# Build (or reuse) a Hive-enabled SparkSession. The chain is wrapped in
# parentheses so that no backslash line continuations are needed; the options
# and their order are unchanged.
spark = (
    SparkSession.builder
    .config('spark.driver.memory', '40g')
    .config('spark.executor.memory', '20g')
    .config('spark.executor.cores', '30')
    .config('spark.local.dir', f'{spark_path}/tmp')
    .config('spark.driver.maxResultSize', '40g')
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024")
    .config("spark.sql.parquet.enableVectorizedReader", "true")
    .config('spark.driver.extraJavaOptions', f'-Djava.io.tmpdir={spark_path}/tmp')
    .config('spark.executor.extraJavaOptions', f'-Djava.io.tmpdir={spark_path}/tmp')
    .config('hive.exec.scratchdir', f'{spark_path}/tmp/hive')
    .enableHiveSupport()
    .getOrCreate()
)

24/04/21 16:14:01 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Stops the current Spark session.
# NOTE(review): this cell's execution count (3) is lower than that of the
# session-creation cell above (4), i.e. it was run *before* the session was
# (re)created. Executed top-to-bottom it would stop the session that the
# following cells still use via `spark` -- confirm whether this cell belongs
# at the end of the notebook.
spark.stop()

In [5]:
# Location of the combined parquet dataset on local storage.
# NOTE(review): absolute, user-specific path -- consider making it configurable.
combined_df_with_q_c_metadata = '/home/shsa3327/mypetalibrary/pmoa-cite-dataset/aggregated_dateset/combined_pubmed.parquet'
# Load the parquet dataset as a Spark DataFrame.
combined_df_read = spark.read.parquet(combined_df_with_q_c_metadata)
# Last expression of the cell: the number of rows in the DataFrame is displayed.
combined_df_read.count()

                                                                                

1501454

In [6]:
# Display the column names of the loaded DataFrame (later cells look fields up by these names).
combined_df_read.columns

['pmid',
 'secid',
 'paraid',
 'sentid',
 'sentence',
 'citations',
 'relevance_score',
 'q_pmid',
 'q_doi',
 'q_title',
 'q_abstract',
 'q_publication_year',
 'q_cited_by_count',
 'c_pmid',
 'c_doi',
 'c_title',
 'c_abstract',
 'c_publication_year',
 'c_cited_by_count']

In [13]:
# Function to calculate the cosine similarity between the sentence embeddings of two text features
from sentence_transformers import SentenceTransformer, util
import torch
from functools import lru_cache
# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model once for the whole notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Use the GPU when torch reports that CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; `model` is rebound to the returned object.
model = model.to(device)
# Upper bound on the number of cached entries (applies to each cache below).
# Bounding the caches keeps memory use predictable on large datasets; it never
# changes results, only how often something is recomputed.
_EMBEDDING_CACHE_SIZE = 100_000


@lru_cache(maxsize=_EMBEDDING_CACHE_SIZE)
def _encode_text(text):
    """Return the embedding of ``text`` produced by the module-level ``model``.

    Results are memoised per text. The same title/abstract/sentence takes part
    in several feature pairs and recurs across many records, so caching the
    embedding (rather than only the final pair score) avoids running the
    model repeatedly on identical input.
    """
    return model.encode(text)


@lru_cache(maxsize=_EMBEDDING_CACHE_SIZE)
def cosine_similarity(feature1, feature2):
    """Cosine similarity between the sentence embeddings of two strings.

    Args:
        feature1: First text (must be hashable because calls are memoised).
        feature2: Second text (must be hashable because calls are memoised).

    Returns:
        The similarity as a Python float.
    """
    similarity = util.cos_sim(_encode_text(feature1), _encode_text(feature2))
    return similarity.item()

In [9]:
from collections import defaultdict
# Maps each 'pmid' value to the list of rows that carry it.
grouped_records = defaultdict(list)
# collect() materialises every row of the DataFrame in driver memory.
# NOTE(review): this holds the whole dataset (including abstracts) in Python -- confirm that is intended.
collected_data = combined_df_read.collect()
for record in collected_data:
    grouped_records[record['pmid']].append(record)
# Report how many distinct pmid groups were found.
print(f"Total groups: {len(grouped_records)}")

                                                                                

Total groups: 4992


In [14]:
import numpy as np
from tqdm import tqdm
import time
# One row per record, laid out as:
#   [pmid,
#    <cosine similarity for each pair in cosine_pairs, in order>,
#    year_difference      = q_publication_year - c_publication_year,
#    len(q_title), len(c_title), len(q_abstract), len(c_abstract),
#    log2(c_cited_by_count),
#    relevance_score]
embedded_groups = []
# Pairs of text columns whose embedding cosine similarity is used as a feature.
cosine_pairs = [
    ('q_title','c_title'),
    ('q_title','c_abstract'),
    ('q_abstract','c_title'),
    ('q_abstract','c_abstract'),
    ('sentence','c_abstract'),
    ('sentence','c_title')
]
start = time.perf_counter()
for pmid, records in tqdm(grouped_records.items()):
    for record in records:
        record = record.asDict()
        embedding_arr = []
        for a, b in cosine_pairs:
            # Missing (None/empty) text is treated as the empty string; the
            # normalised value is written back so the length features below
            # can rely on it.
            text_a = record.get(a) or ''
            text_b = record.get(b) or ''
            record[a], record[b] = text_a, text_b
            embedding_arr.append(cosine_similarity(text_a, text_b))

        # A missing year used to be replaced by 0, which produced differences
        # of roughly +/-2000 years; use a neutral 0 when either year is unknown.
        q_year = record.get('q_publication_year')
        c_year = record.get('c_publication_year')
        embedding_arr.append(q_year - c_year if q_year and c_year else 0)

        embedding_arr.append(len(record['q_title']))
        embedding_arr.append(len(record['c_title']))
        embedding_arr.append(len(record['q_abstract']))
        embedding_arr.append(len(record['c_abstract']))

        # The dataset has no 'c_in_citations' column, so the previous lookup
        # always yielded 0; the citation count is stored in 'c_cited_by_count'.
        # A missing or zero count maps to 0 (log2 is undefined at 0).
        cited_by_count = record.get('c_cited_by_count')
        embedding_arr.append(np.log2(cited_by_count) if cited_by_count else 0)

        # The label goes last so that downstream code can split features/label
        # with [:-1] and [-1].
        embedding_arr.append(record.get('relevance_score'))
        embedded_groups.append([pmid] + embedding_arr)
end = time.perf_counter()
# embedded_groups holds one entry per record (not per group), so report it as such.
print(f"Time taken to create embeddings for {len(embedded_groups)} records: {end-start} seconds")

  0%|█▎                                                                                                                                                                                                                                                                                    | 23/4992 [06:51<24:41:32, 17.89s/it]


KeyboardInterrupt: 

In [48]:
import pandas as pd
# Turn the list of per-record feature rows into a pandas frame and then a Spark DataFrame.
# NOTE(review): no column names are supplied, so the columns presumably end up
# with positional labels -- consider naming them.
df = spark.createDataFrame(pd.DataFrame(embedded_groups))
# Persist the features as parquet under a relative path.
# NOTE(review): with Spark's default save mode this is expected to fail if
# "data.parquet" already exists, e.g. on a re-run -- confirm the desired mode.
df.write.parquet("data.parquet")

In [None]:
# Divide into test and train groups. The split is done on group ids (pmids)
# so that all rows of one query group land on the same side.
test_size = 0.2
# embedded_groups is a flat list of rows whose first element is the group id,
# so it has no .keys(); collect the distinct ids in first-seen order instead.
group_ids = list(dict.fromkeys(row[0] for row in embedded_groups))
test_groups_size = int(len(group_ids)*test_size)
train_groups_size = len(group_ids)-test_groups_size
test_groups = group_ids[:test_groups_size]
train_groups = group_ids[test_groups_size:]

In [None]:
from collections import Counter
# embedded_groups is a flat list of rows shaped [group_id, features..., label],
# so it cannot be indexed by group id; select the rows whose id is in the test split.
test_group_ids = set(test_groups)
test_data = []
test_queries = []
for row in embedded_groups:
    group_id = row[0]
    if group_id in test_group_ids:
        # Drop the leading group id: it is the query key, not a feature.
        test_data.append(row[1:])
        test_queries.append(group_id)
# The label is the last element of each row; everything before it is a feature.
X_test = [data[:-1] for data in test_data]
y_test = [data[-1] for data in test_data]
# Show the label distribution of the test split.
Counter(y_test).items()

In [None]:
# embedded_groups is a flat list of rows shaped [group_id, features..., label],
# so it cannot be indexed by group id; select the rows whose id is in the train split.
train_group_ids = set(train_groups)
train_data = []
train_queries = []
for row in embedded_groups:
    group_id = row[0]
    if group_id in train_group_ids:
        # Drop the leading group id: it is the query key, not a feature.
        train_data.append(row[1:])
        train_queries.append(group_id)
# The label is the last element of each row; everything before it is a feature.
X_train = [data[:-1] for data in train_data]
y_train = [data[-1] for data in train_data]
# Show the label distribution of the train split.
Counter(y_train).items()

In [None]:
# Scale the labels of both splits by the largest label seen in either split.
max_relevance = max(np.max(y_train), np.max(y_test))
# Convert explicitly to arrays before dividing; the result is an ndarray of
# scaled labels for each split.
y_train = np.asarray(y_train) / max_relevance
y_test = np.asarray(y_test) / max_relevance

In [None]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
# Wrap each split in a CatBoost Pool: features, labels and the per-row query
# group id used by the ranking losses.
train = Pool(data=X_train, label=y_train, group_id=train_queries)
test = Pool(data=X_test, label=y_test, group_id=test_queries)

In [None]:
# Baseline CatBoost settings shared by every ranking run.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=10'],
    verbose=False,
    random_seed=0,
)

# Starts out empty.
parameters = dict()

In [None]:
from copy import deepcopy
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    Args:
        loss_function: Value used for both the 'loss_function' and the
            'train_dir' parameter.
        additional_params: Optional dict of extra parameters; its entries
            override the defaults and the two values derived from
            ``loss_function``.
        train_pool: Pool to fit on.
        test_pool: Pool passed as ``eval_set``.

    Returns:
        The fitted CatBoostRanker.
    """
    # Work on a private copy so the shared defaults are never modified.
    settings = deepcopy(default_parameters)
    settings.update({'loss_function': loss_function, 'train_dir': loss_function})
    settings.update(additional_params if additional_params is not None else {})

    ranker = CatBoostRanker(**settings)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker