In [1]:
# Sample code to test sentence transformer for cosine similiarity
from sentence_transformers import SentenceTransformer, util

sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
cosine_similarity = util.cos_sim(embeddings[0], embeddings[1])
print(cosine_similarity.item())



0.4045591652393341


In [2]:
# Function to calculate cosine similarity between
from sentence_transformers import SentenceTransformer, util
def cosine_similarity(feature1, feature2):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    feature1_embedding = model.encode(feature1)
    feature2_embedding = model.encode(feature2)
    cosine_similarity = util.cos_sim(feature1_embedding, feature2_embedding)
    return cosine_similarity.item()

In [3]:
# Features list
'''
cosine_pairs = [
    ('q_title','c_title'),
    # ('q_title','c_abstract'),
    # ('q_abstract','c_title'),
    # ('q_abstract','c_abstract'),
    ('q_sentence','c_title'),
    # ('q_sentence','c_abstact')
]
year_difference = q_year-c_year
len_c_title = len(c_title)
len_c_abstract = len(c_abstract)
log_c_in_citations = np.log2(c_in_citations)
'''

"\ncosine_pairs = [\n    ('q_title','c_title'),\n    # ('q_title','c_abstract'),\n    # ('q_abstract','c_title'),\n    # ('q_abstract','c_abstract'),\n    ('q_sentence','c_title'),\n    # ('q_sentence','c_abstact')\n]\nyear_difference = q_year-c_year\nlen_c_title = len(c_title)\nlen_c_abstract = len(c_abstract)\nlog_c_in_citations = np.log2(c_in_citations)\n"

In [4]:
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, insert
import time
import psycopg2
from psycopg2.extras import DictCursor

conn_params = {
    "host": "10.224.68.29",
    "port": "5432",
    "database": "pubmed",
    "user": "admin",
    "password": "admin"
}

engine = create_engine('postgresql://', connect_args=conn_params)

query = '''
        SELECT rs.pmid as pmid, rs.sentence as q_sentence, rs.cited_id as cited_id, rs.relevance_score as relevance_score,
               m.title as q_title,cited.title as c_title,m.publication_year as q_year,cited.publication_year as c_year,
               cited.cited_by_count as c_in_citations
        FROM relevance_store_new as rs
        LEFT JOIN metadata as m ON m.pmid = rs.pmid
        LEFT JOIN metadata as cited ON cited.pmid = rs.cited_id
        WHERE rs.pmid IN (select DISTINCT pmid from metadata)
          AND rs.cited_id IN (select DISTINCT pmid from metadata);
        '''
start = time.perf_counter()
with psycopg2.connect(**conn_params) as conn:
    with conn.cursor(cursor_factory=DictCursor) as cursor:
        cursor.execute(query)
        records = cursor.fetchall()
end = time.perf_counter()
print(f"Time taken to fetch records with metadata: {end-start} seconds")

Time taken to fetch records with metadata: 0.9195583340000013 seconds


In [5]:
from collections import defaultdict
grouped_records = defaultdict(list)
for record in records:
    grouped_records[record['pmid']].append(record)
print(f"Total groups: {len(grouped_records)}")

Total groups: 25


In [6]:
import numpy as np
embedded_groups = defaultdict(list)
cosine_pairs = [
    ('q_title','c_title'),
    ('q_sentence','c_title')
]
'''
year_difference = q_year-c_year
len_c_title = len(c_title)
len_c_abstract = len(c_abstract)
log_c_in_citations = np.log2(c_in_citations)
'''
start = time.perf_counter()
for k,v in grouped_records.items():
    for record in v:
        embedding_arr = []
        for a,b in cosine_pairs:
            if not record.get(a):
                record[a]=''
            if not record.get(b):
                record[b]=''
            embedding_arr.append(cosine_similarity(record.get(a,''),record.get(b,'')))
        embedding_arr.append(record.get('q_year')-record.get('c_year'))
        embedding_arr.append(len(record.get('c_title','')))
        embedding_arr.append(np.log2(record.get('c_in_citations')))
        embedding_arr.append(record.get('relevance_score'))
        embedded_groups[k].append(embedding_arr)
end = time.perf_counter()
print(f"Time taken to create embeddings for {len(embedded_groups)} groups: {end-start} seconds")

  embedding_arr.append(np.log2(record.get('c_in_citations')))


Time taken to create embeddings for 25 groups: 296.989437709 seconds


In [7]:
for k,v in embedded_groups.items():
    print(k)

20811592
18811945
19664212
21904682
21044324
22110950
24131735
24966823
24989242
19930637
22747670
22778732
23229565
23497105
21762506
22719179
23316194
22474440
22625433
22909256
24636523
21457543
21608150
23797686
24086524


In [8]:
embedded_groups['18811945'][0:5]

[[0.4175010323524475, 0.3005062937736511, 4, 81, 8.108524456778168, 1],
 [0.35630926489830017, 0.21418243646621704, 6, 108, 7.266786540694901, 1],
 [0.5159871578216553, 0.37430161237716675, 10, 111, 8.312882955284355, 1],
 [0.1846732497215271, 0.19310936331748962, 4, 49, 6.554588851677638, 1],
 [0.6058160066604614, 0.6526394486427307, 13, 84, 6.087462841250339, 1]]

In [None]:
# Divide into test and train groups
test_size = 0.2
test_groups_size = int(len(embedded_groups)*test_size)
train_groups_size = len(embedded_groups)-test_groups_size
test_groups = list(embedded_groups.keys())[:test_groups_size]
train_groups = list(embedded_groups.keys())[test_groups_size:]

In [None]:
from collections import Counter
test_data =[]
test_queries = []
for test_group in test_groups:
    for group in embedded_groups[test_group]:
        test_data.append(group)
        test_queries.append(test_group)
X_test = [data[:-1] for data in test_data]
y_test = [data[-1] for data in test_data]
Counter(y_test).items()

In [None]:
train_data =[]
train_queries = []
for train_group in train_groups:
    for group in embedded_groups[train_group]:
        train_data.append(group)
        train_queries.append(train_group)
X_train = [data[:-1] for data in train_data]
y_train = [data[-1] for data in train_data]
Counter(y_train).items()

In [None]:
max_relevance = np.max(y_train)
y_train /= max_relevance
y_test /= max_relevance

In [None]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
train = Pool(
    data=X_train,
    label=y_train,
    group_id=train_queries
)

test = Pool(
    data=X_test,
    label=y_test,
    group_id=test_queries
)

In [None]:
default_parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10'],
    'verbose': False,
    'random_seed': 0,
}

parameters = {}

In [None]:
from copy import deepcopy
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [None]:
model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10']})