In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import networkx as nx

In [2]:
# Relative path to the data directory. The trailing slash is required: the
# cells below append file names to it directly inside f-strings.
DATA_FOLDER = '../../data/'

In [3]:
# Deserialize the stored papers collection.
# NOTE: pickle.load can execute arbitrary code; only load trusted project files.
papers_path = f'{DATA_FOLDER}papers.pickle'
with open(papers_path, 'rb') as handle:
    papers = pickle.load(handle)

In [4]:
# Turn the loaded papers object into a DataFrame (default column orientation).
papers_df = pd.DataFrame(data=papers)

In [5]:
# Deserialize the organization / paper / location table.
# NOTE: pickle.load can execute arbitrary code; only load trusted project files.
organization_paper_location_path = f'{DATA_FOLDER}organization_paper_location_df.pickle'
with open(organization_paper_location_path, 'rb') as handle:
    organization_paper_location_df = pickle.load(handle)


In [6]:
# Preview the loaded table (columns: title, organization, location).
organization_paper_location_df

Unnamed: 0,title,organization,location
0,Predicting lncRNA-protein interactions by mach...,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2..."
1,Predicting lncRNA-protein interactions by mach...,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市..."
2,Recent advances in predicting protein-lncRNA i...,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005..."
3,Recent advances in predicting protein-lncRNA i...,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40..."
4,Recent advances in predicting protein-lncRNA i...,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
...,...,...,...
87,LncRNA-ID: Long non-coding RNA IDentification ...,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
88,LncRNA-ID: Long non-coding RNA IDentification ...,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central..."
89,PRPI-SC: an ensemble deep learning model for p...,Jomo Kenyatta University of Agriculture and Te...,(Jomo Kenyatta University of Agriculture and T...
90,Linc2function: A Comprehensive Pipeline and We...,Monash University,"(Monash University, Mile Lane, International H..."


In [7]:
# Total number of rows, i.e. title entries including repeats.
organization_paper_location_df['title'].shape[0]

89

In [8]:
# Number of distinct (non-missing) titles.
len(organization_paper_location_df['title'].dropna().unique())

46

Titles repeat across rows: the 89 organization/paper records cover only 46 unique titles. This happens because a paper can have multiple authors, and those authors can be affiliated with different organizations.

The duplicate records will be dropped, keeping the first row for each title. However, this means a paper may end up associated with the organization of an author who is not the true first author. To resolve this, the first author of each paper needs to be identified in the earlier data-processing step.

In [9]:
# Keep one row per title (the first occurrence, which is the default `keep`).
# Rebinding the name instead of using inplace=True keeps the step explicit and
# chainable; the resulting frame has the same rows and index as before.
organization_paper_location_df = organization_paper_location_df.drop_duplicates(subset=['title'])

In [10]:
# Attach each paper's abstract to its organization/location row by matching on
# title; the inner join keeps only titles present in both frames.
paper_abstracts = papers_df[['title', 'abstract']]
organization_paper_location_abstract_df = organization_paper_location_df.merge(
    paper_abstracts,
    how='inner',
    on='title',
)

In [11]:
# Remove entries with missing abstracts: keep only rows whose abstract is present.
has_abstract = organization_paper_location_abstract_df['abstract'].notna()
organization_paper_location_abstract_df = organization_paper_location_abstract_df.loc[has_abstract]

In [12]:
# Preview the merged table, now including the abstract column.
organization_paper_location_abstract_df

Unnamed: 0,title,organization,location,abstract
0,Predicting lncRNA-protein interactions by mach...,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...","Here, we aim to provide a review of machine-le..."
1,Recent advances in predicting protein-lncRNA i...,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",classified into the deep learning-based method...
2,Prediction of plant lncRNA by ensemble machine...,Roche,"(Roche, La Tour-du-Pin, Isère, Auvergne-Rhône-...",Multiple machine learning approaches to lncRNA...
3,Long non-coding RNA and RNA-binding protein in...,Qatar Foundation,"(المؤسسة القطرية - كابينة 3, شارع 2730, المدين...",interplay between lncRNAs and lncRNAs and RBP...
4,A four-methylated LncRNA signature predicts su...,IUB,"(Iub, Dollo, ሶማሌ ክልል / Somali, ኢትዮጵያ, (8.23333...",In order to identify the optimal prognostic si...
5,LncMachine: a machine learning algorithm for l...,Stanford University,"(Stanford University, 408, Panama Mall, Stanfo...",We evaluated the performance of machine learni...
6,CRlncRC: a machine learning-based method for c...,Columbia University,"(Columbia University, Broadway, Manhattan Comm...",learning models on measurements of model sensi...
7,Machine learning-based identification of tumor...,The Second Affiliated Hospital,"(深圳市第二人民医院, 泥岗西路, 黄木岗社区, 华富街道, 福田区, 深圳市, 广东省, ...",lncRNAs lncRNA (TIIClncRNA) in low-grade gliom...
8,Evaluation of machine learning models that pre...,Computer Science & Electrical Engineering,"(Kenneth H Keller Hall, 200, Southeast Union S...",Our literature survey identified machine learn...
9,Machine learning-based construction of a ferro...,Beijing Institute of Technology,"(北京理工大学, 5, 中关村南大街, 北下关街道, 海淀区, 北京市, 100872, 中...",We have identified lncrna related to iron deat...


In [13]:
# Persist the merged frame so later steps can reload it without redoing the join.
merged_output_path = f'{DATA_FOLDER}organization_paper_location_abstract_df.pickle'
with open(merged_output_path, 'wb') as handle:
    pickle.dump(
        organization_paper_location_abstract_df,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

Generate abstract similarity using spaCy

In [14]:
# Pull the abstract column out as a plain Python list, preserving row order.
abstract_column = organization_paper_location_abstract_df['abstract']
abstracts = abstract_column.to_list()

In [15]:
# Load the spaCy pipeline used to process the abstracts below.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [16]:
# Compute similarity matrix (setup).
# Run every abstract through the spaCy pipeline. nlp.pipe processes the texts
# in batches and yields one Doc per text in input order, which is the
# recommended way to process many texts instead of calling nlp() per item.
docs = list(nlp.pipe(abstracts))
n = len(docs)
# n x n matrix of zeros; entry [i, j] is filled with the similarity between
# abstract i and abstract j by the pairwise loop that follows.
similarity_matrix = np.zeros((n, n))

In [17]:
# Fill the pairwise similarity matrix. Doc.similarity is a symmetric measure
# (cosine similarity of the document vectors by default), so each unordered
# pair is scored once and mirrored, roughly halving the number of calls.
for i in range(n):
    for j in range(i, n):
        pair_similarity = docs[i].similarity(docs[j])
        similarity_matrix[i, j] = pair_similarity
        similarity_matrix[j, i] = pair_similarity

In [18]:
# Persist the similarity matrix for reuse in later steps.
similarity_output_path = f'{DATA_FOLDER}similarity_matrix.pickle'
with open(similarity_output_path, 'wb') as handle:
    pickle.dump(
        similarity_matrix,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )