In [3]:
import glob
import ast
import pandas as pd


# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# keeps the list (and therefore the element at index 0) stable across runs.
processed_files_list = sorted(glob.glob('raw_batch_data/processed_batch_*.csv'))

#### Load the data

In [5]:
# For now only the first processed batch file is read; later we will create
# a script to pull the data, process it and store it in a folder.
first_batch_path = processed_files_list[0]
data = pd.read_csv(first_batch_path)

In [219]:
# Parse the string stored in each DescriptorName cell into a Python object with
# ast.literal_eval. Values that are already lists are kept as they are, so
# re-running this cell does not fail inside literal_eval (which only accepts
# strings or AST nodes).
data['DescriptorName'] = data['DescriptorName'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

In [220]:
# Give every element of a DescriptorName list its own row; the other columns
# are repeated on each of those rows.
data_large = data.explode('DescriptorName')

In [221]:
# Rows without a descriptor name are of no use for this analysis, because the
# descriptor name is what we need to find the root category.
data_large = data_large[data_large['DescriptorName'].notna()]

In [222]:
# Load the descriptor mapper that is later joined on its 'Descriptor Name' column.
dn = pd.read_csv('descriptor_mapper.csv')

In [223]:
# Preview the first rows of the descriptor mapper.
dn.head()

Unnamed: 0,Descriptor Name,Root Category,Descriptor Name Lower
0,(4-(m-Chlorophenylcarbamoyloxy)-2-butynyl)trim...,['D'],(4-(m-chlorophenylcarbamoyloxy)-2-butynyl)trim...
1,"1,2-Dihydroxybenzene-3,5-Disulfonic Acid Disod...",['D'],"1,2-dihydroxybenzene-3,5-disulfonic acid disod..."
2,"1,2-Dimethylhydrazine",['D'],"1,2-dimethylhydrazine"
3,"1,2-Dipalmitoylphosphatidylcholine",['D'],"1,2-dipalmitoylphosphatidylcholine"
4,"1,4-alpha-Glucan Branching Enzyme",['D'],"1,4-alpha-glucan branching enzyme"


In [224]:
# Parse the string stored in each 'Root Category' cell (e.g. "['D']") into a
# Python list. Values that are already lists are kept as they are, so
# re-running this cell does not fail inside ast.literal_eval.
dn['Root Category'] = dn['Root Category'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

#### The problem with a left join

In [225]:
# Left join: every exploded descriptor row is kept, whether or not the mapper
# has an entry for it.
data_merged = pd.merge(
    data_large,
    dn,
    how='left',
    left_on='DescriptorName',
    right_on='Descriptor Name',
)

In [226]:
# Preview the left-joined frame.
data_merged.head()

Unnamed: 0,PMID,date_completed,NumberOfReferences,AbstractText,ArticleTitle,AuthorList,DescriptorName,QualifierName,Descriptor Name,Root Category,Descriptor Name Lower
0,25192366,2021-06-25,0,"The link between obesity and diabetes, hyperte...",Trends in Body-Mass Index After Donor Nephrect...,"{'Author': {'LastName': ['Movassaghi', 'Dru', ...",Body Mass Index,"['', '', '', 'adverse effects', '', 'adverse e...",Body Mass Index,[EGN],body mass index
1,25192366,2021-06-25,0,"The link between obesity and diabetes, hyperte...",Trends in Body-Mass Index After Donor Nephrect...,"{'Author': {'LastName': ['Movassaghi', 'Dru', ...",Humans,"['', '', '', 'adverse effects', '', 'adverse e...",Humans,[B],humans
2,25192366,2021-06-25,0,"The link between obesity and diabetes, hyperte...",Trends in Body-Mass Index After Donor Nephrect...,"{'Author': {'LastName': ['Movassaghi', 'Dru', ...",Kidney,"['', '', '', 'adverse effects', '', 'adverse e...",Kidney,[A],kidney
3,25192366,2021-06-25,0,"The link between obesity and diabetes, hyperte...",Trends in Body-Mass Index After Donor Nephrect...,"{'Author': {'LastName': ['Movassaghi', 'Dru', ...",Kidney Transplantation,"['', '', '', 'adverse effects', '', 'adverse e...",Kidney Transplantation,[E],kidney transplantation
4,25192366,2021-06-25,0,"The link between obesity and diabetes, hyperte...",Trends in Body-Mass Index After Donor Nephrect...,"{'Author': {'LastName': ['Movassaghi', 'Dru', ...",Living Donors,"['', '', '', 'adverse effects', '', 'adverse e...",Living Donors,[M],living donors


In [227]:
# (rows, columns) of the left-joined frame.
data_merged.shape

(2747003, 11)

In [228]:
# Rows for which the left join found no root category.
data_merged.loc[data_merged['Root Category'].isna()].head()

Unnamed: 0,PMID,date_completed,NumberOfReferences,AbstractText,ArticleTitle,AuthorList,DescriptorName,QualifierName,Descriptor Name,Root Category,Descriptor Name Lower
42,25039577,2020-08-20,0,To evaluate the efficacy of electric and conve...,Assessment of the efficacy of the utilisation ...,"{'Author': {'LastName': ['Nobre', 'Gomes', 'Go...",Female,"['', '', '', '', '', '', '', '', '', '']",,,
44,25039577,2020-08-20,0,To evaluate the efficacy of electric and conve...,Assessment of the efficacy of the utilisation ...,"{'Author': {'LastName': ['Nobre', 'Gomes', 'Go...",Male,"['', '', '', '', '', '', '', '', '', '']",,,
49,25163772,2020-04-15,0,We compared the incidence of renal simple cyst...,Presence of Renal Simple Cysts Is Associated W...,"{'Author': {'LastName': ['Song', 'Park'], 'For...",Female,"['', 'diagnostic imaging', '', '', '', 'compli...",,,
53,25163772,2020-04-15,0,We compared the incidence of renal simple cyst...,Presence of Renal Simple Cysts Is Associated W...,"{'Author': {'LastName': ['Song', 'Park'], 'For...",Male,"['', 'diagnostic imaging', '', '', '', 'compli...",,,
66,25134450,2020-01-01,0,,The CROWN Initiative: Journal editors invite r...,"{'Author': {'LastName': ['Khan', ''], 'ForeNam...",Female,"['', '', '', '', '', '', '', 'methods', '', ''...",,,


In [229]:
# Distinct descriptor names that have no root category after the left join.
data_merged.loc[data_merged['Root Category'].isna(), 'DescriptorName'].unique()

array(['Female', 'Male'], dtype=object)

For now the descriptor mapper has no MeSH root category for the `Female` and `Male` descriptors, so we leave these rows out by using an inner join below.

#### Inner Join the data

In [230]:
# Inner join: descriptor rows without an entry in the mapper are dropped.
data_merged = pd.merge(
    data_large,
    dn,
    how='inner',
    left_on='DescriptorName',
    right_on='Descriptor Name',
)

In [231]:
import numpy as np
def _root_letters(category_lists):
    """Return the sorted, de-duplicated category letters found in ``category_lists``.

    ``category_lists`` is an iterable of lists of strings (one list per merged
    descriptor row of a PMID). Every character of every string is treated as
    one root-category code, which is what the previous ``set(''.join(...))``
    step did.
    """
    return sorted({
        letter
        for categories in category_lists
        for category in categories
        for letter in category
    })


# Long format: one row per (PMID, root-category letter).
# The lists are iterated directly instead of going through
# np.array(..., dtype='object') + flatten(): that route only produced strings
# when every list in a group had the same length, and raised a TypeError in
# ''.join otherwise. Sorting also makes the exploded row order deterministic.
root_data = (
    data_merged
    .groupby('PMID')['Root Category']
    .apply(_root_letters)
    .to_frame()
    .reset_index()
    .explode('Root Category')
)

In [232]:
# Constant indicator column; it is used as the cell value when the table is
# pivoted to one column per root category.
root_data['Values'] = 1

In [233]:
# Preview the long-format (PMID, Root Category, Values) rows.
root_data.head()

Unnamed: 0,PMID,Root Category,Values
0,25011618,D,1
0,25011618,G,1
0,25011618,B,1
0,25011618,L,1
0,25011618,E,1


In [234]:
# Wide indicator table: one row per PMID and one column per root category.
# PMID/category combinations that do not occur are filled with 0.
finish_data = root_data.pivot_table(
    index='PMID',
    columns='Root Category',
    values='Values',
    fill_value=0,
)

In [235]:
# Display the PMID x root-category indicator table.
finish_data

Root Category,A,B,C,D,E,F,G,H,I,J,K,L,M,N,Z
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
25011618,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
25011619,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0
25011620,1,1,0,0,1,0,1,0,1,0,0,1,1,1,0
25011621,1,1,0,0,1,0,1,0,0,1,0,1,1,0,0
25011622,1,1,1,0,1,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25335586,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1
25335587,0,1,0,0,0,0,0,1,0,0,1,0,1,1,1
25335588,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1
25335589,1,1,0,0,1,0,0,0,1,0,0,0,1,1,1


In [236]:
# Keep only the record-level columns. After the explode and merge these values
# repeat on several rows, so the duplicates are collapsed.
meta_data = data_merged.loc[
    :,
    [
        'PMID',
        'date_completed',
        'NumberOfReferences',
        'AbstractText',
        'ArticleTitle',
        'AuthorList',
    ],
].drop_duplicates()

In [237]:
# Attach the root-category indicator columns to the record-level metadata.
final_data = pd.merge(meta_data, finish_data, how='inner', on='PMID')

In [238]:
# (rows, columns) after joining the metadata with the indicator columns.
final_data.shape

(234190, 21)

We have decided to remove the records with missing abstract text, since in the future we will be training our models on the abstract text.

In [239]:
# Number of rows whose AbstractText is missing.
final_data['AbstractText'].isna().sum()

34797

In [240]:
# Keep only the rows that have an abstract text.
final_data = final_data[final_data['AbstractText'].notna()]

In [241]:
# Inspect the last rows of the filtered frame.
final_data.tail()

Unnamed: 0,PMID,date_completed,NumberOfReferences,AbstractText,ArticleTitle,AuthorList,A,B,C,D,...,F,G,H,I,J,K,L,M,N,Z
234175,25083864,2015-05-02,0,AAA+ proteases are responsible for protein deg...,"Chance, destiny, and the inner workings of ClpXP.","{'Author': {'LastName': ['Russell', 'Matousche...",0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
234177,25165917,2015-12-05,0,In recent experiments at the velocity filter S...,Signatures of the Z = 82 shell closure in α-de...,"{'Author': {'LastName': ['Andreyev', 'Huyse', ...",0,0,0,1,...,0,1,1,0,0,0,0,0,0,0
234178,25294248,2015-10-04,0,Unlike a new generation of scientists that are...,Learning physics of living systems from Dictyo...,"{'Author': {'LastName': ['Levine'], 'ForeName'...",0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
234181,25192429,2016-03-17,0,While predetermined débitage technologies are ...,Predetermined flake production at the Lower/Mi...,"{'Author': {'LastName': ['Shimelmitz', 'Kuhn',...",0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
234183,25209872,2015-05-20,0,Use of the modern parallel programming languag...,Resolutions of the Coulomb operator: VIII. Par...,"{'Author': {'LastName': ['Limpanuparb', 'Milth...",0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [242]:
# (rows, columns) after dropping the rows with missing AbstractText.
final_data.shape

(199393, 21)

In [243]:
import pathlib
# Create the output folder (and any missing parents); exist_ok=True means an
# already existing folder is not an error, so the cell can be re-run.
pathlib.Path('./training_data').mkdir(parents=True, exist_ok=True)

In [244]:
# Write the training table without the DataFrame index.
# NOTE(review): the output name is hard-coded to batch "800_810", while the
# input is whichever file is first in processed_files_list; confirm that both
# refer to the same batch.
final_data.to_csv('./training_data/training_batch_800_810.csv', index=False)