# Notebook to analyse paragraph classification 
Classification carried out using the MatBERT synthesis classifier model from the Ceder group. 160 annotated paragraphs are contained in the dataset

In [1]:
import os
import pandas as pd
import numpy
import ast
import random
import json

In [2]:
#Loading in annotated dataset
path = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/para_dataset'
file = 'p_dataset_3.csv'
p_annotated = pd.read_csv(os.path.join(path,file), encoding = 'utf-8')
print(p_annotated.shape)
p_annotated['model class'] =''
p_annotated.head(5)

(161, 3)


Unnamed: 0,DOI,paragraph,class,model class
0,10.1007/s10876-022-02339-x,CQDs were synthesized by the usage of O. basil...,1,
1,10.1002/bio.3407,"First, 0.5 g O-phenylenediamine and 0.5 g dicy...",1,
2,10.1016/j.saa.2022.121139,Si-CQDs were synthesized by a hydrothermal met...,1,
3,10.1039/C6TB00519E,N@C-dots were synthesized via a hydrothermal p...,1,
4,10.1038/s41467-021-25640-1_no_0,The p-GQDs were synthesized following a typica...,1,


In [3]:
print('The number of positive (hydrothermal synthesis) paragraphs in the dataset is', p_annotated['class'].value_counts()[1])
print('The number of negative (non-hydrothermal synthesis) paragraphs in the dataset is', p_annotated['class'].value_counts()[0])

The number of positive (hydrothermal synthesis) paragraphs in the dataset is 87
The number of negative (non-hydrothermal synthesis) paragraphs in the dataset is 74


In [4]:
#Loading in classification results
file = 'classification_results.txt'
with open(os.path.join(path, file),'r', encoding='utf-8') as f:
    contents = f.readlines()
    
p_results = pd.DataFrame(columns = ['DOI', 'text', 'result'])
for line in contents:
    dict_line = ast.literal_eval(line.strip())
    new_df = pd.DataFrame([dict_line])
    p_results = pd.concat([p_results, new_df], axis=0, ignore_index=True)

print(p_results.shape)
p_results.head()

(161, 3)


Unnamed: 0,DOI,text,result
0,10.1007/s10876-022-02339-x,CQDs were synthesized by the usage of O. basi...,"(hydrothermal_ceramic_synthesis, 0.99820625782..."
1,10.1002/bio.3407,"First, 0.5 g O-phenylenediamine and 0.5 g dic...","(hydrothermal_ceramic_synthesis, 0.99682497978..."
2,10.1016/j.saa.2022.121139,Si-CQDs were synthesized by a hydrothermal me...,"(hydrothermal_ceramic_synthesis, 0.99691748619..."
3,10.1039/C6TB00519E,N@C-dots were synthesized via a hydrothermal ...,"(hydrothermal_ceramic_synthesis, 0.99819368124..."
4,10.1038/s41467-021-25640-1_no_0,The p-GQDs were synthesized following a typic...,"(hydrothermal_ceramic_synthesis, 0.99432665109..."


In [5]:
class_result = []
for result in p_results['result'].to_list():
    if result[0] == 'hydrothermal_ceramic_synthesis':
        class_result.append(1)
    else:
        class_result.append(0)

In [6]:
print(len(class_result))
p_results['model class'] = class_result
p_results.head()

161


Unnamed: 0,DOI,text,result,model class
0,10.1007/s10876-022-02339-x,CQDs were synthesized by the usage of O. basi...,"(hydrothermal_ceramic_synthesis, 0.99820625782...",1
1,10.1002/bio.3407,"First, 0.5 g O-phenylenediamine and 0.5 g dic...","(hydrothermal_ceramic_synthesis, 0.99682497978...",1
2,10.1016/j.saa.2022.121139,Si-CQDs were synthesized by a hydrothermal me...,"(hydrothermal_ceramic_synthesis, 0.99691748619...",1
3,10.1039/C6TB00519E,N@C-dots were synthesized via a hydrothermal ...,"(hydrothermal_ceramic_synthesis, 0.99819368124...",1
4,10.1038/s41467-021-25640-1_no_0,The p-GQDs were synthesized following a typic...,"(hydrothermal_ceramic_synthesis, 0.99432665109...",1


In [7]:
for doi in p_results['DOI'].to_list():
    p_annotated.loc[p_annotated['DOI'] == doi,'model class'] = p_results.loc[p_results['DOI']==doi, 'model class'].values

p_annotated.head()

Unnamed: 0,DOI,paragraph,class,model class
0,10.1007/s10876-022-02339-x,CQDs were synthesized by the usage of O. basil...,1,1
1,10.1002/bio.3407,"First, 0.5 g O-phenylenediamine and 0.5 g dicy...",1,1
2,10.1016/j.saa.2022.121139,Si-CQDs were synthesized by a hydrothermal met...,1,1
3,10.1039/C6TB00519E,N@C-dots were synthesized via a hydrothermal p...,1,1
4,10.1038/s41467-021-25640-1_no_0,The p-GQDs were synthesized following a typica...,1,1


True positive(TP) = hydrothermal synthesis paragraph and hydrothermal prediction (1,1)

False negative(FN) = hydrothermal synthesis paragraph and non-hydrothermal prediction (1,0)

True negative(TN) = non-hydrothermal synthesis paragraph and non-hydrothermal prediction (0,0)

False positive(FP) = non-hydrothermal synthesis paragraph and hydrothermal prediction (0,1)

Accuracy = all true predictions / all predictions 

Precision = TP / TP + FP 

Recall = TP / TP + FN 

F1 = 2xPrecisionxRecall / Precision + Recall 

In [8]:
p_annotated['outcome label'] = ''
for doi in p_annotated['DOI'].to_list():
    if p_annotated.loc[p_annotated['DOI'] == doi, 'class'].values == 1 and p_annotated.loc[p_annotated['DOI'] == doi, 'model class'].values == 1:
        p_annotated.loc[p_annotated['DOI'] == doi, 'outcome label'] = 'TP'
    elif p_annotated.loc[p_annotated['DOI'] == doi, 'class'].values == 1 and p_annotated.loc[p_annotated['DOI'] == doi, 'model class'].values == 0:
        p_annotated.loc[p_annotated['DOI'] == doi, 'outcome label'] = 'FN'
    elif p_annotated.loc[p_annotated['DOI'] == doi, 'class'].values == 0 and p_annotated.loc[p_annotated['DOI'] == doi, 'model class'].values == 0:
        p_annotated.loc[p_annotated['DOI'] == doi, 'outcome label'] = 'TN'
    elif p_annotated.loc[p_annotated['DOI'] == doi, 'class'].values == 0 and p_annotated.loc[p_annotated['DOI'] == doi, 'model class'].values == 1:
        p_annotated.loc[p_annotated['DOI'] == doi, 'outcome label'] = 'FP'

p_annotated.head()

Unnamed: 0,DOI,paragraph,class,model class,outcome label
0,10.1007/s10876-022-02339-x,CQDs were synthesized by the usage of O. basil...,1,1,TP
1,10.1002/bio.3407,"First, 0.5 g O-phenylenediamine and 0.5 g dicy...",1,1,TP
2,10.1016/j.saa.2022.121139,Si-CQDs were synthesized by a hydrothermal met...,1,1,TP
3,10.1039/C6TB00519E,N@C-dots were synthesized via a hydrothermal p...,1,1,TP
4,10.1038/s41467-021-25640-1_no_0,The p-GQDs were synthesized following a typica...,1,1,TP


In [9]:
TP = p_annotated['outcome label'].value_counts()['TP']
FP = p_annotated['outcome label'].value_counts()['FP']
TN = p_annotated['outcome label'].value_counts()['TN']
FN = p_annotated['outcome label'].value_counts()['FN']

print(f'TP = {TP}, FP = {FP}, TN = {TN}, FN = {FN}')

TP = 80, FP = 3, TN = 71, FN = 7


In [10]:
#Accuracy
accuracy = (TP + TN) / (TP + FP + TN  + FN)
print(f'The accuracy is {accuracy:.2f}')

The accuracy is 0.94


In [11]:
#Precision
precision = TP / (TP + FP)
print(f'The precision is {precision:.2f}')

The precision is 0.96


In [12]:
#Recall
recall = TP / (TP + FN)
print(f'The recall is {recall:.2f}')

The recall is 0.92


In [13]:
#F1 score
f1_score = 2*precision*recall / (precision+recall)
print(f'The F1 score is {f1_score:.2f}')

The F1 score is 0.94


## Analysis of FN and FP

In [14]:
#False positives
FP_df = p_annotated[p_annotated['outcome label'] == 'FP']
print(FP_df['DOI'].to_list())
FP_df

['10.1016/j.mtchem.2021.100755_no_0', '10.1038/s41598-022-22518-0_no_0', '10.1038/s41598-022-22518-0_no_1']


Unnamed: 0,DOI,paragraph,class,model class,outcome label
85,10.1016/j.mtchem.2021.100755_no_0,GQDs were synthesized by hydrothermal method u...,0,1,FP
126,10.1038/s41598-022-22518-0_no_0,The CQDs were synthesized through the hydrothe...,0,1,FP
127,10.1038/s41598-022-22518-0_no_1,was suspended in 80 mL of deionized water and ...,0,1,FP


10.1038/ncomms6357 - actual hydrothermal method (mislabeled during annotation)

10.1016/j.mtchem.2021.100755_no_0 - fragment of hydrothermal synthesis paragraph, actual synthesis condtions not described but is mentioned that hydrothermal method was used

10.1038/s41598-022-22518-0_no_0, 10.1038/s41598-022-22518-0_no_1 - fragment of hydrothermal synthesis paragraph, mistake in formatting article, the synthesis description is split mid sentence

In [15]:
#False negatives
FN_df = p_annotated[p_annotated['outcome label'] == 'FN']
print(FN_df['DOI'].to_list())
FN_df

['10.1038/s41598-019-55996-w', '10.1080/00032719.2020.1759618_no_0', '10.1016/j.mtchem.2021.100755_no_1', '10.1080/03067319.2021.2004589', '10.1021/acsanm.9b01446_no_0', '10.1021/acsanm.9b01446_no_1', '10.3390/molecules27248728_no_0']


Unnamed: 0,DOI,paragraph,class,model class,outcome label
36,10.1038/s41598-019-55996-w,"Citric acid (5 g), mercaptoethylamine (0.8 g) ...",1,0,FN
78,10.1080/00032719.2020.1759618_no_0,CQDS were synthesized by a simple hydrothermal...,1,0,FN
86,10.1016/j.mtchem.2021.100755_no_1,The syntheses of the GQDs were carried out in ...,1,0,FN
87,10.1080/03067319.2021.2004589,0.3 g m-cresol purple was dissolved in 15 mL a...,1,0,FN
129,10.1021/acsanm.9b01446_no_0,All chemicals were purchased from vendors of e...,1,0,FN
130,10.1021/acsanm.9b01446_no_1,CPDs 2 and CPDs 3 were prepared from aconitic ...,1,0,FN
131,10.3390/molecules27248728_no_0,Fat-free cow milk was used as precursor for CD...,1,0,FN


In [16]:
FN_df['model score'] = ''
for doi in FN_df['DOI'].to_list():
    FN_df.loc[FN_df['DOI']==doi, 'model score'] = p_results.loc[p_results['DOI']== doi, 'result'].values
FN_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FN_df['model score'] = ''


Unnamed: 0,DOI,paragraph,class,model class,outcome label,model score
36,10.1038/s41598-019-55996-w,"Citric acid (5 g), mercaptoethylamine (0.8 g) ...",1,0,FN,"(sol_gel_ceramic_synthesis, 0.9898970127105713)"
78,10.1080/00032719.2020.1759618_no_0,CQDS were synthesized by a simple hydrothermal...,1,0,FN,"(something_else, 0.8870745301246643)"
86,10.1016/j.mtchem.2021.100755_no_1,The syntheses of the GQDs were carried out in ...,1,0,FN,"(something_else, 0.7367516756057739)"
87,10.1080/03067319.2021.2004589,0.3 g m-cresol purple was dissolved in 15 mL a...,1,0,FN,"(something_else, 0.9982753992080688)"
129,10.1021/acsanm.9b01446_no_0,All chemicals were purchased from vendors of e...,1,0,FN,"(something_else, 0.9057037234306335)"
130,10.1021/acsanm.9b01446_no_1,CPDs 2 and CPDs 3 were prepared from aconitic ...,1,0,FN,"(sol_gel_ceramic_synthesis, 0.9961193799972534)"
131,10.3390/molecules27248728_no_0,Fat-free cow milk was used as precursor for CD...,1,0,FN,"(something_else, 0.9990077614784241)"


10.1038/s41598-019-55996-w - model predicts sol-gel, method describes hydrothermal process however experimental not written clearly

10.1080/00032719.2020.1759618_no_0 - model predicts something else, basic hydrothermal method described

10.1016/j.mtchem.2021.100755_no_1 - model predicts something else, hydrothermal method described a bit more complex

10.1080/03067319.2021.2004589 - model predicts something else, basic hydrothermal method described, polytetrafluorethylene reactor mentioned

10.1021/acsanm.9b01446_no_0,10.1021/acsanm.9b01446_no_1 - model predicts something else and sol gel, hydrothermal method described over two paragraphs. Method described a bit more complex

10.3390/molecules27248728_no_0 - model predicts something else, basci hydrothermal method described

10.1038/ncomms6357 - actual hydrothermal method (mislabeled during annotation)

10.1016/j.mtchem.2021.100755_no_0 - fragment of hydrothermal synthesis paragraph, actual synthesis condtions not described but is mentioned that hydrothermal method was used

10.1038/s41598-022-22518-0_no_0, 10.1038/s41598-022-22518-0_no_1 - fragment of hydrothermal synthesis paragraph, mistake in formatting article, the synthesis description is split mid sentence


## Generating doi test lists

In [28]:
#list of DOIs that are TP
TP_hits = p_annotated.loc[p_annotated['outcome label']=='TP', 'DOI'].to_list()
print(len(TP_hits))
print(TP_hits[:6])
with open(os.path.join(path, 'TP_hits.txt'), 'w',encoding='utf-8') as f:
    f.write('\n'.join(i for i in TP_hits))
random.seed(100)

80
['10.1007/s10876-022-02339-x', '10.1002/bio.3407', '10.1016/j.saa.2022.121139', '10.1039/C6TB00519E', '10.1038/s41467-021-25640-1_no_0', '10.1038/s41467-021-25640-1_no_1']


In [33]:
TP_hits_10 = random.sample(TP_hits,10)
print(TP_hits_10)

['10.1080/00032719.2020.1759618_no_1', '10.1016/j.saa.2020.118580', '10.1007/s00216-015-9138-8', '10.1021/acs.jpcc.9b06672', '10.1007/s10876-022-02339-x', '10.1038/ncomms6357', '10.1007/s10895-020-02645-5', '10.1007/s10570-017-1230-0', '10.1021/acssuschemeng.9b00027_no_2', '10.1016/j.optmat.2019.05.045']


In [34]:
with open(os.path.join(path, 'TP_hits_10.txt'), 'w',encoding='utf-8') as f:
    f.write('\n'.join(i for i in TP_hits_10))

## Generating 10 paragraph sample

In [37]:
path_2 = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/full_text_tests_json'
file = 'p_dataset_3.txt'
with open(os.path.join(path_2, file),'r', encoding='utf-8') as full_file:
    contents = full_file.readlines()

In [50]:
TP_10_sample = [line for line in contents if any(line.startswith(doi) for doi in TP_hits_10)]

In [51]:
print(len(TP_10_sample))

10


In [52]:
print(TP_10_sample)

['10.1007/s10876-022-02339-x: CQDs were synthesized by the usage of O. basilicum L. extract via a simple hydrothermal method (Fig. 1). In a typical one-step synthesizing procedure, 2.0 g of O. basilicum L. seed was added to 100 mL of distilled water and stirred at 50 °C for 2 h. Then, the obtained extract was filtered and transferred into a 100 mL Teflon-lined stainless-steel autoclave to be heated at 180 °C for 4 h. Once the autoclave was cooled naturally at room temperature and the solution was centrifuged (12,000 rpm) for 15 min, the prepared brown solution was filtered through a fine-grained 0.45 μm membrane to remove larger particles. Finally, the solution was freeze-dried to attain the dark brown powder of CQDs.\n', '10.1007/s00216-015-9138-8: One-pot green synthesis method was applied to prepare nitrogen-doped CNDs by hydrothermal treatment of PVP and glycine as shown in Fig. 1. First, 1 g of PVP and glycine (0, 0.0563, 0.1126, 0.2252, and 0.5630 g) were respectively added to 15

In [60]:
with open(os.path.join(path, 'para_10_sample.txt'),'w', encoding='utf-8') as file:
    file.write(''.join(line for line in TP_10_sample))

# Analysis of performance during dataset generation

In [17]:
# Taking all classification results and compiling into one file/list
path = r'C:\Users\Piotr\OneDrive - Imperial College London\MRes_project_data\paras_id'
batch_numbers = list(range(1,29))
all_para_classifcation_results = []
for batch in batch_numbers:
    filename = f'batch_{batch}_classification_results.txt'
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        data = file.read().splitlines()
        all_para_classifcation_results.extend(data)

In [19]:
all_para_classifcation_results = list(map(ast.literal_eval, all_para_classifcation_results))

In [10]:
print('The number of paragraphs that were classified is: ',len(all_para_classifcation_results))

The number of paragraphs that were classified is:  9303


In [11]:
# # writing all_para_classification_results to file
# with open(os.path.join(path, 'all_classification_results.txt'), 'w', encoding = 'utf-8') as file:
#     for element in all_para_classifcation_results:
#         file.write(element+'\n')

In [13]:
# Getting all paragraphs that were classified and compiling into one file/list
all_paras_text = []
for batch in batch_numbers:
    filename = f'batch_{batch}_paras.txt'
    with open(os.path.join(path, filename), 'r', encoding = 'utf-8') as file:
        data = file.read().splitlines()
        all_paras_text.extend(data)

In [14]:
print(len(all_paras_text))

9303


In [21]:
# Collecting all non-hydrothermal classifications and hydrothermal classifications into separate lists
hydrothermal = []
non_hydrothermal = []
for element in all_para_classifcation_results:
    if element['result'][0] == 'hydrothermal_ceramic_synthesis':
        hydrothermal.append(element['DOI'])
    else:
        non_hydrothermal.append(element['DOI'])

In [22]:
print('The number of  hydrothermal related paragraphs is: ', len(hydrothermal))
print('The number of non-hydrothermal related paragraphs is: ', len(non_hydrothermal))

The number of  hydrothermal related paragraphs is:  3768
The number of non-hydrothermal related paragraphs is:  5535


In [23]:
# Sampling 30 random DOIs from the hydrothermal and non-hydrothermal sets
# hydrothermal_sample = random.choices(hydrothermal, k=30)
# non_hydrothermal_sample = random.choices(non_hydrothermal, k=30)

In [25]:
print(len(hydrothermal_sample))
print(len(non_hydrothermal_sample))

30
30


In [27]:
# Making dataframe of hydrothermal results
hydro_df = pd.DataFrame(columns = ['DOI', 'text'])
hydro_df['DOI'] = hydrothermal_sample
non_hydro_df = pd.DataFrame(columns = ['DOI', 'text'])
non_hydro_df['DOI'] = non_hydrothermal_sample

In [28]:
# Adding text to dataframes
for doi in hydrothermal_sample:
    for para in all_paras_text:
        if para.startswith(doi):
            hydro_df.loc[hydro_df['DOI'] == doi, 'text'] = para.split(':',1)[1]

In [30]:
for doi in non_hydrothermal_sample:
    for para in all_paras_text:
        if para.startswith(doi):
            non_hydro_df.loc[non_hydro_df['DOI'] == doi, 'text'] = para.split(':',1)[1]

In [32]:
hydro_df.to_csv(os.path.join(path, 'hydro_30_sample.csv'), index = False)
non_hydro_df.to_csv(os.path.join(path, 'non_hydro_30_sample.csv'), index = False)

The random sample of 30 relevant paragraphs will be used as the hydrothermal sample instead as it has more FP

In [2]:
# Converting rel_30_sample to csv
path = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/ceder_extract_data'
filename = 'rel_30_sample.txt'
rel_dois =[]
rel_text = []
with open(os.path.join(path, filename),'r', encoding='utf-8') as file:
    data = file.read().splitlines()
    
for entry in data:
    rel_dois.append(entry.split(':',1)[0])
    rel_text.append(entry.split(':',1)[1])

rel_hydro_df = pd.DataFrame(columns = ['DOI', 'text'])
rel_hydro_df['DOI'] = rel_dois
rel_hydro_df['text'] = rel_text

In [8]:
path = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/paras_id'
# rel_hydro_df.to_csv(os.path.join(path, 'rel_hydro_30_sample.csv'), index = False)

In [9]:
# Importing csv to dataframes
hydro_tagged = pd.read_csv(os.path.join(path, 'rel_hydro_30_sample.csv'))
non_hydro_tagged = pd.read_csv(os.path.join(path,'non_hydro_30_sample.csv'))
frames = [hydro_tagged,non_hydro_tagged]
all_df = pd.concat(frames)
print(all_df.shape)
all_df.head(10)

(60, 3)


Unnamed: 0,DOI,text,tag
0,10.1080/00387010.2020.1764589,CDs were prepared from neera via hydrothermal...,TP
1,10.1007/s10904-019-01232-x_no_1,Pure CuO was prepared via a similar approach....,TP
2,10.1039/C8RA09868A,The color-tunable Zn-doped CDs were synthesiz...,TP
3,10.1039/C6NJ01753C,NS-CDs were synthesized by the hydrothermal c...,TP
4,10.1007/s10895-021-02804-2,There are many procedures for synthesis of th...,TP
5,10.1002/MAME.202100339,"For the synthesis of the N‐CDs, the temperatu...",TP
6,10.3390/ma13184146,Nitrogen-doped carbon dots from lutein were s...,TP
7,10.1021/ac3007939,CPs were prepared by hydrothermal treatment o...,TP
8,10.1039/C5AY01715G,"Typically, 0.5 g of urea and 0.16 g of citric...",TP
9,10.1039/D1RA01478A,The CDs were prepared by a modified hydrother...,TP


In [10]:
# Calculating accuracy, precision, recall, F1
TP = all_df['tag'].value_counts()['TP']
TN = all_df['tag'].value_counts()['TN']
FN = all_df['tag'].value_counts()['FN']
FP = all_df['tag'].value_counts()['FP']

accuracy = (TP + TN) / (TP + FP + TN  + FN)
print(f'The accuracy is {accuracy:.2f}')

precision = TP / (TP + FP)
print(f'The precision is {precision:.2f}')

recall = TP / (TP + FN)
print(f'The recall is {recall:.2f}')

f1_score = 2*precision*recall / (precision+recall)
print(f'The F1 score is {f1_score:.2f}')

The accuracy is 0.93
The precision is 0.90
The recall is 0.96
The F1 score is 0.93
