# Data cleaning
Notebook to clean generated dataset for better analysis. Primary objectives are to 

1. remove duplicates 
2. standardize material name
3. standardize value units

In [262]:
import os
import pandas as pd
import json
import re
from collections import Counter
import ast

In [263]:
#Data import
path = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/ceder_extract_data'
# path = r'C:\Users\Piotr\OneDrive - Imperial College London\MRes_project_data\ceder_extract_data'
# file = 'data_batch_2.json'
file = 'all_extracted_data.json'
with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

In [264]:
print(len(raw_data))

3789


## 1. Duplicate removal

In [265]:
dois = []
dois_2 = []
for entry in raw_data:
    dois.append(entry['doi'].lower())
    dois_2.append(entry['doi'])
case_duplicates = [k for k,v in Counter(dois).items() if v>1]
duplicates = [k for k,v in Counter(dois_2).items() if v>1]
print(len(case_duplicates))
print(case_duplicates)
print(len(duplicates))
print(duplicates)

22
['10.1007/s10934-014-9850-3', '10.1007/s00339-015-9027-1', '10.1007/s12649-018-0426-3', '10.1007/s12649-018-00561-1', '10.1007/s00339-018-1894-9', '10.1007/s10948-015-3231-2_no_0', '10.1007/s00339-015-9164-6_no_0', '10.1007/s00339-015-9164-6_no_1', '10.1007/s10404-016-1825-z', '10.1039/c9ay01794a', '10.1039/c3nj01320k_no_0', '10.1039/c3nj01320k_no_1', '10.1007/s10404-016-1763-9', '10.1007/s12648-015-0741-5_no_0', '10.1007/s40789-016-0124-3', '10.1007/s11837-019-03372-4_no_0', '10.1007/s11837-013-0607-2_no_0', '10.1007/s11837-013-0607-2_no_1', '10.1007/s11837-013-0607-2_no_2', '10.1007/s13399-019-00387-4_no_0', '10.1007/s00339-012-6774-0', '10.1007/s42452-019-0178-0']
3
['10.1039/C9AY01794A', '10.1039/C3NJ01320K_no_0', '10.1039/C3NJ01320K_no_1']


In [266]:
for doi in duplicates:
    if doi.lower() in case_duplicates:
        case_duplicates.remove(doi.lower())
print(len(case_duplicates))

19


In [267]:
data = []
removed = []
for entry in raw_data:
    if entry['doi'] in case_duplicates:
        continue
    if entry['doi'] in duplicates and entry['doi'] not in removed:
        removed.append(entry['doi'])
        continue
    else:
        data.append(entry)
print(len(data))

3767


In [268]:
dois = []
dois_2 = []
for entry in data:
    dois.append(entry['doi'].lower())
    dois_2.append(entry['doi'])
case_duplicates = [k for k,v in Counter(dois).items() if v>1]
duplicates = [k for k,v in Counter(dois_2).items() if v>1]
print(len(case_duplicates))
print(case_duplicates)
print(len(duplicates))
print(duplicates)

0
[]
0
[]


In [269]:
# with open(os.path.join(path,'extracted_dois.txt'), 'w', encoding = 'utf-8') as file:
#     file.write('\n'.join(doi for doi in dois_2))

# with open(os.path.join(path, 'all_extracted_data_clean.json'), 'w', encoding = 'utf-8') as file:
#     json.dump(data, file, indent=4,sort_keys=True,ensure_ascii=False)

## Removing non carbon dot related entries
Each doi has a tag relating to relevance to CDs based on abstract and materials mentioned in synthesis procedure

Legend:

(0,0) no mention of CDs in abstract or materials

(1,0) mention of CDs in abstract but no mention in materials

(0,1) no mention of CDs in abstract but mentioned in materials

(1,1) mention of CDs both in absrtact and materials

In [270]:
#importing filter results
# path_abs = r'C:\Users\Piotr\OneDrive - Imperial College London\MRes_project_data\abstracts'
path_abs = '/Users/pnt17/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MRes_project_data/abstracts'
filter_results = []
with open(os.path.join(path_abs, 'filter_results.txt'),'r', encoding = 'utf-8') as file:
    for line in file:
        line = line.strip()
        result = ast.literal_eval(line)
        filter_results.append(result)
print(len(filter_results))

3767


In [271]:
relevant_dois = []
abs_only = []
mat_only = []
both = []
not_rel = []
for result in filter_results:
    if result[1] == 1 and result[2] == 1:
        both.append(result[0])
        relevant_dois.append(result[0])
    elif result[1] == 1 and result[2] == 0:
        abs_only.append(result[0])
        relevant_dois.append(result[0])
    elif result[1] == 0 and result[2] == 1:
        mat_only.append(result[0])
        not_rel.append(result[0])
    elif result[1] == 0 and result[2] == 0:
        not_rel.append(result[0])

In [272]:
print(len(relevant_dois) + len(not_rel))
print(len(relevant_dois))
print(len(not_rel))
print(len(abs_only))
print(len(mat_only))
print(len(both))

3767
1904
1863
1062
12
842


In [273]:
print(mat_only)

['10.1016/j.applthermaleng.2021.116580', '10.1021/CM504042S', '10.1007/s10854-019-01920-x', '10.1007/s10934-021-01053-9', '10.1007/s10854-021-06057-4', '10.1039/D2TB01304E_no_1', '10.1007/s11249-017-0923-8_no_0', '10.1002/pssa.202100808_no_0', '10.1038/srep02925', '10.1002/adfm.202009197', '10.1016/j.microc.2021.106273', '10.1002/adfm.201900093_no_0']


All dois that had no mention of CDs in abstract but mention in materials are not relevant to CDs (most time CD is abbreviatio for cyclodextrin or part of other abbreviation)

Only relevant DOI is

10.1016/j.microc.2021.106273

Is contained in this list because abstract was not found but does mention carbon dots in abstract

10.1016/j.ultsonch.2016.03.004 - should be dropped no mention of carbon dots but CD is in abstract


In [274]:
relevant_dois.append('10.1016/j.microc.2021.106273')
print(len(relevant_dois))

1905


In [275]:
#Creating dataset with only carbon dot related entries
relevant_data = []
for doi in relevant_dois:
    for entry in data:
        if entry['doi'] == doi:
            relevant_data.append(entry)
            break
print(len(relevant_data))

1905


In [276]:
# with open(os.path.join(path, 'relevant_data.json'), 'w', encoding='utf-8') as file:
#     json.dump(relevant_data, file, indent=4, sort_keys=True,ensure_ascii=False)

## 2. Standardizing material names
Have to remove solvents (water, ethanol) as well mention of target materials such as CDs

In [277]:
#Collecting all precursors into one file with all precursors from each entry on one line
# filename = 'all_precursors_raw_rel.txt'
# for entry in relevant_data:
#     with open(os.path.join(path, filename), 'a', encoding='utf-8') as f:
#         f.write(f"'DOI': {entry['doi']}, 'precursors': {list(entry['precursors'].keys())}")
#         f.write('\n')

In [278]:
precursors_raw = []
for entry in relevant_data:
    precursors_raw.extend(list(entry['precursors'].keys()))
print(len(precursors_raw))
precursors_set = list(set(precursors_raw))
print(len(precursors_set))

4370
1749


In [279]:
# for material in precursors_set:
#     with open(os.path.join(path, 'precursors_set.txt'), 'a', encoding = 'utf-8') as file:
#         file.write(material)
#         file.write('\n')

In [280]:
#Removing target and errors and filtering out repetitions of spelling
df = pd.read_csv(os.path.join(path, 'precursors_1.csv'))
df.head()

Unnamed: 0,chemical,tag
0,aspartic acid,
1,C96H30,
2,molasses filtrate,
3,Bi(NO)3·5H2O,
4,Na2S,


In [281]:
chemicals = df.loc[df['tag'].isnull(), 'chemical'].tolist()
chemicals_unaltered = df.loc[df['tag'].isnull(), 'chemical'].tolist()
print(len(chemicals))

1613


In [282]:
# making all chemical names start with lowercase and find duplicates (e.g. Citric acid and citric acid)
for i, chemical in enumerate(chemicals):
    try:
        if chemical[0].isupper() and chemical[1].islower() and chemical[2].islower():
            chemicals[i] = chemical.replace(chemical[0], chemical[0].lower())
    except IndexError:
        continue
chemicals_set = list(set(chemicals))
print(len(chemicals_set))
duplicates = [k for k,v in Counter(chemicals).items() if v>1]

1554


In [283]:
# Duplicate names changed to lowercase in chemical names spreadsheet
print(len(duplicates))
print(duplicates)

59
['luminol', 'ammonium citrate', 'citric acid', 'graphite', 'aluminum nitrate', 'tetrabutyl titanate', 'phosphoric acid', 'copper acetate monohydrate', 'dextrin', 'sodium citrate', 'glycerin', 'arginine', 'trisodium citrate', 'glucose', 'titanium butoxide', 'tartaric acid', 'ethanol', 'oleic acid', 'sodium thiosulfate', 'critic acid', 'urea', 'formaldehyde', 'polyethyleneimine', 'sucrose', 'ginkgo', 'manganese acetate', 'terephthalic acid', 'glycyrrhizic acid', 'sodium alginate', 'malic acid', 'dopamine', 'rutin', 'gelatin', 'ammonium oxalate', 'ethylene diamine', 'sodium acetate', 'citric acid monohydrate', 'nickel chloride hexahydrate', 'milk', 'hydrazine hydrate', 'catechol', 'folic acid', 'thioacetamide', 'zinc acetate', 'mango', 'dicyandiamide', 'ethylenediamine', 'benzophenone', 'polyacrylic acid', 'zinc acetate dihydrate', 'ascorbic acid', 'hydrogen peroxide', 'cys', 'chlorella', 'tangerine', 'acetone', 'chitosan', 'cysteine', 'melamine']


In [284]:
# Creating dictionary with precursors from relevant data
precursor_dict = []
for entry in relevant_data:
    precursor_dict.append({'DOI':entry['doi'], 'precursors': list(entry['precursors'].keys())})

print(len(precursor_dict))

1905


In [285]:
#First step is sorting out specific cases of the same abbreviation having different meanings
def replace_name(precursor_list, old_name, new_name):
    for i, value in enumerate(precursor_list):
        if value == old_name:
            precursor_list[i] = new_name

ascorbic_acid = ['10.1039/D1AY00762A', '10.1021/acs.langmuir.6b04100','10.1007/s00604-017-2526-3', '10.1002/bio.4158', 
                 '10.1039/C8RA03353F_no_0', '10.1021/ACS.JPCC.5B05786', '10.1016/j.ijbiomac.2020.01.238', '10.1039/C7AY01180F_no_0',
                 '10.1039/C7AY01180F_no_1', '10.1039/D1AN00796C', '10.1021/acsami.8b17128']
gum_tr = ['10.1007/s00289-022-04207-0', '10.1016/j.molliq.2018.03.054', '10.1016/j.optmat.2019.109356', '10.1007/s00289-020-03236-x']

for entry in precursor_dict:
    doi = entry['DOI']
    pre = entry['precursors']
    if doi == '10.1080/1536383X.2019.1587747':
        replace_name(pre,'GA', 'gluconic acid')
    if doi in ['10.1007/s10895-021-02870-6', '10.1007/s10895-020-02661-5']:
        replace_name(pre,'GA', 'gallic acid')
    if doi == '10.1007/s40097-021-00431-8_no_8':
        replace_name(pre, 'GA', 'glucosamine')
    if doi == '10.1039/C5TA05189D':
        replace_name(pre, 'AA', 'alginic acid')
    if doi in ascorbic_acid:
        replace_name(pre, 'AA', 'ascorbic acid')
    if doi == '10.3390/nano12173062_no_0':
        replace_name(pre, 'TA', 'tannic acid')
    if doi in ['10.1016/j.apsusc.2019.144567', '10.1007/s10953-016-0531-5_no_0']:
        replace_name(pre, 'TA', 'tartaric acid')
    if doi == '10.1016/j.jcis.2019.04.088':
        replace_name(pre,'TEA', 'triethylamine')
    if doi == '10.1039/C4RA14865G':
        replace_name(pre, 'TEA', 'triethanolamine')
    if doi in ['10.1007/S13399-019-00387-4_no_0', '10.1002/smll.202205065']:
        replace_name(pre, 'CS', 'chitosan')
    if doi == '10.1039/C5RA14585F':
        replace_name(pre, 'CS', 'chondroitin sulfate')
    if doi == '10.1039/D1AY00762A':
        replace_name(pre, 'DPA', 'penicillamine')
    if doi == '10.1002/smll.201700983_no_11':
        replace_name(pre, 'DPA', '9,10-diphenylanthracene')
    if doi == '10.3390/molecules27155021':
        replace_name(pre,'CB', 'carbon black')
    if doi == '10.1016/j.dyepig.2022.110101':
        replace_name(pre, 'CB', 'canon ball fruit')
    if doi in ['10.1039/C7TC01585B', '10.1021/acsomega.0c02627']:
        replace_name(pre, 'TSC', 'citric acid')
    if doi in ['10.1007/s00216-020-02629-1', '10.1002/bio.4083']:
        replace_name(pre, 'TSC', 'thiosemicarbazide')
    if doi == '10.1038/s41467-019-09830-6':
        replace_name(pre, 'MA', 'melamine')
    if doi in ['10.1016/j.apsusc.2019.144567', '10.1007/s10953-016-0531-5_no_0']:
        replace_name(pre, 'MA', 'malic acid')
    if doi == '10.1007/s10853-021-06606-6':
        replace_name(pre, 'HA', 'humic acid')
    if doi == '10.3390/pharmaceutics14112423_no_0':
        replace_name(pre, 'HA', 'hyaluronic acid')
    if doi == '10.1021/acsami.0c00283':
        replace_name(pre,'AC', 'alizarin carmine')
    if doi in ['10.1039/C4RA06757F', '10.1007/s11051-015-3294-9', '10.1007/s10934-022-01421-z']:
        replace_name(pre, 'AC', 'activated carbon')
    if doi in ['10.1021/acs.jafc.8b07176', '10.1039/C7NR05363K']:
        replace_name(pre, 'AC', 'ammonium citrate')
    if doi in gum_tr:
        replace_name(pre, 'GT', 'gum tragacanth')
    if doi in ['10.1007/s10570-021-04270-2', '10.1016/j.carbpol.2020.117387']:
        replace_name(pre, 'GT', 'glutathione')
    if doi == '10.1016/j.apsusc.2018.03.246':
        replace_name(pre, 'PA', 'phytic acid')
    if doi == '10.1039/C7RA02421E':
        replace_name(pre, 'PA', 'phthalic acid')
    if doi == '10.1039/C4TC01857E':
        replace_name(pre, 'UA', 'undecylenic acid')
    if doi in ['10.1002/smll.201804515_no_1', '10.1002/smll.201804515_no_2']:
        replace_name(pre, 'UA', 'urea')
    if doi == '10.1021/acsomega.0c03995':
        replace_name(pre, 'SA', 'alginic acid')
    if doi == '10.1007/s44211-022-00236-x_no_1':
        replace_name(pre, 'SA', 'salicylic acid')
    if doi == '10.1038/s41598-021-83863-0':
        replace_name(pre, 'SA', 'stearic acid')
    if doi == '10.1016/j.indcrop.2022.115568':
        replace_name(pre, 'OA', 'oxalic acid')
    if doi == '10.1039/C4TC01857E':
        replace_name(pre, 'OA', 'oleic acid')
    if doi == '10.1016/j.jmst.2019.03.039':
        replace_name(pre, 'AMP', 'carbon pitch')
    if doi in ['10.1021/acs.analchem.7b01053', '10.1007/s00604-016-2039-5']:
        replace_name(pre, 'AMP', 'adenosine 5′-monophosphate')
    if doi == '10.1007/s00216-019-02293-0':
        replace_name(pre, 'AMP', 'ampicillin')


In [286]:
pre = []
for entry in precursor_dict:
    pre.extend(entry['precursors'])
print(len(list(set(pre))))

1746


In [287]:
# Importing chemical names spreadsheet
df = pd.read_csv(os.path.join(path, 'chemical_names.csv'), encoding='utf-8')
print(df.loc[df['chemical'] == 'MBA', 'name'].values)
df.head()

['N,N′-methylenebisacrylamide']


Unnamed: 0,chemical,tag,name,category
0,aspartic acid,,aspartic acid,ORG
1,C96H30,,superphenalene,ORG
2,molasses filtrate,,,BIO
3,Bi(NO)3·5H2O,,,IN
4,Na2S,,,IN


In [288]:
# Getting list of chemical names to be removed
bad_entries = df.loc[((df['tag'] == 'DEL') | (df['tag'] == 'TARGET')), 'chemical'].tolist()
print(bad_entries[:10])
print(len(bad_entries))

['WAC', 'P-CQDs-2', 'CD-B', 'water‐phase', 'b-CQDs', 'CDYel', 'CDs-PEG-CC', 'EO20PO70EO20', 'HCD', 'P-CQDs']
223


In [289]:
#Removing bad entries from precursors
def remove_elements(original_list, remove_list):
    return [element for element in original_list if element not in remove_list]

for entry in precursor_dict:
    clean_precursors = remove_elements(entry['precursors'], bad_entries)
    entry['precursors'] = clean_precursors

In [290]:
#Checking if removal worked
for entry in precursor_dict:
    if entry['DOI'] == '10.1021/acs.jafc.8b07176':
        print(entry)
        break
new_pre = []
for entry in precursor_dict:
    new_pre.extend(entry['precursors'])
print(len(list(set(new_pre))))
#length should be len before deletion minus 223

{'DOI': '10.1021/acs.jafc.8b07176', 'precursors': ['-cysteine', 'ammonium citrate', 'Cys', 'ammonium citrate']}
1523


In [291]:
# Getting all organic and amino acid chemical names
organics_amino = df.loc[((df['category'] == 'ORG') | (df['category'] == 'AMINO')), 'chemical'].tolist()
print(len(organics_amino))

873


In [292]:
# Creating dataset of precursors that only contain organic and amino chemicals
def keep_elements(original_list, keep_list):
    return[element for element in original_list if element in keep_list]

org_amino_precursor_dict = []
for entry in precursor_dict:
    org_precursors = keep_elements(entry['precursors'], organics_amino)
    org_amino_precursor_dict.append({'DOI':entry['DOI'], 'precursors': org_precursors})
print(org_amino_precursor_dict[4])
print(len(org_amino_precursor_dict))

{'DOI': '10.1002/pat.4901_no_0', 'precursors': ['AEAPMS', 'CA']}
1905


In [293]:
# Normalizing all precursor names and removing duplicates
def find_names(dframe, original_list):
    new_list = []
    for element in original_list:
        new_list.extend(dframe.loc[dframe['chemical'] == element, 'name'].tolist())
    return list(set(new_list))

for entry in org_amino_precursor_dict:
    clean_precursors = find_names(df, entry['precursors'])
    entry['precursors'] = clean_precursors

    

In [294]:
print(org_amino_precursor_dict[4])
print(org_amino_precursor_dict[135:140])

{'DOI': '10.1002/pat.4901_no_0', 'precursors': ['citric acid', 'N-(2-aminoethyl)-3-aminopropyldimethoxysilane']}
[{'DOI': '10.1039/C6RA20534H', 'precursors': []}, {'DOI': '10.1016/j.cej.2016.04.123_no_1', 'precursors': ['PEI']}, {'DOI': '10.1039/D0RA10144C_no_0', 'precursors': ['glucose']}, {'DOI': '10.1016/j.snb.2017.04.092', 'precursors': ['melamine', 'citric acid']}, {'DOI': '10.1016/j.inoche.2022.110041', 'precursors': []}]


In [295]:
# Converting to dataframe and then csv file for later analysis
org_amino_df = pd.DataFrame(columns = ['DOI', 'precursors'])
for entry in org_amino_precursor_dict:
    new_df = pd.DataFrame([entry])
    org_amino_df = pd.concat([org_amino_df, new_df], axis=0, ignore_index=True)
org_amino_df.head(10)

Unnamed: 0,DOI,precursors
0,10.1039/D1FO03426J,[]
1,10.1002/cssc.201700474,[citric acid]
2,10.1007/s13399-020-00839-2_no_0,[ethanol]
3,10.1021/acsami.1c07260,[calcein]
4,10.1002/pat.4901_no_0,"[citric acid, N-(2-aminoethyl)-3-aminopropyldi..."
5,10.1002/pat.4901_no_1,"[citric acid, N-(2-aminoethyl)-3-aminopropyldi..."
6,10.1016/j.apcatb.2018.03.027_no_1,"[citric acid, thiourea]"
7,10.1039/D0NR09131F,"[p-phenylenediamine, ascorbic acid]"
8,10.1039/C6NR06558A,[]
9,10.1039/D1AY00762A,"[penicillamine, ascorbic acid]"


In [296]:
#Saving to csv
org_amino_df.to_csv(os.path.join(path,'org_amino_pre.csv'), columns = ['DOI', 'precursors'], encoding='utf-8', index = False)

In [297]:
print(org_amino_df.loc[org_amino_df['DOI']=='10.1021/ACSOMEGA.8B03191', 'precursors'].values)

[list(['2,2′-(ethylenedioxy)-bis(ethylamine)', 'quinine sulfate', 'malic acid'])]


## 3. Standardizing reaction time and temperature

In [46]:
with open(os.path.join(path, 'relevant_data.json'), 'r', encoding='utf-8') as file:
    relevant_data = json.load(file)

In [39]:
temp_units = []
time_units = []
for entry in relevant_data:
    if not entry['heating_operations']:
        continue
    else:
        for element in entry['heating_operations']:
            if 'temp_values' in element:
                temp_units.append(element['temp_values']['units'])
            if 'time_values' in element:
                time_units.append(element['time_values']['units'])
print(list(set(temp_units)))
print(list(set(time_units)))

['K', None, '°C', 'C']
['h', 'hrs', 'hr', None, 'day', 'min', 'hour', 'hours', 'minutes']


In [47]:
# Creating subset of data containing only doi and heating operations
heating_dict = []
for entry in relevant_data:
    heating_dict.append({'DOI':entry['doi'], 'heating_op': entry['heating_operations']})
print(len(heating_dict))
for entry in heating_dict:
    if entry['DOI'] == '10.1021/acs.inorgchem.0c01243_no_0':
        print(entry)
for entry in relevant_data:
    if entry['doi'] == '10.1021/acs.inorgchem.0c01243_no_0':
        print(entry)      

1905
{'DOI': '10.1021/acs.inorgchem.0c01243_no_0', 'heating_op': [{'subject': 'the white emulsion', 'temp_values': {'max': 180.0, 'min': 180.0, 'units': '°C', 'values': [180.0]}, 'time_values': {'max': 7.0, 'min': 7.0, 'units': 'day', 'values': [7.0]}}]}
{'all_materials': {'H2O': None, 'H3BO3': ['7', 'mmol'], 'N,N-dimethylformamide': ['1', 'mL', '1', 'mL'], 'N-ethyl piperazine(1': None, 'Teflon': ['25', 'mL'], 'Zn(OH)2': None, 'ZnO': None, 'air': ['45.51', '%'], 'en': ['2', 'mL'], 'ethylenediamine': None}, 'doi': '10.1021/acs.inorgchem.0c01243_no_0', 'heating_operations': [{'subject': 'the white emulsion', 'temp_values': {'max': 180.0, 'min': 180.0, 'units': '°C', 'values': [180.0]}, 'time_values': {'max': 7.0, 'min': 7.0, 'units': 'day', 'values': [7.0]}}], 'paragraph': ['White block crystals of 1 were prepared via the so', '...', ' 11.91 wt %; Found: C 10.86, H 2.63, N 12.57 wt %.'], 'precursors': {'H3BO3': ['7', 'mmol'], 'N,N-dimethylformamide': ['1', 'mL', '1', 'mL'], 'N-ethyl pipe

In [42]:
# Functions to convert different units to celsius and hours
def kelvin_to_cel(value_list):
    new_values = []
    for value in value_list:
        new_values.append(value-273)
    return new_values

def day_to_hr(value_list):
    new_values = []
    for value in value_list:
        new_values.append(value*24)
    return new_values

def min_to_hr(value_list):
    new_values = [] 
    for value in value_list:
        new_values.append(round(value/60,2))
    return new_values

In [49]:
# Converting values to celsius and hours
for entry in heating_dict:
    if not entry['heating_op']:
        continue
    else:
        for element in entry['heating_op']:
            if 'temp_values' in element:
                if element['temp_values']['units'] is None:
                    continue
                elif element['temp_values']['units'] == 'K':
                    new_values = kelvin_to_cel(element['temp_values']['values'])
                    new_max = max(new_values)
                    new_min = min(new_values)
                    element['temp_values']['values'] = new_values
                    element['temp_values']['max'] = new_max
                    element['temp_values']['min'] = new_min
                    element['temp_values']['units'] = '°C'
            if 'time_values' in element:
                if element['time_values']['units'] is None:
                    continue
                elif element['time_values']['units'] in ['minutes', 'min']:
                    new_values = min_to_hr(element['time_values']['values'])
                    new_max = max(new_values)
                    new_min = min(new_values)
                    element['time_values']['values'] = new_values
                    element['time_values']['max'] = new_max
                    element['time_values']['min'] = new_min
                    element['time_values']['units'] = 'h'
                elif element['time_values']['units'] == 'day':
                    new_values = day_to_hr(element['time_values']['values'])
                    new_max = max(new_values)
                    new_min = min(new_values)
                    element['time_values']['values'] = new_values
                    element['time_values']['max'] = new_max
                    element['time_values']['min'] = new_min
                    element['time_values']['units'] = 'h'

for entry in heating_dict:
    if entry['DOI'] == '10.1039/D2RA01911F_no_1':
        print(entry)

{'DOI': '10.1039/D2RA01911F_no_1', 'heating_op': [{'subject': 'This solution', 'temp_values': {'max': None, 'min': None, 'units': None, 'values': None}, 'time_values': {'max': None, 'min': None, 'units': None, 'values': None}}, {'subject': 'Both the solutions', 'temp_values': {'max': None, 'min': None, 'units': None, 'values': None}, 'time_values': {'max': None, 'min': None, 'units': None, 'values': None}}, {'subject': 'The reaction', 'temp_values': {'max': 200.0, 'min': 200.0, 'units': '°C', 'values': [200.0]}, 'time_values': {'max': 0.5, 'min': 0.5, 'units': 'h', 'values': [0.5]}}]}


In [51]:
#Simplifying data structure
heating_dict_2 = []
for entry in heating_dict:
    if not entry['heating_op']:
        heating_dict_2.append({'DOI':entry['DOI'], 'rxn_temp_time': None})
    else:
        rxn_temp_time = []
        for element in entry['heating_op']:
            rxn_temp = element['temp_values']['values']
            rxn_time = element['time_values']['values']
            rxn_temp_time.append({'rxn_temp': rxn_temp, 'rxn_time': rxn_time})
        heating_dict_2.append({'DOI':entry['DOI'], 'rxn_temp_time':rxn_temp_time})
print(len(heating_dict_2))
for entry in heating_dict_2:
    if entry['DOI'] == '10.1039/D2RA01911F_no_1':#
        print(entry)
            

1905
{'DOI': '10.1039/D2RA01911F_no_1', 'rxn_temp_time': [{'rxn_temp': None, 'rxn_time': None}, {'rxn_temp': None, 'rxn_time': None}, {'rxn_temp': [200.0], 'rxn_time': [0.5]}]}


In [53]:
# Saving to dataframe and subsequently
rxn_temp_time_df = pd.DataFrame(columns=['DOI', 'rxn_temp_time'])
for entry in heating_dict_2:
    new_df = pd.DataFrame([entry])
    rxn_temp_time_df = pd.concat([rxn_temp_time_df, new_df], axis=0, ignore_index=True)
rxn_temp_time_df.head(10)

Unnamed: 0,DOI,rxn_temp_time
0,10.1039/D1FO03426J,"[{'rxn_temp': [200.0], 'rxn_time': [8.0]}, {'r..."
1,10.1002/cssc.201700474,"[{'rxn_temp': [10.0], 'rxn_time': [3.0]}, {'rx..."
2,10.1007/s13399-020-00839-2_no_0,"[{'rxn_temp': [200.0], 'rxn_time': [24.0]}]"
3,10.1021/acsami.1c07260,"[{'rxn_temp': [180.0], 'rxn_time': None}, {'rx..."
4,10.1002/pat.4901_no_0,"[{'rxn_temp': [80.0], 'rxn_time': [4.0]}]"
5,10.1002/pat.4901_no_1,
6,10.1016/j.apcatb.2018.03.027_no_1,
7,10.1039/D0NR09131F,"[{'rxn_temp': [200.0], 'rxn_time': [2.0]}]"
8,10.1039/C6NR06558A,"[{'rxn_temp': [200.0], 'rxn_time': None}, {'rx..."
9,10.1039/D1AY00762A,"[{'rxn_temp': [180.0], 'rxn_time': [4.0]}]"


In [54]:
#Saving to csv
# rxn_temp_time_df.to_csv(os.path.join(path,'rxn_temp_time.csv'))