# Предобработка второй БД

In [58]:
import pandas as pd
import numpy as np
import chemparse
from pymatgen.core.composition import Composition

# Read the raw spreadsheet into the working DataFrame used by every later cell.
data2 = pd.read_excel(io='data/Database_2.xlsx')

### Отбрасываем признаки

In [59]:
# drop_cols = [
#     'Animal?', 'Test indicator', 'Biochemical metric', 
#     'Publication year', 'Particle ID', 'Reference DOI', 
#     'Interference checked (Y/N)', 'Colloidal stability checked (Y/N)', 'Positive control (Y/N)',
#     'Cell age: embryonic (E), Adult (A)', 'Cell-organ/tissue source'
# ]
# data2 = data2.drop(columns = drop_cols)

### Переименовываем признаки

In [60]:
# Short snake_case names for the spreadsheet headers, grouped by theme.
column_aliases = {
    # particle description
    'Nanoparticle': 'material_type',
    'Type: Organic (O)/inorganic (I)': 'type',
    'Diameter (nm)': 'diameter',
    'Zeta potential (mV)': 'surface_charge',
    'Particle ID': 'particle_id',
    # exposure and outcome
    'Concentration μM': 'concentration',
    'Exposure time (h)': 'time',
    '% Cell viability': 'viability',
    # cell description
    'Cells': 'cell_line',
    'Cell line (L)/primary cells (P)': 'cell_line_bin',
    'Human(H)/Animal(A) cells': 'human/animal',
    'Animal?': 'animal',
    'Cell morphology': 'cell_morphology',
    'Cell age: embryonic (E), Adult (A)': 'cell_age',
    'Cell-organ/tissue source': 'cell_organ',
    # assay description
    'Test': 'test',
    'Test indicator': 'test_indicator',
    'Biochemical metric': 'biochemical_metrics',
    'Interference checked (Y/N)': 'interference_test',
    'Colloidal stability checked (Y/N)': 'colloidal_stability',
    'Positive control (Y/N)': 'positive_control',
    # publication metadata
    'Publication year': 'year',
    'Reference DOI': 'reference_doi',
}

# Rename in place; headers not listed above are left unchanged.
data2.rename(columns=column_aliases, inplace=True)

### Переименуем соединения

In [61]:
# Map material names written out in words to chemical formulas, so that the
# formula parser applied to `material_type` later in this file can read them.
mt_dict = {
    'Copper Oxide': 'CuO',
    'Zinc oxide': 'ZnO',
    'Iron oxide': 'Fe2O3',
    'Hydroxyapatite': 'Ca10(PO4)6(OH)2',
}

# Assign the result back to the column. Calling `.replace(..., inplace=True)`
# on `data2['material_type']` mutates an intermediate Series: pandas 2.x warns
# about it and, with Copy-on-Write enabled, `data2` itself is left unchanged.
data2['material_type'] = data2['material_type'].replace(mt_dict)

### Заполним пропуски

In [62]:
# Values written into individual cells, as (row label, column, value).
manual_fixes = [
    (21, 'cell_morphology', 'Endothelial'),
    (98, 'cell_age', 'A'),
    (31, 'test', 'MTT'),
    (106, 'test_indicator', 'toluylene red'),
    (69, 'biochemical_metrics', 'cell membrane integrity'),
]

for row_label, column, value in manual_fixes:
    data2.loc[row_label, column] = value

### Добавим молярную массу

In [63]:
# Every material in this list receives the same value, 12.01
# (presumably the atomic mass of carbon -- confirm).
c_lst = ['Carbon Nanotubes', 'Carbon NP', 'QDs']
data2.loc[data2['material_type'].isin(c_lst), 'molecular_weight'] = 12.01

# One hand-entered value per material.
fixed_weights = {
    'Eudragit RL': 231.29,
    'Dendrimer': 516.7,
    'PLGA': 148.11,
    'Polystyrene': 104.1,
    'Liposomes': 938.1,
    'Chitosan': 501.5,
}
for material, weight in fixed_weights.items():
    data2.loc[data2['material_type'] == material, 'molecular_weight'] = weight

# SLN rows take the mean of the column as filled so far (missing values are
# skipped by `mean`), so this step has to come after the assignments above.
column_mean = data2['molecular_weight'].mean()
data2.loc[data2['material_type'] == 'SLN', 'molecular_weight'] = column_mean

In [64]:
# Materials whose `material_type` value is a name rather than a chemical
# formula; the formula-based feature cells skip these rows.
without_formula = [
    'QDs',
    'Eudragit RL',
    'Dendrimer',
    'PLGA',
    'Polystyrene',
    'Liposomes',
    'SLN',
    'Carbon Nanotubes',
    'Carbon NP',
    'Chitosan',
]

In [65]:
# Compute the molecular weight from the chemical formula, but only for rows
# whose material actually is a formula. Rows listed in `without_formula` keep
# whatever value the column already holds. Rebuilding the whole column here
# (as was done before) reset those rows to NaN and discarded the values
# assigned to them by hand.
has_formula = ~data2['material_type'].isin(without_formula)
data2.loc[has_formula, 'molecular_weight'] = (
    data2.loc[has_formula, 'material_type']
    .apply(lambda formula: float(Composition(chemparse.parse_formula(formula)).weight))
)

### Добавим электроотрицательность

In [66]:
def _mean_electronegativity(material):
    """Return the composition's average electronegativity, or NaN when the
    material is listed in `without_formula`."""
    if material in without_formula:
        return np.nan
    composition = Composition(chemparse.parse_formula(material))
    return composition.average_electroneg


data2['electronegativity'] = data2['material_type'].apply(_mean_electronegativity)

### Сохраняем

In [67]:
# Write the preprocessed table to disk without the index column.
data2.to_excel(excel_writer='data/data2_prep.xlsx', index=False)