In [1]:
import numpy as np
import pandas as pd

### Functions to help filter data

In [None]:
# Helper functions to help in data cleaning process

def get_sample_idxs(dataframe, features, values):
    """Return the index labels of rows whose ``features`` equal ``values``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to search.
    features : column label or list of column labels
        Column(s) to compare. A single label is treated as a one-column
        selection.
    values : scalar or sequence
        Value(s) to match, positionally aligned with ``features``.

    Returns
    -------
    pandas.Index
        Index labels of every row where all selected columns match.
    """
    # A bare label would select a Series, on which ``.all(axis=1)`` raises;
    # normalise to a one-column frame with a one-element value list instead.
    if not pd.api.types.is_list_like(features):
        features = [features]
        values = [values]

    condition = (dataframe.loc[:, features] == np.array(values)).all(axis=1)

    return dataframe[condition].index

def drop_empty_samples(dataframe, drop_empty_samples_from=None, **kwargs):
    """Return ``dataframe`` without rows that are empty in any given feature.

    A cell counts as empty when it is missing (NaN/None) or an empty string.
    The input frame is left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter.
    drop_empty_samples_from : column label or list of column labels
        Feature(s) that must be populated. May be passed positionally or by
        keyword.
    **kwargs
        Ignored; accepted so the function can be used inside
        ``dataframe_clean_pipeline``, which forwards one shared keyword set
        to every step.

    Raises
    ------
    KeyError
        If ``drop_empty_samples_from`` is not supplied.
    """
    if drop_empty_samples_from is None:
        raise KeyError('drop_empty_samples_from')

    if pd.api.types.is_list_like(drop_empty_samples_from):
        features = list(drop_empty_samples_from)
    else:
        features = [drop_empty_samples_from]

    # Build a mask instead of replacing values in place: the chained
    # ``dataframe[feature].replace(..., inplace=True)`` either mutates the
    # caller's frame or does nothing at all, depending on the pandas version.
    subset = dataframe[features]
    is_empty = subset.isna() | subset.eq('')

    return dataframe[~is_empty.any(axis=1)]

# Note : The samples to drop must be given as one list per feature
def drop_samples(dataframe, **kwargs):
    """Drop every row whose feature value equals one of the listed samples.

    Expected keyword arguments:
      drop_samples_from : list of column labels to inspect
      drop_samples      : list of lists; entry ``i`` holds the values to
                          remove from column ``drop_samples_from[i]``
      suppress_output   : when False, report each dropped sample

    Returns the filtered frame; the frame passed in is not modified.
    """
    print('\n')

    remaining = dataframe
    for position, column in enumerate(kwargs['drop_samples_from']):
        for unwanted in kwargs['drop_samples'][position]:
            # Index labels of the rows that currently carry this value
            matching_rows = remaining.loc[remaining[column] == unwanted].index
            dropped_count = len(matching_rows)

            if kwargs['suppress_output'] == False:
                print(f' --> Dropping sample {unwanted}')
                print(f' --> Number of samples dropped {dropped_count}')

            remaining = remaining.drop(index=matching_rows, axis=0)

    return remaining

def drop_samples_by_index(dataframe, sample_idxs):
    """Return ``dataframe`` without the rows labelled by ``sample_idxs``."""
    return dataframe.drop(index=sample_idxs, axis=0)

def drop_samples_on_count(dataframe, features, count_cutoff, suppress_output):
    """Drop rows whose feature value occurs ``count_cutoff`` times or fewer.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter.
    features : iterable of column labels
        Columns processed one after another; counts for later columns are
        taken on the frame as already filtered by earlier ones.
    count_cutoff : int
        Values with an occurrence count ``<= count_cutoff`` are removed.
    suppress_output : bool
        Forwarded to ``drop_samples``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    for feature in features:
        counts = dataframe[feature].value_counts()

        # Select rare values directly from the counts; a feature with no
        # rare values simply yields an empty list instead of raising.
        samples_to_drop = list(counts.index[counts.values <= count_cutoff])
        if not samples_to_drop:
            continue

        # ``drop_samples`` only takes keyword arguments and expects one list
        # of samples per feature.
        dataframe = drop_samples(dataframe,
                                 drop_samples_from=[feature],
                                 drop_samples=[samples_to_drop],
                                 suppress_output=suppress_output)

    return dataframe

def drop_samples_on_criterion(dataframe, **kwargs):
    """Drop every row whose feature values satisfy a user-supplied criterion.

    Expected keyword arguments:
      drop_samples_on_criterion_from : column label(s) whose values are
                                       handed to the criterion
      drop_criterion                 : callable taking one row of those
                                       values and returning True to drop
      suppress_output                : optional; when falsy (the default),
                                       each dropped sample is reported

    Returns the filtered frame.
    """
    print(' ----> Dropping samples based on criterion.')

    features = kwargs['drop_samples_on_criterion_from']
    drop_criterion = kwargs['drop_criterion']
    suppress_output = kwargs.get('suppress_output', False)

    # Iterate over a snapshot of the values taken before any row is dropped
    for sample in dataframe[features].values:

        if not drop_criterion(sample):
            continue

        # Locate all rows that carry exactly this sample
        sample_idxs = get_sample_idxs(dataframe, features, sample)
        if len(sample_idxs) == 0:
            # Duplicate of a sample that an earlier iteration already removed
            continue

        dataframe = drop_samples_by_index(dataframe, sample_idxs)
        if not suppress_output:
            print(f' --> Drop sample {sample}')
            print(f' --> Number of samples dropped {len(sample_idxs)}')

    # Report the shape of the frame being processed, not of a notebook global
    print(f' ----> Perovskites 3D shape {dataframe.shape}')

    return dataframe

def split_column_based_on_delimiter(dataframe, **kwargs):
    """Split one string column on a delimiter and splice the parts in after it.

    Expected keyword arguments:
      split_col     : label of the column to split (it is kept as well)
      delimiter     : separator passed to ``str.split``
      fill_value    : value used where a row produced fewer parts
      new_col_names : names for the part columns, in order

    Returns a new frame with the part columns placed directly after
    ``split_col``.
    """
    source_col = kwargs['split_col']

    # Expand the parts into integer-labelled columns 0, 1, 2, ...
    pieces = dataframe[source_col].str.split(kwargs['delimiter'], expand=True)
    pieces = pieces.fillna(value=kwargs['fill_value'])

    # Everything up to and including the source column stays on the left
    boundary = list(dataframe).index(source_col) + 1
    left_part = dataframe.iloc[:, :boundary]
    right_part = dataframe.iloc[:, boundary:]
    widened = pd.concat([left_part, pieces, right_part], axis=1)

    # Give the integer-labelled part columns their final names
    for position, label in enumerate(kwargs['new_col_names']):
        widened = widened.rename(columns={position: label})

    return widened

def insert_features(dataframe, col_idx, col_name, col_data):
    """Insert ``col_data`` as column ``col_name`` at position ``col_idx``.

    The frame is modified in place and also returned for convenience.
    """
    dataframe.insert(loc=col_idx, column=col_name, value=col_data)
    return dataframe

def insert_properties(dataframe, **kwargs):
    """Insert one coefficient-weighted ion-property column per dictionary.

    Expected keyword arguments:
      ion_cols        : columns holding the ion labels
      ion_coeff_cols  : columns holding the matching coefficients, in the
                        same order as ``ion_cols``
      prop_dict_list  : property dictionaries; each maps ion label -> value
                        and carries the output column name under ``'prop'``
      insert_col_loc  : column position after which the new columns go

    For each dictionary the new column is ``sum(property * coefficient)``
    over the ion columns of a row. Ions missing from a dictionary contribute
    zero. Returns the frame with the new columns inserted.
    """
    # Get the array of ion coefficients
    ions_coeffs_array = dataframe[kwargs['ion_coeff_cols']].values.astype(np.float32)

    # Get the array of ions. Convert with ``str`` rather than a fixed width
    # so labels are stripped of whitespace before, not after, any truncation.
    ions_array = np.char.strip(dataframe[kwargs['ion_cols']].values.astype(str))

    # Iterate through each property dictionary
    for i, prop_dict in enumerate(kwargs['prop_dict_list']):

        # Property value of each ion in each row (0 where the ion is unknown)
        ions_prop_array = np.zeros(ions_array.shape, dtype=np.float32)

        for ion, prop_value in prop_dict.items():
            # 'prop' names the output column and is not an ion; skip it by
            # key instead of assuming it is the first entry.
            if ion == 'prop':
                continue
            ions_prop_array[ions_array == ion] = prop_value

        # Multiply property value with coefficient and reduce along axis=1
        col_data = (ions_prop_array * ions_coeffs_array).sum(axis=1)

        dataframe = insert_features(dataframe, i + 1 + kwargs['insert_col_loc'], prop_dict['prop'], col_data)

    return dataframe

# This function takes as input all the dataframe processing functions
def dataframe_clean_pipeline(*funcs):
    """Compose dataframe-processing functions into a single callable.

    Each function must accept ``(dataframe, **kwargs)`` and return the
    processed dataframe. The returned callable runs them in the given order,
    forwarding the same keyword arguments to every step.
    """

    def func_args(dataframe, **kwargs):
        """Run every pipeline step on ``dataframe`` and return the result."""
        result = dataframe
        for step in funcs:
            result = step(result, **kwargs)
        return result

    return func_args

def get_feature_stats(dataframe, col):
    """Print how often each distinct value occurs in column ``col``."""
    occurrence_counts = dataframe[col].value_counts()
    print(occurrence_counts)

### Filter for 3D perovskites 

In [None]:
# 
# Screening for only 3D perovskites
#

dataset_dir = 'original_dataset.csv'
dataframe = pd.read_csv(dataset_dir, header=0, low_memory=False)
print('Initial shape')
print(f' --> Perovskites 3D shape {dataframe.shape}')

# Required value of every screening column.
# Module : a cell composed of connected individual sub cells (to be confirmed)
module = False
perovskite_single_crystal = False
# 0D : perovskite quantum dots
perovskite_0D = False
# 2D : layered perovskite with a large A site cation
perovskite_2D = False
# 2D3D mixture : reduced-dimensional perovskite that is not a pure 2D perovskite
perovskite_2D3D = False
# 3D : standard 3D perovskite with ABC3 structure
perovskite_3D = True
# 3D with 2D capping layer : bulk is 3D with a thin 2D capping layer on top
perovskite_3D_with_2D_capping_layer = False
# ABC3 structure is required
perovskite_with_ABC3_structure = True
# Perovskite-inspired cells are not actual perovskites
perovskite_inspired = False
# Dimensionality of the perovskite should be 3
perovskite_dim = '3'

# Map each screening column to the value a sample must have to be kept
screening_criteria = {
    'Module': module,
    'Perovskite_single_crystal': perovskite_single_crystal,
    'Perovskite_dimension_0D': perovskite_0D,
    'Perovskite_dimension_2D': perovskite_2D,
    'Perovskite_dimension_2D3D_mixture': perovskite_2D3D,
    'Perovskite_dimension_3D': perovskite_3D,
    'Perovskite_dimension_3D_with_2D_capping_layer': perovskite_3D_with_2D_capping_layer,
    'Perovskite_composition_perovskite_ABC3_structure': perovskite_with_ABC3_structure,
    'Perovskite_composition_perovskite_inspired_structure': perovskite_inspired,
    'Perovskite_dimension_list_of_layers': perovskite_dim,
}

# A sample is kept only when it satisfies every constraint
keep_row = pd.Series(True, index=dataframe.index)
for column_name, required_value in screening_criteria.items():
    keep_row = keep_row & (dataframe[column_name] == required_value)

perovskites_3D = dataframe[keep_row]

print('Choosing only 3D perovskites')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

In [None]:
# Inspect how often each quenching-media additive compound is reported
quenching_additives_col = 'Perovskite_deposition_quenching_media_additives_compounds'
perovskites_3D[quenching_additives_col].value_counts()

In [None]:
# Keep only the perovskite, deposition, substrate, ETM, HTM and JV columns
# of the 3D perovskite samples and store them in a new CSV file.

#
# Perovskite variables
#

perovskite_variables = [
    'Perovskite_single_crystal',
    'Perovskite_dimension_0D',
    'Perovskite_dimension_2D',
    'Perovskite_dimension_2D3D_mixture',
    'Perovskite_dimension_3D',
    'Perovskite_dimension_3D_with_2D_capping_layer',
    'Perovskite_dimension_list_of_layers',
    'Perovskite_composition_perovskite_ABC3_structure',
    'Perovskite_composition_perovskite_inspired_structure',
    'Perovskite_composition_long_form',
    'Perovskite_composition_a_ions',
    'Perovskite_composition_a_ions_coefficients',
    'Perovskite_composition_b_ions',
    'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions',
    'Perovskite_composition_c_ions_coefficients',
    'Perovskite_band_gap',
]

#
# Deposition variables
#

# Solvent annealing time and temperature are not available for most samples (only around 100 have them).
# Quenching media volume and additives are not available for most samples (only around 100 have them).

deposition_variables = [
    'Perovskite_deposition_number_of_deposition_steps',
    'Perovskite_deposition_procedure',
    'Perovskite_deposition_solvents',
    'Perovskite_deposition_solvents_mixing_ratios',
    'Perovskite_deposition_quenching_induced_crystallisation',
    'Perovskite_deposition_quenching_media',
    'Perovskite_deposition_thermal_annealing_temperature',
    'Perovskite_deposition_thermal_annealing_time',
    'Perovskite_deposition_solvent_annealing',
    'Perovskite_deposition_solvent_annealing_timing',
]

#
# Substrate variables
#

# Assumptions (substrate columns other than the stack sequence are not selected):
# 1. Supplier and brand name do not affect the quality of the product
# 2. Substrate thickness is not mentioned in the majority of papers
# 3. Substrate area is mentioned for only 306 data points
# 4. Substrate deposition procedure is unknown for 39010 data points
# 5. Substrate surface roughness is an empty column
# 6. Substrate etching procedure is mentioned for only 540 data points
# 7. Substrate cleaning procedure is given as a free-text string

substrate_variables = ['Substrate_stack_sequence']

#
# ETM variables
#

etm_variables = ['ETL_stack_sequence']

#
# HTM variables
#

htm_variables = ['HTL_stack_sequence']

#
# JV properties
#

JV_variables = [
    'JV_average_over_n_number_of_cells',
    'JV_test_atmosphere',
    'JV_light_intensity',
    'JV_light_spectra',
    'JV_light_masked_cell',
    'JV_test_temperature',
    'JV_scan_speed',
    'JV_reverse_scan_Voc',
    'JV_reverse_scan_Jsc',
    'JV_reverse_scan_FF',
    'JV_reverse_scan_PCE',
]

# Column order of the stored dataset follows the group order below
keep_cols = (perovskite_variables
             + deposition_variables
             + substrate_variables
             + etm_variables
             + htm_variables
             + JV_variables)

perovskites_3D = perovskites_3D.loc[:, keep_cols]
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

### Filter for Perovskite properties

In [None]:
# Load the dataset

dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)
print('Initial shape')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

# Select which property the samples are filtered for
filter_for_band_gap = False
filter_for_pl_max = False
filter_for_fill_factor = True

if filter_for_band_gap:

    # NOTE(review): 'Perovskite_band_gap_estimation_basis' and
    # 'Perovskite_band_gap_graded' are not among the columns that the
    # column-selection cell writes to PSC_efficiencies_dataset.csv, so this
    # branch fails with a KeyError until they are added there.

    # Drop empty samples
    perovskites_3D = drop_empty_samples(perovskites_3D,
                                        drop_empty_samples_from=['Perovskite_band_gap'])

    # Drop samples which do not have an estimation basis provided
    perovskites_3D = drop_empty_samples(perovskites_3D,
                                        drop_empty_samples_from=['Perovskite_band_gap_estimation_basis'])

    # Drop samples with band_gap_graded = True, i.e. samples whose band gap
    # varies with the vertical position in the perovskite layer.
    samples_to_drop = [[True]]
    perovskites_3D = drop_samples(perovskites_3D,
                                  drop_samples_from=['Perovskite_band_gap_graded'],
                                  drop_samples=samples_to_drop,
                                  suppress_output=True)

    print('Choosing only those samples for which band gap information was recorded.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv('3D_perovskites.csv', header=True, index=False)

# Filter for samples which have steady state pl max data.
if filter_for_pl_max:

    # NOTE(review): 'Perovskite_pl_max' is not among the columns written to
    # PSC_efficiencies_dataset.csv by the column-selection cell either.
    # The feature list has to be passed by keyword: drop_empty_samples reads
    # it from its keyword arguments.
    perovskites_3D = drop_empty_samples(perovskites_3D,
                                        drop_empty_samples_from=['Perovskite_pl_max'])

    print('Choosing only those samples for which steady state pl max information was recorded.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv('3D_perovskites.csv', header=True, index=False)

if filter_for_fill_factor:

    #######################################################################################################

    # For analysis we consider all JV measurements done at
    # 1. Temp : 25 C
    # 2. Light intensity : 100 mW/cm2
    # 3. Light spectra : AM1.5
    # 4. Light not passed through a mask

    # Feature : JV_average_over_n_number_of_cells (SKIP)
    # NOTE(review): marked SKIP, but it is used below as a row filter.
    perovskites_3D = drop_empty_samples(perovskites_3D,
                                        drop_empty_samples_from=['JV_average_over_n_number_of_cells'])

    perovskites_3D = perovskites_3D[perovskites_3D['JV_average_over_n_number_of_cells'] == 1]

    print('\n')
    print(' --> Dropped samples with number of cells not equal to 1')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv('3D_perovskites.csv', header=True, index=False)

    #######################################################################################################

    # Feature : JV_certified_values (SKIP)
    # Value : True/False
    # Note : Only 75 samples have certified JV values

    # Feature : JV_certified_institute (SKIP)

    # Feature : JV_storage_age_of_cell (SKIP)
    # Note : Only around 30 samples have mentioned storage age of cell

    # Feature : JV_storage_atmosphere (SKIP)
    # Note : Only 52 samples have mentioned storage conditions

    # Feature : JV_storage_relative_humidity (SKIP)
    # Note : Only 16 samples have mentioned humidity conditions

    #######################################################################################################

    # Feature : JV_test_atmosphere (USE)
    # Note : Atmosphere in which JV measurement is performed
    # Note : Remove all samples with 'Unknown' - 21929 ; 'Vacuum' - 33 ; 'Near-space' - 4 ; 'Water' - 1
    # Ambient - Relative humidity unknown
    # Dry air - Low relative humidity

    air_like_atmospheres = ['Air', 'Ambient', 'Dry air']
    allowed_atmospheres = air_like_atmospheres + ['N2']

    # Copy so the column insert below acts on an independent frame
    perovskites_3D = perovskites_3D[perovskites_3D['JV_test_atmosphere'].isin(allowed_atmospheres)].copy()

    # This cell writes back to the file it reads, so on a re-run the derived
    # column already exists; drop it first because insert() refuses duplicates.
    if 'Air or N2' in perovskites_3D.columns:
        perovskites_3D = perovskites_3D.drop(columns='Air or N2')

    # Insert another column where the categories are just Air and N2
    insert_col_loc = list(perovskites_3D.columns).index('JV_test_atmosphere')
    air_or_n2 = np.where(perovskites_3D['JV_test_atmosphere'].isin(air_like_atmospheres), 'Air', 'N2')
    perovskites_3D.insert(insert_col_loc, 'Air or N2', air_or_n2)

    print('\n')
    print(' --> Dropped samples with JV test atmosphere Unknown, Vacuum, Near-space or Water.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Feature : JV_test_relative_humidity (SKIP)
    # Note : Only 268 samples have relative humidity specified

    #######################################################################################################

    # Feature : JV_test_temperature (SKIP)
    # Note : Assume standard room temperature of 25C if temperature is not controlled or not known
    # NOTE(review): marked SKIP, but it is used below as a row filter.

    # Every recorded temperature other than 25 is excluded. Samples without a
    # recorded temperature are kept. Building the list with a filter (rather
    # than list.remove) also works when no sample was measured at 25 C.
    recorded_test_temps = perovskites_3D['JV_test_temperature'].value_counts().index
    test_temps_exclude = [temp for temp in recorded_test_temps if temp != 25]

    perovskites_3D = perovskites_3D[~perovskites_3D['JV_test_temperature'].isin(test_temps_exclude)]

    print('\n')
    print(' --> Dropped samples where temperature is not 25 C')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Feature : JV_light_source_type (SKIP)
    # Note : Only provided for around 360 samples

    # Feature : JV_light_source_brand_name (SKIP)

    # Feature : JV_light_source_simulator_class (SKIP)
    # Note :

    #######################################################################################################

    # Feature : JV_light_intensity (SKIP)
    # Note : Keep only 100 mW/cm2

    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_intensity'] == 100]

    print('\n')
    print(' --> Dropped samples where light intensity is not 100 mW/cm2.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Feature : JV_light_spectra (SKIP)
    # Note : Keep only AM 1.5

    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_spectra'] == 'AM 1.5']

    print('\n')
    print(' --> Dropped samples where light spectra is not AM 1.5.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Feature : JV_light_wavelength_range (SKIP)

    # Feature : 'JV_light_illumination_direction' (SKIP)

    #######################################################################################################

    # Feature : JV_light_masked_cell
    # Note : TRUE if the cell is illuminated through a mask with an opening that is smaller than the total cell area

    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_masked_cell'] == False]

    print('\n')
    print(' --> Dropped samples where light passed through a mask.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Feature : JV_scan_speed
    # Note : The speed of the potential sweep during the IV measurement
    # Filtering on a recorded scan speed is intentionally disabled.

    #######################################################################################################

    # Keep samples that have reverse FF calculated
    # When the scan direction is not stated it is considered to be the reverse direction

    perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=['JV_reverse_scan_FF'])
    perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=['JV_reverse_scan_Jsc'])
    perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=['JV_reverse_scan_Voc'])

    print('\n')
    print(' --> Dropped samples without reverse scan FF, Jsc and Voc data')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

    #######################################################################################################

    # Dropping samples without 'JV_hysteresis_index' is intentionally disabled.

    #######################################################################################################

    # Write the input file back only once, after every filter has succeeded,
    # so a failing step cannot leave a partially filtered dataset behind.
    perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

### Perovskite Element Properties

In [None]:
# Property dictionaries
#
# Each dictionary maps an ion label to one property value. The entry under
# 'prop' is not an ion: it holds the name of the dataframe column that
# insert_properties creates from the dictionary.
# NOTE: 'prop' is deliberately the first key of every dictionary; keep it first.
# NOTE(review): units are not stated anywhere in this notebook — confirm them
# against the original data sources before reusing these values.

# 1. Ionic radius of the A, B and C site ions (marked "All correct" by the author)
# NOTE(review): presumably in angstrom — TODO confirm

A_ion_radius_dict = {'prop':'A_ion_radius',
                     'MA':2.17, 
                     'FA':2.53,
                     'Cs':1.74,
                     'Rb':1.61,
                     'K':1.51}

B_ion_radius_dict = {'prop':'B_ion_radius',
                     'Ca':1.00,
                     'Sr':1.26,
                     'Ba':1.42,
                     'Ge':0.87, 
                     'Sn':1.15,
                     'Pb':1.19}

C_ion_radius_dict = {'prop':'C_ion_radius',
                     'Cl':1.81,
                     'Br':1.96,
                     'I':2.20}

# 2. Atomic weight (marked "All correct" by the author)
# MA and FA are molecular cations, so their entries are molecular weights.

A_atomic_wt_dict = {'prop':'A_atom_wt',
                    'MA':32.07, 
                    'FA':45.06,
                    'Cs':132.91, 
                    'Rb':85.47, 
                    'K':39.10} 

B_atomic_wt_dict = {'prop':'B_atom_wt',
                    'Ca': 40.078,
                    'Sr': 87.62,
                    'Ba': 137.33,
                    'Ge': 72.63,
                    'Sn': 118.71,
                    'Pb': 207.2}

C_atomic_wt_dict = {'prop':'C_atom_wt',
                    'Cl': 35.45,
                    'Br': 79.90,
                    'I': 126.90}

# 3. EA, presumably electron affinity (marked "All correct" by the author)
# NOTE(review): presumably in kJ/mol — TODO confirm

A_EA_dict = {'prop':'A_EA',
             'FA':8.60,
             'MA':8.68,
             'Cs':45.49,
             'Rb':46.89,
             'K':48.36}

B_EA_dict = {'prop':'B_EA',
             'Ca':-156,
             'Sr':-168,
             'Ba':-52,
             'Ge':120,
             'Sn':121,
             'Pb':110}

C_EA_dict = {'prop':'C_EA',
             'Cl':348.8, 
             'Br':324.6,
             'I':295.3}


# 4. IE, presumably ionization energy (marked "All correct" by the author)
# NOTE(review): presumably in kJ/mol — TODO confirm

A_IE_dict = {'prop':'A_IE',
             'FA':849.10,
             'MA':926.30,
             'Cs':375.70,
             'Rb':403,
             'K':418.80}

B_IE_dict = {'prop':'B_IE',
             'Ca':589.9,
             'Sr':549.5,
             'Ba':502.9,
             'Ge':762,
             'Sn':708.6,
             'Pb':715.6}

C_IE_dict = {'prop':'C_IE',
             'Cl':1251.20, 
             'Br':1139.90,
             'I':1008.40}

# 5. EN, presumably electronegativity (marked "All correct" by the author)
# NOTE(review): scale is not stated — TODO confirm

A_EN_dict = {'prop':'A_EN',
             'MA':2.55,
             'FA':2.57,
             'Cs':0.79,
             'Rb':0.82,
             'K':0.82}

B_EN_dict = {'prop':'B_EN',
              'Ca':1.00,
              'Sr':0.95,
              'Ba':0.89,
              'Ge':2.01,
              'Sn':1.96,
              'Pb':2.33} 

C_EN_dict = {'prop':'C_EN',
             'Cl':3.16, 
             'Br':2.96,
             'I':2.66}

# 6. Polarizability for B site ions
# Source : https://www.tandfonline.com/doi/full/10.1080/00268976.2018.1535143
# Only the B site has a polarizability dictionary in this cell.

B_pol_dict = {'prop':'B_pol',
               'Ca':160.8, 
               'Sr':197.2,
               'Ba':272,
               'Ge':40,
               'Sn':53,
               'Pb':47}


# Notes on the molecular A site cations:
# Formamidinium (FA) : HC[NH2]2
# Formamidinium [NH2(CH)NH2]+ can be confused with methyl hydrazine, which also has the formula CN2H6
# Methylammonium (MA) [(CH3)NH3]+  -> CNH5

### Perovskite design choice database

In [None]:
# Load the perovskite efficiencies dataset
#
# NOTE: this cell reads 'PSC_efficiencies_dataset.csv' and writes its result back to the
# same path after every stage, so re-running it operates on the already-processed file.

dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)
print('Initial shape')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

#######################################################################################################
# A ion composition processing
#######################################################################################################

# A site ions separated by ;
# In case of layered structures ions are separated by |
# The dataset only includes ions that go into the perovskite structure.
# Ions in secondary phases, amorphous grain boundaries,
# or which disappear during synthesis instead come under dopants and additives.

features = ['Perovskite_composition_a_ions',
            'Perovskite_composition_a_ions_coefficients']

# Step 1 : Drop empty samples
# The replacement is assigned back explicitly: calling replace(..., inplace=True) on a
# column selection is a chained in-place update, which pandas deprecates and which has
# no effect on the parent frame under Copy-on-Write.
perovskites_3D[features] = perovskites_3D[features].replace('', np.nan)
perovskites_3D = perovskites_3D.dropna(subset=features)

# Step 2 : Retaining only species for which AE1 was trained
# Exact-match whitelist of A-site compositions (single ions and the listed mixtures).
a_site_compositions = ['MA',
                       'FA',
                       'Cs',
                       'Rb',
                       'K',
                       'FA; MA',
                       'Cs; FA',
                       'Cs; MA',
                       'Cs; Rb',
                       'Cs; K',
                       'Cs; FA; MA',
                       'Cs; FA; Rb',
                       'Cs; FA; MA; Rb']
perovskites_3D = perovskites_3D[perovskites_3D[features[0]].isin(a_site_compositions)]

# Step 3 : Drop samples which do not have entries in two columns
# (kept as a safety net; Step 1 already removed rows with a missing entry)
perovskites_3D = perovskites_3D.dropna(subset=features)

print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 4 : Split column based on delimiter
new_col_names = ['a_1',
                 'a_2',
                 'a_3',
                 'a_4']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)


new_col_names = ['a_1_coeff',
                 'a_2_coeff',
                 'a_3_coeff',
                 'a_4_coeff']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 5 : Drop rows where the coefficients do not sum up to 1
# The sum is compared with a small absolute tolerance rather than ==, because sums of
# decimal fractions (e.g. 0.7 + 0.2 + 0.1) are not always exactly 1.0 in floating point.
ion_coeff_cols = ['a_1_coeff', 'a_2_coeff', 'a_3_coeff', 'a_4_coeff']
coeff_sum = sum(perovskites_3D[col].astype(float) for col in ion_coeff_cols)
perovskites_3D = perovskites_3D[np.isclose(coeff_sum, 1.0, rtol=0.0, atol=1e-6)]

# Step 6 : Finally inserting the property features

insert_col_loc = list(perovskites_3D.columns).index('a_4_coeff')
ion_cols = ['a_1', 'a_2', 'a_3', 'a_4']

# A site property dictionaries
A_site_prop_dict_list = [A_ion_radius_dict,
                         A_atomic_wt_dict,
                         A_EA_dict,
                         A_IE_dict,
                         A_EN_dict]

perovskites_3D = insert_properties(perovskites_3D,
                                   ion_coeff_cols=ion_coeff_cols,
                                   ion_cols=ion_cols,
                                   prop_dict_list=A_site_prop_dict_list,
                                   insert_col_loc=insert_col_loc)

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

In [None]:
#
# B ion composition processing
#
# NOTE: reads 'PSC_efficiencies_dataset.csv' and writes back to the same path after each stage.

# Notes : Additives and dopants to C site. Example is Cl in MAPbI3.
# Notes : Cl does not go into the perovskite structure.
# Notes : Hence should not be considered as C site anion but instead view it as a dopant/additive.

dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)

#######################################################################################################

features = ['Perovskite_composition_b_ions',
            'Perovskite_composition_b_ions_coefficients']

# Ca, Sr, Ba, Ge, Sn, Pb

# Drop empty samples
# Assign the replacement back explicitly; replace(..., inplace=True) on a column selection
# is a chained in-place update that does not reach the parent frame under Copy-on-Write.
perovskites_3D[features] = perovskites_3D[features].replace('', np.nan)
perovskites_3D = perovskites_3D.dropna(subset=features)

# Retain specified samples (exact-match whitelist of B-site compositions)
b_site_compositions = ['Pb',
                       'Sn',
                       'Ge',
                       'Pb; Sn',
                       'Ba; Pb',
                       'Pb; Sr',
                       'Ge; Sn']
perovskites_3D = perovskites_3D[perovskites_3D[features[0]].isin(b_site_compositions)]

# Drop samples which do not have entries in two columns
# (kept as a safety net; empty rows were already removed above)
perovskites_3D = perovskites_3D.dropna(subset=features)

print('\n')
print(' Dropping empty samples and specified samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

#######################################################################################################

new_col_names = ['b_1', 'b_2']
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

new_col_names = ['b_1_coeff', 'b_2_coeff']
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

#######################################################################################################

# Dropping rows where the coefficients do not sum up to 1
# Compared with a small absolute tolerance instead of ==, since sums of decimal
# fractions are not always exactly 1.0 in floating point.
ion_coeff_cols = ['b_1_coeff', 'b_2_coeff']
coeff_sum = sum(perovskites_3D[col].astype(float) for col in ion_coeff_cols)
perovskites_3D = perovskites_3D[np.isclose(coeff_sum, 1.0, rtol=0.0, atol=1e-6)]

#######################################################################################################

# Finally inserting the property features
insert_col_loc = list(perovskites_3D.columns).index('b_2_coeff')
ion_cols = ['b_1', 'b_2']

# B site property dictionaries
B_site_prop_dict_list = [B_ion_radius_dict,
                         B_atomic_wt_dict,
                         B_EA_dict,
                         B_IE_dict,
                         B_EN_dict,
                         B_pol_dict]

perovskites_3D = insert_properties(perovskites_3D,
                                   ion_coeff_cols=ion_coeff_cols,
                                   ion_cols=ion_cols,
                                   prop_dict_list=B_site_prop_dict_list,
                                   insert_col_loc=insert_col_loc)

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

In [None]:
#
# C ion composition processing
#
# NOTE: reads 'PSC_efficiencies_dataset.csv' and writes back to the same path after each stage.

dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)

#######################################################################################################

features = ['Perovskite_composition_c_ions',
            'Perovskite_composition_c_ions_coefficients']

# Cl, Br, I
# Values to remove, one list per entry of `features` (ions first, then coefficients).
# The variable keeps its historical `b_` prefix although it applies to the C site.
b_samples_to_drop = [['O', '(SCN); I', 'Br; F; I', '(BF4); I', 'S', 'I; SCN', 'I | I'],
                     ['x', 'x; x']]

# Drop empty samples
# Assign the replacement back explicitly; replace(..., inplace=True) on a column selection
# is a chained in-place update that does not reach the parent frame under Copy-on-Write.
perovskites_3D[features] = perovskites_3D[features].replace('', np.nan)
perovskites_3D = perovskites_3D.dropna(subset=features)

# Drop specified samples
perovskites_3D = drop_samples(perovskites_3D,
                              drop_samples_from=features,
                              drop_samples=b_samples_to_drop,
                              suppress_output=False)

# Drop samples which do not have entries in two columns
# (kept as a safety net; empty rows were already removed above)
perovskites_3D = perovskites_3D.dropna(subset=features)

print('\n')
print(' Dropping empty samples and specified samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

#######################################################################################################

new_col_names = ['c_1',
                 'c_2']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

new_col_names = ['c_1_coeff',
                 'c_2_coeff']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)


#######################################################################################################

# Dropping rows where the coefficients do not sum up to 3
# Compared with a small absolute tolerance instead of ==, since sums of decimal
# fractions (e.g. 2.55 + 0.45) are not always exactly 3.0 in floating point.
ion_coeff_cols = ['c_1_coeff', 'c_2_coeff']
coeff_sum = sum(perovskites_3D[col].astype(float) for col in ion_coeff_cols)
perovskites_3D = perovskites_3D[np.isclose(coeff_sum, 3.0, rtol=0.0, atol=1e-6)]

#######################################################################################################

# Finally inserting the property features
insert_col_loc = list(perovskites_3D.columns).index('c_2_coeff')
ion_cols = ['c_1', 'c_2']

# C site property dictionaries
C_site_prop_dict_list = [C_ion_radius_dict,
                         C_atomic_wt_dict,
                         C_EA_dict,
                         C_IE_dict,
                         C_EN_dict]

perovskites_3D = insert_properties(perovskites_3D,
                                   ion_coeff_cols=ion_coeff_cols,
                                   ion_cols=ion_cols,
                                   prop_dict_list=C_site_prop_dict_list,
                                   insert_col_loc=insert_col_loc)

# Divide all calculated properties by 3
# The C-site coefficients sum to 3 (enforced above), so the inserted values are
# presumably coefficient-weighted sums that need rescaling to a per-ion average
# - confirm against insert_properties.
selected_columns = ['C_ion_radius',
                    'C_atom_wt',
                    'C_EN',
                    'C_IE',
                    'C_EA']

perovskites_3D[selected_columns] = perovskites_3D[selected_columns] / 3

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

### Insert Cell Properties

In [None]:
# Derive two geometric descriptors from the per-site ionic radii and insert them
# right after the 'C_EN' column.
dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)

insert_col_loc = list(perovskites_3D.columns).index('C_EN')

# Tolerance factor : (r_A + r_C) / (sqrt(2) * (r_B + r_C))
tf = (perovskites_3D.A_ion_radius + perovskites_3D.C_ion_radius)/((2**0.5)*(perovskites_3D.B_ion_radius + perovskites_3D.C_ion_radius))
# Octahedral factor : r_B / r_C
of = perovskites_3D.B_ion_radius/perovskites_3D.C_ion_radius

perovskites_3D = insert_features(perovskites_3D, insert_col_loc + 1, 'Tolerance Factor', tf)
perovskites_3D = insert_features(perovskites_3D, insert_col_loc + 2, 'Octahedral Factor', of)

print('\n')
print('Inserted cell properties.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
# index=False, like every other save of this file: writing the index would add a
# spurious 'Unnamed: 0' column the next time the CSV is read back.
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

### Select the number of reaction steps

In [None]:
#######################################################################################################
# Feature : Perovskite_deposition_number_of_deposition_steps
#######################################################################################################

# NOTE: this cell works on the in-memory `perovskites_3D` left by the previous cell
#       (it does not re-read the CSV), so it must be run after that cell.

# Note : Only single-step depositions are retained below; the earlier 1-or-2-step filter
#        is kept commented out for reference.
# Note : Depositing the perovskite first and then crystallization is considered a 2 step process.
# Note : Every step considered to have its own thermal history.
# Note : Spin coating with antisolvent is considered as a single step.

variable = ['Perovskite_deposition_number_of_deposition_steps']

# Step 1 : Drop empty samples

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 2 : Keep/Drop specific samples

#perovskites_3D = perovskites_3D[(perovskites_3D[variable[0]] == 1) |
#                                (perovskites_3D[variable[0]] == 2)]

perovskites_3D = perovskites_3D[(perovskites_3D[variable[0]] == 1)]

print('\n')
# The message now matches the active filter, which keeps 1-step depositions only.
print('Retained 1 step deposition techniques.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

#######################################################################################################
# Feature : Perovskite_deposition_procedure
#######################################################################################################

# Note : Thermal annealing is generally not considered as an individual reaction step.
# Note : Antisolvent treatment is considered a different step.

variable = ['Perovskite_deposition_procedure']

# Step 1 : Drop empty samples

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 2 : Drop specific samples
# (Step 3 would remove these rows as well; this call is kept for the per-sample
#  drop count that drop_samples prints.)

samples_to_drop = [['Spin-coating >> Gas reaction']]

perovskites_3D = drop_samples(perovskites_3D,
                              drop_samples_from=variable,
                              drop_samples=samples_to_drop,
                              suppress_output=False)

# Step 3 : Choose only spin-coating as the deposition procedure

perovskites_3D = perovskites_3D[perovskites_3D['Perovskite_deposition_procedure'] == 'Spin-coating']

print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

In [None]:
# Sanity check: tally the deposition procedures still present in the frame
# (only 'Spin-coating' is expected once the filtering cell above has run).
perovskites_3D.loc[:, 'Perovskite_deposition_procedure'].value_counts()

### Perovskite Manufacturing Properties

In [None]:
# Read the dataset here
# Re-loads the file written by the earlier cells, so this cell starts from the
# on-disk state rather than from the in-memory frame.
#######################################################################################################

dataset_dir = 'PSC_efficiencies_dataset.csv'
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)

#######################################################################################################

# Notes :
# 1. Perovskite additives only mentioned for 12,983 samples
#    1.1 Of these 12,983 samples, nearly half of them are doped with Cl
#    1.2 1180 Undoped and 503 Unknown
#    1.3 326 with 5-AVAI 
#    1.4 204 with SnF2
#    1.5 187 with HI
#    1.6 Data dictionary mentions this is a category with a lot of uncertainty. 
#    1.7 Blank field may not guarantee layer was undoped.
# 2. Perovskite additive concentrations are mentioned in various units; need to convert all of them into one common unit
#    2.1 For most samples the unit is not mentioned, in which case it likely is a stoichiometric coefficient 
#    [https://www.cell.com/joule/pdf/S2542-4351(18)30245-9.pdf, https://onlinelibrary.wiley.com/doi/10.1002/solr.201700224]
# 3. Perovskite_composition_none_stoichiometry_components_in_excess also plays an important role in determining thickness

#######################################################################################################

# Feature : Perovskite_composition_none_stoichiometry_components_in_excess
# Note : Inserting a new column that specifies if excess concentration was used or not 
# Note : Values taken Stoichiometric, Excess, Unknown
# Note : The triple-quoted block that follows is disabled code (an inert string), not executed.

'''
samples = perovskites_3D['Perovskite_composition_none_stoichiometry_components_in_excess'].values

def _map_func(sample):
    if (not pd.isna(sample) and (sample != 'Stoichiometric')):
        return 'Excess'
    else:
        return sample
    
insert_col_loc = list(perovskites_3D.columns).index('Perovskite_composition_none_stoichiometry_components_in_excess')

new_col_data = list(map(_map_func, samples))

perovskites_3D = insert_features(perovskites_3D, insert_col_loc, 'excess_or_not', new_col_data)

print('\n')
print('Inserted If excess solution was used or not.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=True)

'''

#######################################################################################################

# Feature : Perovskite_additives_compounds
# Note : Inserting a new column that specifies if dopant was used or not 
# Note : BLANK FIELD DOES NOT GUARANTEE THAT DOPANT WAS NOT USED

'''
# Drop samples which have 'unknown' doping solution
print('Droping samples that have unknown dopants')
samples_to_drop = [['Unknown']]
perovskites_3D = drop_samples(perovskites_3D, 
                              drop_samples_from=['Perovskite_additives_compounds'], 
                              drop_samples=samples_to_drop, 
                              suppress_output=False)
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

samples = perovskites_3D['Perovskite_additives_compounds'].values
def _map_func(sample):
    if (not pd.isna(sample) and (sample != 'Undoped')):
        return 'Doped'
    else:
        return sample
    
new_col_data = list(map(_map_func, samples))
features_list = list(perovskites_3D.columns)
col_idx = features_list.index('Perovskite_additives_compounds')
perovskites_3D = insert_features(perovskites_3D, col_idx, 'doped_or_not', new_col_data)

print('Inserting new column that indicates whether dopants/additives were added.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('3D_perovskites.csv', header=True, index=True)
'''

#######################################################################################################
# Feature : Perovskite_deposition_solvents
#######################################################################################################

# Notes : Non-liquid processes carry the solvent value None.
# Notes : An unreported solvent is recorded as Unknown.

variable = ['Perovskite_deposition_solvents']

# Step 1 : Remove rows that have no solvent entry

perovskites_3D = drop_empty_samples(dataframe=perovskites_3D,
                                    drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 2 : Remove rows whose solvent is the literal 'Unknown'
# drop_samples expects one list of values per column listed in drop_samples_from.
samples_to_drop = [['Unknown']]

perovskites_3D = drop_samples(dataframe=perovskites_3D,
                              suppress_output=False,
                              drop_samples=samples_to_drop,
                              drop_samples_from=variable)

print('\n')
print('Dropped sample "Unknown". ')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

'''
# Step 3 : Split solvents used in the 2 steps
new_var_names = ['Step_1_solvent', 
                 'Step_2_solvent',
                 'Step_3_solvent']
delimiter = ' >> '
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=variable[0],
                                                 new_col_names=new_var_names,
                                                 delimiter=delimiter,
                                                 suppress_output=False,
                                                 fill_value='na')

print('\n')
print('Split the perovskite deposition column based on delimiter ">>". ')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)
'''


#######################################################################################################
# Feature : Perovskite_deposition_solvents_mixing_ratios
#######################################################################################################

# Notes : Pure solvents and non-solvent processes both carry a mixing ratio of 1
# Notes : Unknown mixing ratios are recorded as 'nan'
# Notes : Solvent mixtures, i.e. A and B, list their ratios separated by semicolons, as in (VA; VB)
# Notes : Volume ratios are the preferred metric. Where unavailable, mass or mol ratios may have been
#         entered instead, but the analysis treats every mixing ratio as volume based.

variable = ['Perovskite_deposition_solvents_mixing_ratios']

# Step 1 : Remove rows that have no mixing-ratio entry

perovskites_3D = drop_empty_samples(dataframe=perovskites_3D,
                                    drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 2 : Split the ratio string into one column per solvent part
# Mixing ratios specified as volume ratios; a missing second part is filled with 0.
new_var_names = ['Solvent_mix_part_1', 'Solvent_mix_part_2']
delimiter = '; '
split_options = {'split_col': variable[0],
                 'new_col_names': new_var_names,
                 'delimiter': delimiter,
                 'suppress_output': False,
                 'fill_value': 0}
perovskites_3D = split_column_based_on_delimiter(perovskites_3D, **split_options)

print('\n')
print('Split the perovskite deposition column based on delimiter ";". ')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

'''
# Step 3 : Drop 'nan' samples

perovskites_3D = drop_samples(perovskites_3D, 
                              drop_samples_from=['Step_1_solvent_mix_part_1'], 
                              drop_samples=[['nan']],
                              suppress_output=False)
'''

#######################################################################################################
# Quenching induced crystallization
#######################################################################################################

# Notes:
# --> Quenching removes the solvents deposited in any of the previous steps.
# --> The usual quenching types are anti solvent or N2 quenching.
# --> The flag identifies processes whose crystallization was accelerated without changing the temp, e.g. by an anti solvent.
# --> Very little data related to quenching media and volume utilized (unknown for 22,000 points)
# --> Very little data on quenching media additives (around 500 points)
# --> Very little data on quenching media concentrations (around 10 points)
# --> Very little data on quenching media mixing ratio (around 500 points)

variables = ['Perovskite_deposition_quenching_induced_crystallisation']

# Cast the boolean flag to an integer column (True -> 1, False -> 0)
quench_flags = perovskites_3D[variables[0]]
perovskites_3D[variables[0]] = quench_flags.astype(int)

print('\n')
print(f'Convert T/F to 0/1 for {variables[0]}')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

#######################################################################################################
# Thermal Annealing and Solvent Annealing step
#######################################################################################################

# Notes : Two types of annealing methodologies : Thermal and Solvent Annealing.
# Notes : Each reaction step will have its own thermal annealing temp and time.
# Notes : Dropping thermal annealing atmosphere, RH, pressure as is unknown for a majority of data points.
# Notes : If no thermal annealing is occurring after the deposition of a layer, state that by stating 
#         the room temperature (assumed to 25°C if not further specified).
# Notes : If the thermal annealing program is not known, state that by 'nan'

variables = ['Perovskite_deposition_thermal_annealing_temperature',
             'Perovskite_deposition_thermal_annealing_time']

# Step 1 : Drop empty samples

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=[variables[0]])

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=[variables[1]])

print('\n')
print(f'Dropped empty samples from {variables[1]} and {variables[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

# Step 2 : Drop/Keep specific samples
# For each annealing column, in order: drop the literal 'Unknown', then drop every
# entry containing ';' (presumably multi-value annealing programs - confirm).
samples_to_drop = [['Unknown']]

for annealing_col in variables:
    perovskites_3D = drop_samples(perovskites_3D,
                                  drop_samples_from=[annealing_col],
                                  drop_samples=samples_to_drop,
                                  suppress_output=False)

    # Cast to str first: once only plain numbers remain in the file, read_csv parses
    # the column as numeric and the .str accessor would raise AttributeError.
    # regex=False makes ';' a literal substring match.
    has_multiple_values = perovskites_3D[annealing_col].astype(str).str.contains(';', regex=False)
    perovskites_3D = perovskites_3D[~has_multiple_values]

print('\n')
print('Dropped samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

'''
# Step 2 : Split TA temp used in the 2 steps
new_var_names = ['Step_1_TA_temp', 
                 'Step_2_TA_temp']
delimiter = ' >> '
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=variables[0],
                                                 new_col_names=new_var_names,
                                                 delimiter=delimiter,
                                                 suppress_output=False,
                                                 fill_value='na')

print('\n')
print('Split the perovskite deposition column based on delimiter ">>". ')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)
'''

'''
# Step 4 : Split TA time used in the 2 steps

new_var_names = ['Step_1_TA_time', 
                 'Step_2_TA_time']
delimiter = ' >> '
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=variables[1],
                                                 new_col_names=new_var_names,
                                                 delimiter=delimiter,
                                                 suppress_output=False,
                                                 fill_value='na')

print('\n')
print('Split the perovskite deposition column based on delimiter ">>". ')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)
'''

### Drop categories which have low samples

In [None]:
# These variables get one hot encoded later, so rare categories are removed here
# to keep the dimensionality of the encoded vector down.

# 47 different categories for substrate
# 400 + categories for ETL
# 500 + categories for HTL
# 50 categories for perovskite deposition procedure, step 1 solvent, quenching media

variables = ['Substrate_stack_sequence',
             'ETL_stack_sequence',
             'HTL_stack_sequence']

# A category is kept only if it occurs at least this many times.
cutoff = 100

for variable in variables:
    counts = perovskites_3D[variable].value_counts()
    print(counts)

    # Counts are taken on the frame as already filtered by the previous variables.
    frequent_categories = counts[counts >= cutoff].index
    keep_mask = perovskites_3D[variable].isin(frequent_categories)
    perovskites_3D = perovskites_3D[keep_mask]

    print('\n')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

In [None]:
# Same rare-category pruning as for the stack sequences, applied to the
# deposition/annealing variables with a lower occurrence threshold.
variables = ['Perovskite_deposition_solvents',
             'Perovskite_deposition_solvents_mixing_ratios',
             'Perovskite_deposition_thermal_annealing_temperature',
             'Perovskite_deposition_thermal_annealing_time']

# A category is kept only if it occurs at least this many times.
cutoff = 10

for variable in variables:
    counts = perovskites_3D[variable].value_counts()
    print(counts)

    # Counts are taken on the frame as already filtered by the previous variables.
    frequent_categories = counts[counts >= cutoff].index
    keep_mask = perovskites_3D[variable].isin(frequent_categories)
    perovskites_3D = perovskites_3D[keep_mask]

    print('\n')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', header=True, index=False)

### Drop duplicates

In [None]:
# Two rows count as duplicates when they agree on every column listed here;
# only the first occurrence of each such group is kept. (Without a subset,
# pandas would compare all columns instead.)
variables_to_consider = [
    # A-ion columns
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients',
    'A_ion_radius', 'A_atom_wt', 'A_EN', 'A_IE', 'A_EA',
    # B-ion columns
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'B_ion_radius', 'B_atom_wt', 'B_EN', 'B_IE', 'B_EA', 'B_pol',
    # C-ion columns
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients',
    'C_ion_radius', 'C_atom_wt', 'C_EN', 'C_IE', 'C_EA',
    # Factor columns
    'Tolerance Factor', 'Octahedral Factor',
    # Perovskite deposition columns
    'Perovskite_deposition_solvents', 'Solvent_mix_part_1', 'Solvent_mix_part_2',
    'Perovskite_deposition_quenching_induced_crystallisation',
    'Perovskite_deposition_thermal_annealing_temperature',
    'Perovskite_deposition_thermal_annealing_time',
    'Perovskite_deposition_solvent_annealing',
    # Stack sequence and atmosphere columns
    'Substrate_stack_sequence', 'ETL_stack_sequence', 'HTL_stack_sequence',
    'Air or N2',
    # JV reverse scan columns
    'JV_reverse_scan_Voc', 'JV_reverse_scan_Jsc', 'JV_reverse_scan_FF',
]

# Select the rows that are NOT a repeat of an earlier row on the subset above;
# this is the boolean-mask form of drop_duplicates(subset=..., keep='first').
perovskites_3D = perovskites_3D[
    ~perovskites_3D.duplicated(subset=variables_to_consider, keep='first')
]

print('\n')
print('Dropped duplicates.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

# Persist the de-duplicated frame, overwriting the previous dataset file.
perovskites_3D.to_csv('PSC_efficiencies_dataset.csv', index=False, header=True)