In [1]:
import numpy as np
import pandas as pd

### Functions to help filter data

In [2]:
def drop_empty_samples(dataframe, **kwargs):
    """Drop samples that are empty from the dataframe"""
    
    for feature in kwargs['drop_empty_samples_from']:
        dataframe[feature].replace('', np.nan, inplace=True)
        dataframe = dataframe.dropna(subset=feature)
    
    return dataframe

# Note : Must provide samples to drop in list format
def drop_samples(dataframe, **kwargs):
    """Drop samples from the dataframe"""
    
    for i,feature in enumerate(kwargs['drop_samples_from']):
        
        for sample in kwargs['drop_samples'][i]:

            drop_idx = dataframe[dataframe[feature] == sample].index
            num_samples_dropped = len(drop_idx)
            if kwargs['suppress_output'] == False:
                print('\n')
                print(f' --> Dropping sample {sample}')
                print(f' --> Number of samples dropped {num_samples_dropped}')
            dataframe = dataframe.drop(axis=0, index=drop_idx)
    
    return dataframe

def split_column_based_on_delimiter(dataframe, **kwargs):
    """Split a column based on a delimiter and insert the new columns into the dataframe"""
    
    # Split the column
    split_df = dataframe[kwargs['split_col']].str.split(kwargs['delimiter'], expand=True).fillna(value=kwargs['fill_value'])
    col_idx = list(dataframe).index(kwargs['split_col'])
    dataframe = pd.concat([dataframe.iloc[:,:(col_idx) + 1], \
                           split_df, \
                           dataframe.iloc[:,(col_idx + 1):]], axis=1)
    
    # Rename the columns
    for i, new_col_name in enumerate(kwargs['new_col_names']):
        dataframe = dataframe.rename(columns={i:new_col_name})
    
    return dataframe

def insert_properties(dataframe, **kwargs):
    """Insert properties into the dataframe"""
        
    # Get the array of ion coefficients
    ions_coeffs_array = dataframe[kwargs['ion_coeff_cols']].values.astype(np.float32)
    
    # Get the array of ions
    ions_array = np.char.strip(dataframe[kwargs['ion_cols']].values.astype('U4'))
    
    # Iterate through each property dictionary
    for i, prop_dict in enumerate(kwargs['prop_dict_list']):
    
        # Create a property arraya to store the ion properties
        ions_prop_array = np.zeros(ions_array.shape, dtype=np.float32)
    
        # Replace ions in the list with prop value 
        for ion in list(prop_dict.keys())[1:]:
            np.place(ions_prop_array, ions_array==ion, prop_dict[ion])
    
        # Multiply prop value with coefficent and reduce along axis=1
        col_data = np.sum(np.multiply(ions_prop_array, ions_coeffs_array), axis=1)

        dataframe.insert(i+1+kwargs['insert_col_loc'], prop_dict['prop'], col_data)
    
    return dataframe

### Reading the dataset

In [3]:
dataset_dir = 'jacobsson2021dataset.csv'
dataframe = pd.read_csv(dataset_dir, header=0, low_memory=False)
print('Initial shape')
print(f' --> Perovskites 3D shape {dataframe.shape}')

dataset_save_name = 'PSC_efficiencies_dataset.csv'

Initial shape
 --> Perovskites 3D shape (42443, 410)


### Filter for 3D perovskites 

In [4]:
# Module is defined as a Cell composed of connected individual sub cells ?
module = False
perovskite_single_crystal = False
# Perovskite 0D is perovskite quantum dots
perovskite_0D = False
# Perovskite 2D is layered perovskite with large A site cation 
perovskite_2D = False
# Perovskite 2D3D mixture is reduced dimensional perovskite but not pure 2d PEROVSKITE
perovskite_2D3D = False
# Perovskite 3D is standard 3D perovskite with ABC3 structure
perovskite_3D = True
# Perovskite 3D with 2D capping layer is where bulk of perovskite is 3D but consists of a small 2D capping layer on top 
perovskite_3D_with_2D_capping_layer = False
# Perovskite with ABC3 structure 
perovskite_with_ABC3_structure = True
# Some cells can be perovskite inspired but not actually perovskites
perovskite_inspired = False
# Dimensionality of perovskite should be 3
perovskite_dim = '3'

# Only select those entries which satisify constraints provided
perovskites_3D = dataframe[(dataframe['Module'] == module) &
                           (dataframe['Perovskite_single_crystal'] == perovskite_single_crystal) &
                           (dataframe['Perovskite_dimension_0D'] == perovskite_0D) &
                           (dataframe['Perovskite_dimension_2D'] == perovskite_2D) &
                           (dataframe['Perovskite_dimension_2D3D_mixture'] == perovskite_2D3D) &
                           (dataframe['Perovskite_dimension_3D'] == perovskite_3D) &  
                           (dataframe['Perovskite_dimension_3D_with_2D_capping_layer'] == perovskite_3D_with_2D_capping_layer) &
                           (dataframe['Perovskite_composition_perovskite_ABC3_structure'] == perovskite_with_ABC3_structure) &
                           (dataframe['Perovskite_composition_perovskite_inspired_structure'] == perovskite_inspired) &
                           (dataframe['Perovskite_dimension_list_of_layers'] == perovskite_dim)]

#
# Selecting the perovskite and bandgap variables for the dataset
#

perovskite_variables = ['Perovskite_single_crystal',
                        'Perovskite_dimension_0D',
                        'Perovskite_dimension_2D',
                        'Perovskite_dimension_2D3D_mixture',
                        'Perovskite_dimension_3D',
                        'Perovskite_dimension_3D_with_2D_capping_layer',
                        'Perovskite_dimension_list_of_layers',
                        'Perovskite_composition_perovskite_ABC3_structure',
                        'Perovskite_composition_perovskite_inspired_structure',
                        'Perovskite_composition_long_form',
                        'Perovskite_composition_a_ions',
                        'Perovskite_composition_a_ions_coefficients',
                        'Perovskite_composition_b_ions',
                        'Perovskite_composition_b_ions_coefficients',
                        'Perovskite_composition_c_ions',
                        'Perovskite_composition_c_ions_coefficients',
                        'Perovskite_band_gap',
                        'Perovskite_band_gap_graded',
                        'Perovskite_band_gap_estimation_basis']

#
# Selecting the deposition variables for the dataset
#

deposition_variables = ['Perovskite_deposition_number_of_deposition_steps',
                        'Perovskite_deposition_procedure',
                        'Perovskite_deposition_quenching_induced_crystallisation']

#
# Selecting the substrate variables for the dataset
#

substrate_variables = ['Substrate_stack_sequence']

#
# Selecting the ETM variables for the dataset
#

etm_variables = ['ETL_stack_sequence']

#
# Selecting the ETM variables for the dataset
#

htm_variables = ['HTL_stack_sequence']

#
# Selecting the JV properties
#

JV_variables = ['JV_average_over_n_number_of_cells',
                'JV_test_atmosphere',
                'JV_light_intensity',
                'JV_light_spectra',
                'JV_light_masked_cell',
                'JV_test_temperature',
                'JV_scan_speed',
                'JV_reverse_scan_Voc',
                'JV_reverse_scan_Jsc',
                'JV_reverse_scan_FF',
                'JV_reverse_scan_PCE']

keep_cols = []
keep_cols.extend(perovskite_variables)
keep_cols.extend(deposition_variables)
keep_cols.extend(substrate_variables)
keep_cols.extend(etm_variables)
keep_cols.extend(htm_variables)
keep_cols.extend(JV_variables)

perovskites_3D = perovskites_3D.loc[:, keep_cols]
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')

perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

 --> Perovskites 3D shape (39780, 36)


### Filter for Perovskite properties

In [5]:
# Filter samples which have band gap data
filter_for_band_gap = False
filter_for_pl_max = False
filter_for_JV_props = True

if filter_for_band_gap:
    
    # Drop empty samples
    perovskites_3D = drop_empty_samples(perovskites_3D, 
                                        drop_empty_samples_from=['Perovskite_band_gap'])
    
    # Drop samples which do not have a estimation basis provided
    perovskites_3D = drop_empty_samples(perovskites_3D, 
                                        drop_empty_samples_from=['Perovskite_band_gap_estimation_basis'])

    # Drop sample with band_gap_graded = True
    # Removing samples whose band gap varies as a vertical position in the perovskite layer.
    samples_to_drop = [[True]]
    perovskites_3D = drop_samples(perovskites_3D,\
                                  drop_samples_from=['Perovskite_band_gap_graded'],
                                  drop_samples=samples_to_drop,
                                  suppress_output=True)
    
    #perovskites_3D = perovskites_3D[perovskites_3D['Perovskite_band_gap_estimation_basis'] != 'Composition']
    
    print('Choosing only those samples for which band gap information was recorded.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
# Filter for samples which have steady state pl max data.
if filter_for_pl_max:

    perovskites_3D = drop_empty_samples(perovskites_3D, ['Perovskite_pl_max'])
    
    print('Choosing only those samples for which steady state pl max information was recorded.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
if filter_for_JV_props:
    
    #######################################################################################################
    
    # For analysis we consider all JV measurements done at
    # 1. Temp : 25 C
    # 2. Light intensity : 100 mW/cm2
    # 3. Light spectra : AM1.5
    # 4. Light not passed through a mask
    
    # Feature : JV_average_over_n_number_of_cells 
    perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=['JV_average_over_n_number_of_cells'])
    
    perovskites_3D = perovskites_3D[perovskites_3D['JV_average_over_n_number_of_cells'] == 1]
    
    print('\n')
    print(f' --> Dropped samples with number of cells not equal to 1')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################
    
    # Feature : JV_test_atmosphere 
    # Note : Atmosphere in which JV measurement is performed
    # Note : Remove all samples with 'Unknown' - 21929 ; 'Vacuum' - 33 ; 'Near-space' - 4 ; 'Water' - 1
    # Ambient - Relative humidity unknown
    # Dry air - Low realtive humidity
    
    perovskites_3D = perovskites_3D[(perovskites_3D['JV_test_atmosphere'] == 'Air') | 
                                    (perovskites_3D['JV_test_atmosphere'] == 'N2') | 
                                    (perovskites_3D['JV_test_atmosphere'] == 'Ambient') |
                                    (perovskites_3D['JV_test_atmosphere'] == 'Dry air')]
    
    
    # Insert another column where the categories are just Air and  N2
    insert_col_loc = list(perovskites_3D.columns).index('JV_test_atmosphere')
    perovskites_3D.insert(insert_col_loc, 'Air or N2', perovskites_3D['JV_test_atmosphere'].apply(lambda x: 'Air' if ((x == 'Air') or
                                                                                                           (x == 'Ambient') or 
                                                                                                           (x == 'Dry air')) else 'N2'))
    
    print('\n')
    print(' --> Dropped samples without JV test atmosphere conditions Unknown, Vacuum, Near space or water.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################
    
    # Feature : JV_test_temperature
    # Note : Assume standard room temperature of 25C if temperature is not controlled or not known
    
    test_temps_exclude = list(perovskites_3D['JV_test_temperature'].value_counts().index)
    test_temps_exclude.remove(25)
    
    perovskites_3D = perovskites_3D[~perovskites_3D['JV_test_temperature'].isin(test_temps_exclude)]
    
    print('\n')
    print(' --> Dropped samples where temperature is not 25 C')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################
    
    # Feature : JV_light_intensity
    # Note : Keep only 100 mW/cm2
    
    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_intensity'] == 100]
    
    print('\n')
    print(' --> Dropped samples where light spectra is not 100 mW/cm2.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################

    # Feature : JV_light_spectra
    # Note : Keep only AM 1.5 
    
    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_spectra'] == 'AM 1.5']
    
    print('\n')
    print(' --> Dropped samples where light spectra is not AM 1.5.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################
    
    # Feature : JV_light_masked_cell
    # Note :  TRUE if the cell is illuminated trough a mask with an opening that is smaller than the total cell area
    
    perovskites_3D = perovskites_3D[perovskites_3D['JV_light_masked_cell'] == False]
    
    print('\n')
    print(' --> Dropped samples where light passed through a mask.')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)
    
    #######################################################################################################
    
    # Keep samples that have reverse FF calculated
    # Whe scan direction not stated considered to be in the reverse direction

    perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=['JV_reverse_scan_PCE'])
    
    print('\n')
    print(' --> Dropped samples without reverse scan PCE data')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)



 --> Dropped samples with number of cells not equal to 1
 --> Perovskites 3D shape (27047, 36)


 --> Dropped samples without JV test atmosphere conditions Unknown, Vacuum, Near space or water.
 --> Perovskites 3D shape (12047, 37)


 --> Dropped samples where temperature is not 25 C
 --> Perovskites 3D shape (12009, 37)


 --> Dropped samples where light spectra is not 100 mW/cm2.
 --> Perovskites 3D shape (11855, 37)


 --> Dropped samples where light spectra is not AM 1.5.
 --> Perovskites 3D shape (11002, 37)


 --> Dropped samples where light passed through a mask.
 --> Perovskites 3D shape (10909, 37)


 --> Dropped samples without reverse scan PCE data
 --> Perovskites 3D shape (10466, 37)


### Perovskite Element Properties

In [6]:
# Property dictionaries

# 1. Atomic radius

A_ion_radius_dict = {'prop':'A_ion_radius',
                     'MA':2.17, 
                     'FA':2.53,
                     'Cs':1.74,
                     'Rb':1.61,
                     'K':1.51}

B_ion_radius_dict = {'prop':'B_ion_radius',
                     'Ca':1.00,
                     'Sr':1.26,
                     'Ba':1.42,
                     'Ge':0.87, 
                     'Sn':1.15,
                     'Pb':1.19}

C_ion_radius_dict = {'prop':'C_ion_radius',
                     'Cl':1.81,
                     'Br':1.96,
                     'I':2.20}

# 2. Atomic weight

A_atomic_wt_dict = {'prop':'A_atom_wt',
                    'MA':32.07, 
                    'FA':45.06,
                    'Cs':132.91, 
                    'Rb':85.47, 
                    'K':39.10} 

B_atomic_wt_dict = {'prop':'B_atom_wt',
                    'Ca': 40.078,
                    'Sr': 87.62,
                    'Ba': 137.33,
                    'Ge': 72.63,
                    'Sn': 118.71,
                    'Pb': 207.2}

C_atomic_wt_dict = {'prop':'C_atom_wt',
                    'Cl': 35.45,
                    'Br': 79.90,
                    'I': 126.90}

# 3. EA

A_EA_dict = {'prop':'A_EA',
             'FA':8.60,
             'MA':8.68,
             'Cs':45.49,
             'Rb':46.89,
             'K':48.36}

B_EA_dict = {'prop':'B_EA',
             'Ca':-156,
             'Sr':-168,
             'Ba':-52,
             'Ge':120,
             'Sn':121,
             'Pb':110}

C_EA_dict = {'prop':'C_EA',
             'Cl':348.8, 
             'Br':324.6,
             'I':295.3}


# 4. IE

A_IE_dict = {'prop':'A_IE',
             'FA':849.10,
             'MA':926.30,
             'Cs':375.70,
             'Rb':403,
             'K':418.80}

B_IE_dict = {'prop':'B_IE',
             'Ca':589.9,
             'Sr':549.5,
             'Ba':502.9,
             'Ge':762,
             'Sn':708.6,
             'Pb':715.6}

C_IE_dict = {'prop':'C_IE',
             'Cl':1251.20, 
             'Br':1139.90,
             'I':1008.40}

# 5. EN

A_EN_dict = {'prop':'A_EN',
             'MA':2.55,
             'FA':2.57,
             'Cs':0.79,
             'Rb':0.82,
             'K':0.82}

B_EN_dict = {'prop':'B_EN',
              'Ca':1.00,
              'Sr':0.95,
              'Ba':0.89,
              'Ge':2.01,
              'Sn':1.96,
              'Pb':2.33} 

C_EN_dict = {'prop':'C_EN',
             'Cl':3.16, 
             'Br':2.96,
             'I':2.66}

### A site ion properties

In [7]:
# A site ions separated by ;
# In case of layered structures ions are separted by |
# The dataset only includes ions that go into the perovskite structure.
# Ions in secondary phases, amorphous grain boundaries, 
# or which disappear during syntheis instead come under dopants and additives.

# Adding the property features to the dataset involves the following steps:
# 1. Drop empty samples
# 2. Retain only those ions for which AE1 was trained
# 3. Drop samples which do not have entries in Perovskite_composition_a_ions and Perovskite_composition_a_ions_coefficients
# 4. Split the Perovskite_composition_a_ions and Perovskite_composition_a_ions_coefficients columns based on delimiter
# 5. Sum up the ion coefficient columns and drop samples where the sum is not equal to 1
# 6. Insert the property features

features = ['Perovskite_composition_a_ions', 
            'Perovskite_composition_a_ions_coefficients']

# Step 1
for feature in features:
    perovskites_3D[feature].replace('', np.nan, inplace=True)
    perovskites_3D = perovskites_3D.dropna(subset=feature)

# Step 2
perovskites_3D = perovskites_3D[(perovskites_3D[features[0]]=='MA') |
                                (perovskites_3D[features[0]]=='FA') |
                                (perovskites_3D[features[0]]=='Cs') |
                                (perovskites_3D[features[0]]=='Rb') |
                                (perovskites_3D[features[0]]=='K') |
                                (perovskites_3D[features[0]]=='FA; MA') |
                                (perovskites_3D[features[0]]=='Cs; FA') |
                                (perovskites_3D[features[0]]=='Cs; MA') |
                                (perovskites_3D[features[0]]=='Cs; Rb') |
                                (perovskites_3D[features[0]]=='Cs; K') |
                                (perovskites_3D[features[0]]=='Cs; FA; MA') |
                                (perovskites_3D[features[0]]=='Cs; FA; Rb') |
                                (perovskites_3D[features[0]]=='Cs; FA; MA; Rb')]
    
# Step 3
perovskites_3D = perovskites_3D.dropna(subset=features)

print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 4
new_col_names = ['a_1', 
                 'a_2',
                 'a_3',
                 'a_4']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

new_col_names = ['a_1_coeff', 
                 'a_2_coeff',
                 'a_3_coeff',
                 'a_4_coeff']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 5 

perovskites_3D = perovskites_3D[perovskites_3D['a_1_coeff'].astype(float)
                                + perovskites_3D['a_2_coeff'].astype(float)
                                + perovskites_3D['a_3_coeff'].astype(float) == float(1)]

# Step 6 

insert_col_loc = list(perovskites_3D.columns).index('a_4_coeff')
ion_cols = ['a_1', 'a_2', 'a_3', 'a_4']
ion_coeff_cols = ['a_1_coeff', 'a_2_coeff', 'a_3_coeff', 'a_4_coeff']

# A site property dictionary
A_site_prop_dict_list = [A_ion_radius_dict, 
                         A_atomic_wt_dict, 
                         A_EA_dict,
                         A_IE_dict,
                         A_EN_dict]

perovskites_3D = insert_properties(perovskites_3D, 
                                   ion_coeff_cols=ion_coeff_cols,
                                   ion_cols=ion_cols,
                                   prop_dict_list=A_site_prop_dict_list,
                                   insert_col_loc=insert_col_loc)

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

 --> Perovskites 3D shape (10386, 37)


Splitting the perovskite ions column based on delimiter
 --> Perovskites 3D shape (10386, 41)


Splitting the perovskite ion coeffs column based on delimiter
 --> Perovskites 3D shape (10386, 45)


Inserted the property features.
 --> Perovskites 3D shape (10202, 50)


### B site ion properties

In [8]:
# Notes : Additives and  dopants to C site. Example is Cl in MAPbI3.
# Notes : Cl does not go into the perovskite structure.
# Notes : Hence should not be considered as C site cation but instead view it as a dopant/additive.

features = ['Perovskite_composition_b_ions',
            'Perovskite_composition_b_ions_coefficients']

# Adding the property features to the dataset involves the following steps:
# 1. Drop empty samples
# 2. Retain only those ions for which AE1 was trained
# 3. Drop samples which do not have entries in Perovskite_composition_b_ions and Perovskite_composition_b_ions_coefficients
# 4. Split the Perovskite_composition_b_ions and Perovskite_composition_b_ions_coefficients columns based on delimiter
# 5. Sum up the ion coefficient columns and drop samples where the sum is not equal to 1
# 6. Insert the property features

# Step 1
for feature in features:
    perovskites_3D[feature].replace('', np.nan, inplace=True)
    perovskites_3D = perovskites_3D.dropna(subset=feature)
    
# Step 2
perovskites_3D = perovskites_3D[(perovskites_3D[features[0]]=='Pb') |
                                (perovskites_3D[features[0]]=='Sn') |
                                (perovskites_3D[features[0]]=='Ge') |
                                (perovskites_3D[features[0]]=='Pb; Sn') |
                                (perovskites_3D[features[0]]=='Ba; Pb') |
                                (perovskites_3D[features[0]]=='Pb; Sr') |
                                (perovskites_3D[features[0]]=='Ge; Sn')]   
    
# Step 3
perovskites_3D = perovskites_3D.dropna(subset=features)

print('\n')
print(' Dropping empty samples and specified samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 4
new_col_names = ['b_1', 'b_2']
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

new_col_names = ['b_1_coeff', 'b_2_coeff']
perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 5
perovskites_3D = perovskites_3D[perovskites_3D['b_1_coeff'].astype(float)
                                + perovskites_3D['b_2_coeff'].astype(float) == float(1)]

# Step 6
insert_col_loc = list(perovskites_3D.columns).index('b_2_coeff')
ion_cols = ['b_1', 'b_2']
ion_coeff_cols = ['b_1_coeff', 'b_2_coeff']

B_site_prop_dict_list = [B_ion_radius_dict,
                         B_atomic_wt_dict, 
                         B_EA_dict,
                         B_IE_dict,
                         B_EN_dict]

perovskites_3D = insert_properties(perovskites_3D, 
                                  ion_coeff_cols=ion_coeff_cols,
                                  ion_cols=ion_cols,
                                  prop_dict_list=B_site_prop_dict_list,
                                  insert_col_loc=insert_col_loc)

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)



 Dropping empty samples and specified samples.
 --> Perovskites 3D shape (10149, 50)


Splitting the perovskite ions column based on delimiter
 --> Perovskites 3D shape (10149, 52)


Splitting the perovskite ion coeffs column based on delimiter
 --> Perovskites 3D shape (10149, 54)


Inserted the property features.
 --> Perovskites 3D shape (10124, 59)


### C site ion properties

In [9]:
features = ['Perovskite_composition_c_ions', 
            'Perovskite_composition_c_ions_coefficients']

# Adding the property features to the dataset involves the following steps:
# 1. Drop empty samples
# 2. Retain only those ions for which AE1 was trained
# 3. Drop samples which do not have entries in Perovskite_composition_c_ions and Perovskite_composition_c_ions_coefficients
# 4. Split the Perovskite_composition_c_ions and Perovskite_composition_c_ions_coefficients columns based on delimiter
# 5. Sum up the ion coefficient columns and drop samples where the sum is not equal to 1
# 6. Insert the property features

# Cl, Br, I
c_samples_to_drop = [['O', '(SCN); I', 'Br; F; I', '(BF4); I', 'S', 'I; SCN', 'I | I'], 
                     ['x', 'x; x']]

# Step 1
for feature in features:
    perovskites_3D[feature].replace('', np.nan, inplace=True)
    perovskites_3D = perovskites_3D.dropna(subset=feature)
    
# Step 2
perovskites_3D = drop_samples(perovskites_3D, 
                              drop_samples_from=features, 
                              drop_samples=c_samples_to_drop,
                              suppress_output=False)    
    
# Step 3
perovskites_3D = perovskites_3D.dropna(subset=features)

print('\n')
print(' Dropping empty samples and specified samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 4
new_col_names = ['c_1', 
                 'c_2']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[0],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value='na')

print('\n')
print('Splitting the perovskite ions column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

new_col_names = ['c_1_coeff', 
                 'c_2_coeff']

perovskites_3D = split_column_based_on_delimiter(perovskites_3D,
                                                 split_col=features[1],
                                                 new_col_names=new_col_names,
                                                 delimiter=';',
                                                 fill_value=0)

print('\n')
print('Splitting the perovskite ion coeffs column based on delimiter')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 5
perovskites_3D = perovskites_3D[perovskites_3D['c_1_coeff'].astype(float)
                                + perovskites_3D['c_2_coeff'].astype(float) == float(3)]

# Step 6
insert_col_loc = list(perovskites_3D.columns).index('c_2_coeff')
ion_cols = ['c_1', 'c_2']
ion_coeff_cols = ['c_1_coeff', 'c_2_coeff']

C_site_prop_dict_list = [C_ion_radius_dict,
                         C_atomic_wt_dict, 
                         C_EA_dict,
                         C_IE_dict,
                         C_EN_dict]

perovskites_3D = insert_properties(perovskites_3D, 
                                   ion_coeff_cols=ion_coeff_cols,
                                   ion_cols=ion_cols,
                                   prop_dict_list=C_site_prop_dict_list,
                                   insert_col_loc=insert_col_loc)

# Divide all calculated properties by 3
selected_columns = ['C_ion_radius', 
                    'C_atom_wt',
                    'C_EN',
                    'C_IE',
                    'C_EA']

perovskites_3D[selected_columns] = perovskites_3D[selected_columns].apply(lambda x: x/3)

print('\n')
print('Inserted the property features.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)



 --> Dropping sample O
 --> Number of samples dropped 0


 --> Dropping sample (SCN); I
 --> Number of samples dropped 0


 --> Dropping sample Br; F; I
 --> Number of samples dropped 0


 --> Dropping sample (BF4); I
 --> Number of samples dropped 1


 --> Dropping sample S
 --> Number of samples dropped 0


 --> Dropping sample I; SCN
 --> Number of samples dropped 0


 --> Dropping sample I | I
 --> Number of samples dropped 0


 --> Dropping sample x
 --> Number of samples dropped 0


 --> Dropping sample x; x
 --> Number of samples dropped 0


 Dropping empty samples and specified samples.
 --> Perovskites 3D shape (10123, 59)


Splitting the perovskite ions column based on delimiter
 --> Perovskites 3D shape (10123, 61)


Splitting the perovskite ion coeffs column based on delimiter
 --> Perovskites 3D shape (10123, 63)


Inserted the property features.
 --> Perovskites 3D shape (9979, 68)


### Insert Cell Properties

In [10]:
dataset_dir = dataset_save_name
perovskites_3D = pd.read_csv(dataset_dir, header=0, low_memory=False)

insert_col_loc = list(perovskites_3D.columns).index('C_EN')

tf = (perovskites_3D.A_ion_radius + perovskites_3D.C_ion_radius)/((2**0.5)*(perovskites_3D.B_ion_radius + perovskites_3D.C_ion_radius))
of = perovskites_3D.B_ion_radius/perovskites_3D.C_ion_radius


perovskites_3D.insert(insert_col_loc + 1, 'Tolerance Factor', tf)
perovskites_3D.insert(insert_col_loc + 2, 'Octahedral Factor', of)

print('\n')
print('Inserted cell properties.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=True)



Inserted cell properties.
 --> Perovskites 3D shape (9979, 70)


### Select samples with the required synthesis conditions

In [11]:
#######################################################################################################
# Feature : Perovskite_deposition_number_of_deposition_steps
#######################################################################################################

# Note : Choosing 1 or two step depositions as they make up the majority of deposition techniques
# Note : Depositing the perovskite first and then crystallization is considered a 2 step process.
# Note : Every step considered to have its own thermal history.
# Note : Spin coating with antisolvent is considered as a singel step.

variable = ['Perovskite_deposition_number_of_deposition_steps']

# Step 1 : Drop empty samples

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 2 : Keep/Drop specific samples

#perovskites_3D = perovskites_3D[(perovskites_3D[variable[0]] == 1) |
#                                (perovskites_3D[variable[0]] == 2)]

perovskites_3D = perovskites_3D[(perovskites_3D[variable[0]] == 1)]

print('\n')
print('Retained 1 step deposition techniques.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

#######################################################################################################
# Feature : Perovskite_deposition_procedure
#######################################################################################################

# Note : Thermal annealing is generally not considered as an individual reaction step.
# Note : Antisolvent treatment is considered a different step.

variable = ['Perovskite_deposition_procedure']

# Step 1 : Drop empty samples

perovskites_3D = drop_empty_samples(perovskites_3D, drop_empty_samples_from=variable)

print('\n')
print(f'Dropped empty samples from {variable[0]}.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

# Step 2 : Retain only spin coating samples

perovskites_3D = perovskites_3D[perovskites_3D['Perovskite_deposition_procedure'] == 'Spin-coating']

print('\n')
print('Retained only spin coating samples.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

#######################################################################################################
# Feature : Perovskite_deposition_procedure
#######################################################################################################

variables = ['Perovskite_deposition_quenching_induced_crystallisation']

# Convert TRUE/ FALSE to 0/1
perovskites_3D[variables[0]] = perovskites_3D[variables[0]].astype(int)

perovskites_3D = perovskites_3D[perovskites_3D[variables[0]] == 1]

print('\n')
print(f'Retained samples that were crystallized using quenching.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)



Dropped empty samples from Perovskite_deposition_number_of_deposition_steps.
 --> Perovskites 3D shape (9979, 70)


Retained 1 step deposition techniques.
 --> Perovskites 3D shape (7289, 70)


Dropped empty samples from Perovskite_deposition_procedure.
 --> Perovskites 3D shape (7289, 70)


Retained only spin coating samples.
 --> Perovskites 3D shape (6769, 70)


Retained samples that were crystallized using quenching.
 --> Perovskites 3D shape (4399, 70)


### Drop duplicates

In [12]:
# By default removes duplicate rows based on all columns

variables_to_consider = ['A_ion_radius',
                        'A_atom_wt',
                        'A_EN',
                        'A_IE',
                        'A_EA',
                        'B_ion_radius',
                        'B_atom_wt',
                        'B_EN',
                        'B_IE',
                        'B_EA',
                        'C_ion_radius',
                        'C_atom_wt',
                        'C_EN',
                        'C_IE',
                        'C_EA',
                        'Substrate_stack_sequence',
                        'ETL_stack_sequence',
                        'HTL_stack_sequence',
                        'JV_reverse_scan_PCE'
                        ]

perovskites_3D = perovskites_3D.drop_duplicates(subset=variables_to_consider)

print('\n')
print('Dropped duplicates.')
print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
perovskites_3D.to_csv(dataset_save_name, header=True, index=False)



Dropped duplicates.
 --> Perovskites 3D shape (4042, 70)


### Drop categories which have low samples

In [13]:
# Since most variables will be one hot encoded use this to reduce the dimensionality of the vector

# 47 different categories for substrate
# 400 + categories for ETL
# 500 + catgeories for HTL
# 50 categories for perovskite deposition procedure, step 1 solvent, quenching media

variables = ['ETL_stack_sequence',
             'HTL_stack_sequence']

cutoff = 100

for variable in variables:
    counts = perovskites_3D[variable].value_counts()
    print(counts)

    perovskites_3D = perovskites_3D[perovskites_3D[variable].isin(counts[counts >= cutoff].index)]

    print('\n')
    print(f' --> Perovskites 3D shape {perovskites_3D.shape}')
    perovskites_3D.to_csv(dataset_save_name, header=True, index=False)

ETL_stack_sequence
TiO2-c | TiO2-mp          1077
TiO2-c                     454
PCBM-60                    323
PCBM-60 | BCP              261
SnO2-c                     230
                          ... 
PCBM-60 | TIPD; ZnO-np       1
TIPD; ZnO-np                 1
PCBM-60; PCDTBT              1
PCBM-60 | BCP | Ga2O3        1
C60 | LiF | BCP              1
Name: count, Length: 282, dtype: int64


 --> Perovskites 3D shape (2790, 70)
HTL_stack_sequence
Spiro-MeOTAD       1397
PEDOT:PSS           325
NiO-c               176
PTAA                120
none                 58
                   ... 
M4; PCBM-60           1
Ph-TPA-2A             1
Ph-TPA-4A             1
Ph-TPA-6A             1
PEDOT:PSS | PEI       1
Name: count, Length: 333, dtype: int64


 --> Perovskites 3D shape (2018, 70)
