# ***Unveiling Band Gaps in Perovskite Oxides for Next-Gen Electronics***

Perovskite oxides are a class of materials with tunable elctronic properties, making them valuable for Solar cells, LEDs, semiconductors and Optoelectric devices. The solution aims to analyze, preprocess and train Machine Learning models to achieve accurate band gap predictions

The Machine Learning workflow:
  1. Importing the libraries
  2. Importing the dataset
  3. Feature Engineering
  4. Regression model
  5. Classification model



#Importing the Libraries

In [1]:
import numpy as np #Numpy
import pandas as pd #pandas
import matplotlib.pyplot as plt #matplotlib
import seaborn as sns #seaborn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

#The Dataset

The dataset used in making the Machine Learning model is the dataset_excavate.xlsx

In [2]:
df = pd.read_excel('dataset_excavate.xlsx') # Loading the dataset
df.head()

Unnamed: 0,functional group,A,A_OS,A',A'_OS,A_HOMO-,A_HOMO+,A_IE-,A_IE+,A_LUMO-,...,B_X+,B_Z_radii-,B_Z_radii+,B_e_affin-,B_e_affin+,PBE band gap,μ,μĀ,μ𝐵 ̅,t
0,AgBaAuCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,2.115,0.2225,2.4375,145.4,77.4,0.0,0.54286,0.13571,0.13571,0.92317
1,AgBaAuHgO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,2.27,0.125,2.535,135.4,87.4,0.0,0.56786,0.13571,0.16071,0.90845
2,AgBaMoCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,1.925,0.2525,2.4675,70.0,2.0,0.0,0.55714,0.13571,0.12143,0.9147
3,AgBiO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,2.02,0.0,1.997,0.0,91.3,0.0,0.54286,0.0,0.0,0.94608
4,AgBrO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,2.96,0.0,1.2,0.0,324.7,0.1524,0.44286,0.0,0.0,1.01165


#Feature Engineering

Adding the binary classifier column (insulator and non-insulator)

In [3]:
df['Is_Insulator'] = df['PBE band gap'].apply(lambda x: "Yes" if x >= 0.5 else "No")
df.head()

Unnamed: 0,functional group,A,A_OS,A',A'_OS,A_HOMO-,A_HOMO+,A_IE-,A_IE+,A_LUMO-,...,B_Z_radii-,B_Z_radii+,B_e_affin-,B_e_affin+,PBE band gap,μ,μĀ,μ𝐵 ̅,t,Is_Insulator
0,AgBaAuCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,0.2225,2.4375,145.4,77.4,0.0,0.54286,0.13571,0.13571,0.92317,No
1,AgBaAuHgO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,0.125,2.535,135.4,87.4,0.0,0.56786,0.13571,0.16071,0.90845,No
2,AgBaMoCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,0.2525,2.4675,70.0,2.0,0.0,0.55714,0.13571,0.12143,0.9147,No
3,AgBiO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,0.0,1.997,0.0,91.3,0.0,0.54286,0.0,0.0,0.94608,No
4,AgBrO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,0.0,1.2,0.0,324.7,0.1524,0.44286,0.0,0.0,1.01165,No


Features to Drop

In [4]:
columns_to_drop = ["functional group"] # The Functional Group column is dropped
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

Unnamed: 0,A,A_OS,A',A'_OS,A_HOMO-,A_HOMO+,A_IE-,A_IE+,A_LUMO-,A_LUMO+,...,B_Z_radii-,B_Z_radii+,B_e_affin-,B_e_affin+,PBE band gap,μ,μĀ,μ𝐵 ̅,t,Is_Insulator
0,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,-1.3775,...,0.2225,2.4375,145.4,77.4,0.0,0.54286,0.13571,0.13571,0.92317,No
1,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,-1.3775,...,0.125,2.535,135.4,87.4,0.0,0.56786,0.13571,0.16071,0.90845,No
2,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,-1.3775,...,0.2525,2.4675,70.0,2.0,0.0,0.55714,0.13571,0.12143,0.9147,No
3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,-0.729,...,0.0,1.997,0.0,91.3,0.0,0.54286,0.0,0.0,0.94608,No
4,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,-0.729,...,0.0,1.2,0.0,324.7,0.1524,0.44286,0.0,0.0,1.01165,No


In [5]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist() # Storing all categorical features
print(categorical_cols)

['A', "A'", 'Bi', "B'", 'Is_Insulator']


Encoding Categorical data

In [6]:
le = LabelEncoder()
df_encoded = df.copy()
df_encoded[categorical_cols] = df_encoded[categorical_cols].apply(LabelEncoder().fit_transform)

#The Classification Model

Features and Target Variable

In [7]:
X_c=df_encoded.iloc[: ,:-1].values
y_c=df_encoded.iloc[:,-1].values

Splitting the dataset into training set and test set

In [8]:
X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(X_c, y_c, test_size = 0.25, random_state = 0)


#XGBoost Classifier

In [9]:
classifier = XGBClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42, max_cat_threshold=20) #craeting the XGBoost classifier model
classifier.fit(X_c_train, y_c_train) #Training the model on the training data


Parameters: { "criterion" } are not used.



Predicting the results

In [10]:
y_c_pred = classifier.predict(X_c_test)

In [12]:
y_c_pred=pd.DataFrame(y_c_pred)
y_c_pred = y_c_pred.applymap(lambda x: "Yes" if x ==1 else "No")
np.array(y_c_pred)

  y_c_pred = y_c_pred.applymap(lambda x: "Yes" if x ==1 else "No")


array([['No'],
       ['No'],
       ['No'],
       ...,
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

Accuracy metrics

In [11]:
cm = confusion_matrix(y_c_test, y_c_pred) # Evaluating Confusion Matrix
print(cm)
accuracy_score(y_c_test, y_c_pred) # Evaluating Accuracy

[[872   0]
 [  1 415]]


0.9992236024844721

#The Regression Model

Features and Target variable

In [13]:
df = pd.read_excel('dataset_excavate.xlsx') # Importing the dataset
df.head()

Unnamed: 0,functional group,A,A_OS,A',A'_OS,A_HOMO-,A_HOMO+,A_IE-,A_IE+,A_LUMO-,...,B_X+,B_Z_radii-,B_Z_radii+,B_e_affin-,B_e_affin+,PBE band gap,μ,μĀ,μ𝐵 ̅,t
0,AgBaAuCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,2.115,0.2225,2.4375,145.4,77.4,0.0,0.54286,0.13571,0.13571,0.92317
1,AgBaAuHgO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,2.27,0.125,2.535,135.4,87.4,0.0,0.56786,0.13571,0.16071,0.90845
2,AgBaMoCdO6,Ag,3,Ba,2,0.585,-3.82,114.05,616.95,0.6485,...,1.925,0.2525,2.4675,70.0,2.0,0.0,0.55714,0.13571,0.12143,0.9147
3,AgBiO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,2.02,0.0,1.997,0.0,91.3,0.0,0.54286,0.0,0.0,0.94608
4,AgBrO3,Ag,1,Ag,1,0.0,-4.405,0.0,731.0,0.0,...,2.96,0.0,1.2,0.0,324.7,0.1524,0.44286,0.0,0.0,1.01165


Adding the binary classifier column (insulator and non-insulator)

In [14]:
df['Is_Insulator'] = df['PBE band gap'].apply(lambda x: "Yes" if x >= 0.5 else "No")

Features to drop

In [15]:
#The Functional Group column is dropped
columns_to_drop = ["functional group"]
df = df.drop(columns=columns_to_drop, errors='ignore')

In [16]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

Encoding Categorical data

In [17]:
le = LabelEncoder()
df_encoded = df.copy()
df_encoded[categorical_cols] = df_encoded[categorical_cols].apply(LabelEncoder().fit_transform)

In [18]:
df_encoded = df_encoded[df_encoded['Is_Insulator'] == 1]

In [19]:
X_r=df_encoded.drop(['PBE band gap','Is_Insulator'], axis=1)
y_r=df_encoded['PBE band gap']

Splitting the data into Training and Test set

In [20]:
X_r_train,X_r_test,y_r_train,y_r_test = train_test_split(X_r,y_r,test_size=0.2,random_state=42)

#**Random Forest Regression Model**

In [21]:
rf_model = RandomForestRegressor(n_estimators=250, random_state=42, max_depth=8) #Creating random forest regression model
rf_model.fit(X_r_train, y_r_train) #Fitting the regression model with training set.


Test prediction

In [22]:
y_r_test_pred = rf_model.predict(X_r_test)
y_r_train_pred = rf_model.predict(X_r_train)

Accuracy Metrics

In [23]:
test_r_mse = mean_squared_error(y_r_test, y_r_test_pred) # Mean Squared Error
test_r_r2 = r2_score(y_r_test, y_r_test_pred) # R-Squared Error
print(f"Test MSE: {test_r_mse}")
print(f"Test R^2: {test_r_r2}")

Test MSE: 0.15470931020003945
Test R^2: 0.7509862156174507
