In [1]:
import matminer
from matminer.datasets import get_available_datasets
get_available_datasets()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split

boltztrap_mp: Effective mass and thermoelectric properties of 8924 compounds in The  Materials Project database that are calculated by the BoltzTraP software package run on the GGA-PBE or GGA+U density functional theory calculation results. The properties are reported at the temperature of 300 Kelvin and the carrier concentration of 1e18 1/cm3.

brgoch_superhard_training: 2574 materials used for training regressors that predict shear and bulk modulus.

castelli_perovskites: 18,928 perovskites generated with ABX combinatorics, calculating gllbsc band gap and pbe structure, and also reporting absolute band edge positions and heat of formation.

citrine_thermal_conductivity: Thermal conductivity of 872 compounds measured experimentally and retrieved from Citrine database from various references. The reported values are measured at various temperatures of which 295 are at room temperature.

dielectric_constant: 1,056 structures with dielectric properties, calculated with DFPT-PBE.

double_

DATA EXTRACTION AND DATA CLEANING

In [2]:
from matminer.datasets import load_dataset
df = load_dataset('mp_nostruct_20181018')


In [3]:
df.head(2574)

Unnamed: 0,mpid,formula,e_hull,gap pbe,mu_b,elastic anisotropy,bulk modulus,shear modulus,e_form
0,mp-85,In,0.003319,0.0000,2.700000e-05,1.044699,33.154748,4.904836,0.003319
1,mp-110,Mg,0.039182,0.0000,-1.360000e-05,-11.326659,35.636106,1.830272,0.039182
2,mp-20,Be,0.108143,0.0000,1.000000e-07,8.030000,124.000000,84.000000,0.108143
3,mp-8640,Hf,0.071216,0.0000,-2.050000e-05,0.881277,101.242732,44.836516,0.071216
4,mp-674158,P,3.509988,2.0113,3.000042e+00,10.884643,0.327165,-0.064038,3.509988
...,...,...,...,...,...,...,...,...,...
2569,mp-16366,Cu2Se,0.126698,0.0889,0.000000e+00,8.701684,86.124265,7.581333,0.014137
2570,mp-11836,ErSbPd,0.000000,0.3638,0.000000e+00,0.060000,90.000000,45.000000,-1.116196
2571,mp-11839,GdSbPt,0.000000,0.3258,7.000000e+00,,,,-1.187074
2572,mp-2318,Nb2C,0.016992,0.0000,2.200000e-05,2.783098,219.283742,86.768715,-0.443253


In [4]:
df.describe()

Unnamed: 0,e_hull,gap pbe,mu_b,elastic anisotropy,bulk modulus,shear modulus,e_form
count,83989.0,83989.0,83989.0,7676.0,7676.0,7676.0,83989.0
mean,0.142806,1.181326,5.315005,3.731337,105.175987,47.925097,-1.475265
std,0.423493,1.56429,12.624827,415.327469,282.756557,88.804893,1.247408
min,0.0,0.0,-84.003218,-13432.529515,-21843.0,-3476.434485,-4.522664
25%,0.0,0.0,0.0,0.141947,45.0,18.396291,-2.489777
50%,0.027675,0.2724,0.001509,0.511829,94.202871,38.626626,-1.61335
75%,0.094054,2.1139,5.000003,1.633226,154.0,71.530068,-0.484626
max,5.892481,17.8914,279.988888,30571.028103,7960.0,3295.183506,4.828697


In [5]:
df.shape

(83989, 9)

In [6]:
df.columns

Index(['mpid', 'formula', 'e_hull', 'gap pbe', 'mu_b', 'elastic anisotropy',
       'bulk modulus', 'shear modulus', 'e_form'],
      dtype='object')

In [7]:
df=df.drop(['mpid', 'e_hull', 'mu_b', 'elastic anisotropy',
 'e_form'],axis=1)
df


Unnamed: 0,formula,gap pbe,bulk modulus,shear modulus
0,In,0.0000,33.154748,4.904836
1,Mg,0.0000,35.636106,1.830272
2,Be,0.0000,124.000000,84.000000
3,Hf,0.0000,101.242732,44.836516
4,P,2.0113,0.327165,-0.064038
...,...,...,...,...
83984,Sr3(GaO3)2,3.5262,,
83985,Sr3Al2O6,4.2046,,
83986,MgSb2(H4O3)6,3.2827,,
83987,GaH18C3(N3F2)3,4.9759,,


In [8]:
df.describe()

Unnamed: 0,gap pbe,bulk modulus,shear modulus
count,83989.0,7676.0,7676.0
mean,1.181326,105.175987,47.925097
std,1.56429,282.756557,88.804893
min,0.0,-21843.0,-3476.434485
25%,0.0,45.0,18.396291
50%,0.2724,94.202871,38.626626
75%,2.1139,154.0,71.530068
max,17.8914,7960.0,3295.183506


In [9]:
df.to_csv('C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/bulk-modulus.csv')
df.to_csv('C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/mp_nostruct_20181018.csv')

In [10]:
PATH = os.getcwd()
data_path = os.path.join(PATH,'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/bulk-modulus.csv')

df = pd.read_csv(data_path)
print(f'Original DataFrame shape: {df.shape}')
df.drop(df.columns[df.columns.str.contains('Unnamed: 3',case=False)],axis=1,inplace=True)
df.drop(df.columns[df.columns.str.contains('Unnamed: 4',case=False)],axis=1,inplace=True)

Original DataFrame shape: (83989, 5)


In [11]:
df.drop(df.columns[df.columns.str.contains('Unnamed: 0',case=False)],axis=1,inplace=True)
df

Unnamed: 0,formula,gap pbe,bulk modulus,shear modulus
0,In,0.0000,33.154748,4.904836
1,Mg,0.0000,35.636106,1.830272
2,Be,0.0000,124.000000,84.000000
3,Hf,0.0000,101.242732,44.836516
4,P,2.0113,0.327165,-0.064038
...,...,...,...,...
83984,Sr3(GaO3)2,3.5262,,
83985,Sr3Al2O6,4.2046,,
83986,MgSb2(H4O3)6,3.2827,,
83987,GaH18C3(N3F2)3,4.9759,,


In [12]:
df2 = df.copy()

In [13]:
bool_nans_formula = df2['formula'].isnull()
bool_nans_T = df2['shear modulus'].isnull()
bool_nans_Cp = df2['bulk modulus'].isnull()
bool_nans_ev = df2['gap pbe'].isnull()

In [14]:
df2 = df2.drop(df2.loc[bool_nans_formula].index, axis=0)
df2 = df2.drop(df2.loc[bool_nans_T].index, axis=0)
df2 = df2.drop(df2.loc[bool_nans_Cp].index, axis=0)
df2 = df2.drop(df2.loc[bool_nans_ev].index, axis=0)


print(f'DataFrame shape before dropping NaNs: {df.shape}')
print(f'DataFrame shape after dropping NaNs: {df2.shape}')

DataFrame shape before dropping NaNs: (83989, 4)
DataFrame shape after dropping NaNs: (7676, 4)


In [15]:
df3 = df.copy()
df3 = df3.dropna(axis=0, how='any')

print(f'DataFrame shape before dropping NaNs: {df.shape}')
print(f'DataFrame shape after dropping NaNs: {df3.shape}')

df = df3.copy()

DataFrame shape before dropping NaNs: (83989, 4)
DataFrame shape after dropping NaNs: (7676, 4)


In [16]:
bool_invalid_T = df['shear modulus'] < 0
bool_invalid_Cp = df['bulk modulus'] < 0

df = df.drop(df.loc[bool_invalid_T].index, axis=0)
df = df.drop(df.loc[bool_invalid_Cp].index, axis=0)

print(f'Cleaned DataFrame shape: {df.shape}')

Cleaned DataFrame shape: (7416, 4)


In [17]:
out_path = os.path.join(PATH, 'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/cleansed.csv')
df.to_csv(out_path, index=False)

In [18]:
df = df.drop(['gap pbe'],axis=1)

In [19]:
df.describe()

Unnamed: 0,bulk modulus,shear modulus
count,7416.0,7416.0
mean,107.448058,52.4352
std,89.673519,61.316122
min,0.072811,0.0
25%,45.75101,20.0
50%,94.979752,40.194874
75%,154.762653,72.613885
max,2992.0,3295.183506


DATA VISUALIZATION

In [20]:
import matminer
from figrecipes import PlotlyFig
from matminer.datasets import load_dataset
df = pd.read_csv('cleansed.csv')
pf = PlotlyFig(df, y_title='Bulk Modulus (GPa)', x_title='Shear Modulus (GPa)', filename='bulk_shear_moduli')
pf.xy(('shear modulus','bulk modulus'), labels= 'formula', colorscale='Picnic',)

  colorbar_title = pd.Series(colorbar).name


SPLITTING DATA INTO TRAINING,VALIDATION AND TEST DATA SET

In [21]:
# Set a random seed to ensure reproducibility across runs
RNG_SEED = 42
np.random.seed(seed=RNG_SEED)

In [22]:
PATH = os.getcwd()
data_path = os.path.join(PATH, 'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/CLEANSED.csv')

df = pd.read_csv(data_path)
print(f'Full DataFrame shape: {df.shape}')

Full DataFrame shape: (7416, 4)


In [23]:
X = df[['formula', 'shear modulus']]
y = df['bulk modulus']

print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

Shape of X: (7416, 2)
Shape of y: (7416,)


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RNG_SEED)

print(X_train.shape)
print(X_test.shape)

(5932, 2)
(1484, 2)


In [25]:
num_rows = len(X_train)
print(f'There are in total {num_rows} rows in the X_train DataFrame.')

num_unique_formulae = len(X_train['formula'].unique())
print(f'But there are only {num_unique_formulae} unique formulae!\n')

print('Unique formulae and their number of occurances in the X_train DataFrame:')
print(X_train['formula'].value_counts(), '\n')
print('Unique formulae and their number of occurances in the X_test DataFrame:')
print(X_test['formula'].value_counts())

There are in total 5932 rows in the X_train DataFrame.
But there are only 5108 unique formulae!

Unique formulae and their number of occurances in the X_train DataFrame:
Al2O3      20
SiO2       17
C          15
LiVF4      15
SiC        12
           ..
Na2TlBi     1
CaZrS3      1
PbCO3       1
SbF3        1
AcMg3       1
Name: formula, Length: 5108, dtype: int64 

Unique formulae and their number of occurances in the X_test DataFrame:
WO3         5
SiC         5
Si          5
TiO2        4
C           4
           ..
Na3SbS4     1
Na2Pd3O4    1
SrBi3       1
Pd4S        1
ScInIr2     1
Name: formula, Length: 1408, dtype: int64


In [26]:
unique_formulae = X['formula'].unique()
print(f'{len(unique_formulae)} unique formulae:\n{unique_formulae}')

6254 unique formulae:
['In' 'Mg' 'Be' ... 'Mg41Al67' 'Mg2Au' 'Li12Si7']


In [27]:
# Set a random seed to ensure reproducibility across runs
np.random.seed(seed=RNG_SEED)

# Store a list of all unique formulae
all_formulae = unique_formulae.copy()

# Define the proportional size of the dataset split
val_size = 0.20
test_size = 0.10
train_size = 1 - val_size - test_size

# Calculate the number of samples in each dataset split
num_val_samples = int(round(val_size * len(unique_formulae)))
num_test_samples = int(round(test_size * len(unique_formulae)))
num_train_samples = int(round((1 - val_size - test_size) * len(unique_formulae)))

# Randomly choose the formulate for the validation dataset, and remove those from the unique formulae list
val_formulae = np.random.choice(all_formulae, size=num_val_samples, replace=False)
all_formulae = [f for f in all_formulae if f not in val_formulae]

# Randomly choose the formulate for the test dataset, and remove those from the unique formulae list
test_formulae = np.random.choice(all_formulae, size=num_test_samples, replace=False)
all_formulae = [f for f in all_formulae if f not in test_formulae]

# The remaining formulae will be used for the training dataset
train_formulae = all_formulae.copy()

print('Number of training formulae:', len(train_formulae))
print('Number of validation formulae:', len(val_formulae))
print('Number of testing formulae:', len(test_formulae))

Number of training formulae: 4378
Number of validation formulae: 1251
Number of testing formulae: 625


In [28]:
# Split the original dataset into the train/validation/test datasets using the formulae lists above
df_train = df[df['formula'].isin(train_formulae)]
df_val = df[df['formula'].isin(val_formulae)]
df_test = df[df['formula'].isin(test_formulae)]

print(f'train dataset shape: {df_train.shape}')
print(f'validation dataset shape: {df_val.shape}')
print(f'test dataset shape: {df_test.shape}\n')

print(df_train.head(), '\n')
print(df_val.head(), '\n')
print(df_test.head(), '\n')

train dataset shape: (5190, 4)
validation dataset shape: (1501, 4)
test dataset shape: (725, 4)

  formula  gap pbe  bulk modulus  shear modulus
0      In      0.0     33.154748       4.904836
1      Mg      0.0     35.636106       1.830272
2      Be      0.0    124.000000      84.000000
3      Hf      0.0    101.242732      44.836516
4      Sr      0.0     11.543947       6.232662 

   formula  gap pbe  bulk modulus  shear modulus
8       Cu   0.0000    145.872296      19.068956
15      Hg   0.0000      8.500236       2.913088
16      Ru   0.0282    309.000000     186.000000
18      Na   0.0000      7.845522      11.072713
20      H2   7.3713      0.072811       0.009392 

   formula  gap pbe  bulk modulus  shear modulus
21      Sm   0.0123     37.000000      23.000000
39      Mo   0.0000    262.389753     124.416552
53      Os   0.0134    408.000000     231.000000
59      Yb   0.0000     15.397697       9.405890
69       V   0.0000    179.460907      30.499982 



In [29]:
train_formulae = set(df_train['formula'].unique())
val_formulae = set(df_val['formula'].unique())
test_formulae = set(df_test['formula'].unique())

common_formulae1 = train_formulae.intersection(test_formulae)
common_formulae2 = train_formulae.intersection(val_formulae)
common_formulae3 = test_formulae.intersection(val_formulae)

print(f'# of common formulae in intersection 1: {len(common_formulae1)}; common formulae: {common_formulae1}')
print(f'# of common formulae in intersection 2: {len(common_formulae2)}; common formulae: {common_formulae2}')
print(f'# of common formulae in intersection 3: {len(common_formulae3)}; common formulae: {common_formulae3}')

# of common formulae in intersection 1: 0; common formulae: set()
# of common formulae in intersection 2: 0; common formulae: set()
# of common formulae in intersection 3: 0; common formulae: set()


In [30]:
# saving these splits into csv files
PATH = os.getcwd()

train_path = os.path.join(PATH,'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/cp_train.csv')
val_path = os.path.join(PATH, 'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/cp_val.csv')
test_path = os.path.join(PATH,'C:/Users/M PRAVEEN/OneDrive/AMRITA/steel/cp_test.csv')

df_train.to_csv(train_path, index=False)
df_val.to_csv(val_path, index=False)
df_test.to_csv(test_path, index=False)

In [31]:
df_train
df=df_train.drop(['gap pbe'],axis=1)
df


Unnamed: 0,formula,bulk modulus,shear modulus
0,In,33.154748,4.904836
1,Mg,35.636106,1.830272
2,Be,124.000000,84.000000
3,Hf,101.242732,44.836516
4,Sr,11.543947,6.232662
...,...,...,...
7410,Li7TaN4,82.822195,65.621100
7411,MgZn,51.737266,11.896905
7412,Mg41Al67,55.179580,22.661890
7414,C,118.462873,91.024121


In [32]:
df_val
df=df_val.drop(['gap pbe'],axis=1)
df


Unnamed: 0,formula,bulk modulus,shear modulus
8,Cu,145.872296,19.068956
15,Hg,8.500236,2.913088
16,Ru,309.000000,186.000000
18,Na,7.845522,11.072713
20,H2,0.072811,0.009392
...,...,...,...
7396,Al2O3,196.901837,110.851875
7400,Mn2O3,150.114770,51.170921
7406,Li7SbN4,74.893103,63.824312
7408,Li7VN4,83.227453,72.213241


In [33]:
df_test
df=df_test.drop(['gap pbe'],axis=1)
df

Unnamed: 0,formula,bulk modulus,shear modulus
21,Sm,37.000000,23.000000
39,Mo,262.389753,124.416552
53,Os,408.000000,231.000000
59,Yb,15.397697,9.405890
69,V,179.460907,30.499982
...,...,...,...
7384,CaSiO3,92.000000,47.000000
7386,Y4Al2O9,132.290151,64.320021
7389,Li(BH)5,16.000000,9.000000
7391,Cr5O12,58.922364,41.358856
