# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

# Loading the data and applying feature filtering

In [2]:
#loading the data
df = pd.read_csv("2_Full Dataset (Features).csv")

#designating X and Y
mp_id = df['mp_id']
df1 = df.drop(['material', 'mp_id','phonon_band_center','counter'], axis=1)
Y = df['phonon_band_center']

#Feature filtering (variance threshold)
vt = VarianceThreshold(threshold=0)
vt.fit(df1)
vt.get_support()
concol = [column for column in df1.columns
           if column not in df1.columns[vt.get_support()]]
df2 = df1.drop(concol, axis=1)
df2 = df2.dropna()

Pearson and Spearman correlations

In [4]:
# 2. Spearman and Pearson correlations

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr(method='spearman')
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[j]  # getting the name of column
                col_corr.add(colname)
    af_corr = dataset.drop(col_corr,axis=1)
    return af_corr

def correlation2(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr(method='pearson')
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[j]  # getting the name of column
                col_corr.add(colname)
    af_corr = dataset.drop(col_corr,axis=1)
    return af_corr

In [5]:
af_both2 = correlation(df2, .8)
af_both2.shape
af_both3 = correlation2(af_both2, .8)
af_both3.shape

(1272, 72)

In [7]:
af_both3.columns

Index(['min_EffectiveCoordination', 'max_EffectiveCoordination',
       'min_MeanBondLength', 'var_BondLengthVariation',
       'min_BondLengthVariation', 'max_BondLengthVariation', 'var_CellVolume',
       'mean_WCMagnitude_Shell3', 'MaxPackingEfficiency',
       'min_NeighDiff_shell1_AtomicWeight', 'range_NeighDiff_shell1_Row',
       'range_NeighDiff_shell1_Electronegativity',
       'range_NeighDiff_shell1_NpValence', 'range_NeighDiff_shell1_NdValence',
       'range_NeighDiff_shell1_NValance', 'min_NeighDiff_shell1_NpUnfilled',
       'range_NeighDiff_shell1_NpUnfilled', 'min_NeighDiff_shell1_GSvolume_pa',
       'range_NeighDiff_shell1_GSvolume_pa',
       'range_NeighDiff_shell1_GSbandgap',
       'range_NeighDiff_shell1_SpaceGroupNumber', 'NComp', 'Comp_L10Norm',
       'max_MeltingT', 'most_MeltingT', 'mean_Column', 'dev_Column', 'dev_Row',
       'max_Row', 'dev_CovalentRadius', 'min_CovalentRadius', 'max_NpValence',
       'dev_NdValence', 'most_NfValence', 'min_NValance', '

Scaling

In [8]:
#MinMax Scaling
scaler = MinMaxScaler()
scaler.fit(af_both3)
df_scaled = scaler.transform(af_both3)
df_scaled_d = pd.DataFrame(df_scaled)
df_scaled_d.columns = af_both3.columns

# Testing 

In [9]:
#Training and testing 80%-20% split
X_train, X_test, y_train, y_test = train_test_split(df_scaled_d, Y,test_size=0.2,random_state =123)

In [10]:
#Analysis of promising algorithms using LazyPredict
reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:35<00:00,  1.19it/s]


In [12]:
#Using XT as the promising model to work with
reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
reg.score(X_test, y_test)

0.9710943398560626

10-fold CV

In [13]:
#10-fold Cross validation 
scores_r2 = []
scores_mae = []
scores_rmse = []

kf = KFold(n_splits=10, random_state=None, shuffle=False)
for i, (train_index, test_index) in enumerate(kf.split(df_scaled_d)):
    X_train,y_train  = df_scaled_d.loc[train_index],Y.loc[train_index]
    X_test,y_test = df_scaled_d.loc[test_index],Y.loc[test_index]
    reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    scores_r2.append(reg.score(X_test, y_test))
    scores_mae.append(mean_absolute_error(reg.predict(X_test),y_test))
    scores_rmse.append(math.sqrt(mean_squared_error(reg.predict(X_test),y_test)))

In [15]:
#Estimating R^2, MAE, RMSE of the model
r2_sum = sum(scores_r2)/10
mae_sum = sum(scores_mae)/10
rmse_sum = sum(scores_rmse)/10
print(r2_sum, mae_sum, rmse_sum)

-13.105163608204412 3.49873589664287 4.311884268187535


# Validation

In [10]:
#Validation of the model using more validation data
val= pd.read_csv() #Please add any validation data
val_reduced = val[X_train.columns]
val_scaled = scaler.transform(val_reduced)
val_scaled_d = pd.DataFrame(val_scaled)
val_scaled_d.columns = X_train.columns

In [14]:
#building the dinal mnodel
reg = ExtraTreesRegressor(n_estimators=100, random_state=96)
reg.fit(df_scaled_d,Y)
outcome = reg.predict(val_scaled_d)