In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error

In [9]:
def get_raw_df():
    return pd.read_csv(r"C:\Users\91910\Desktop\IIT_Kanpur\Semester6\MSE643A\Project\bandGap\dielectric_constant_data.csv")

def get_propsFromStruct(df): #returns len, angles and periodic properties of a crystal structure for all compounds
    structure_arr = []

    for struct in df['structure']:
        finalStruct_arr = []
        modLen_arr = []
        modAngl_arr = []
        #modPerio_arr = []
        split_arr = struct.split('\n')
        len_arr = split_arr[2].split()[2:]
        angl_arr = split_arr[3].split()[1:]
        #perio_arr = split_arr[4].split()[2:]
        for i in range(3):
            modLen_arr.append(float(len_arr[i]))
            modAngl_arr.append(float(angl_arr[i]))
        for ele in modLen_arr:
            finalStruct_arr.append(ele)
        for ele in modAngl_arr:
            finalStruct_arr.append(ele)
#         for ele in perio_arr:
#             finalStruct_arr.append(ele)
        structure_arr.append(finalStruct_arr)
        
    return np.array(structure_arr)

def transform_raw_e_electronic(df): #transforms e_electronic into model readable values
    res = []
    for tensor_per_struct in df['e_electronic']:
        tensor_split = tensor_per_struct[2:-2].split(',')
        for i in range(len(tensor_split)):
            if (i==2 or i==5):
                tensor_split[i] = tensor_split[i].strip()[:-1]
            elif (i==3 or i==6):
                tensor_split[i] = tensor_split[i].strip()[1:]
        for j in range(len(tensor_split)):
            tensor_split[j] = float(tensor_split[j])
        res.append(tensor_split)
    return np.array(res)

def transform_raw_e_total(df): #transforms e_total into model readable values
    res = []
    for tensor_per_struct in df['e_total']:
        tensor_split = tensor_per_struct[2:-2].split(',')
        for i in range(len(tensor_split)):
            if (i==2 or i==5):
                tensor_split[i] = tensor_split[i].strip()[:-1]
            elif (i==3 or i==6):
                tensor_split[i] = tensor_split[i].strip()[1:]
        for j in range(len(tensor_split)):
            tensor_split[j] = float(tensor_split[j])
        res.append(tensor_split)
    return np.array(res)

def get_mod_df():
    raw_df = get_raw_df()
    mod_df = raw_df.copy()
    mod_df.drop(raw_df.columns[[0,1,2,6,8,9,14,15,16]], inplace= True, axis= 1)
    transformed_e_electronic = transform_raw_e_electronic(raw_df)
    transformed_e_total = transform_raw_e_total(raw_df)
    propsFromStruct_arr = get_propsFromStruct(raw_df)
    for i in range(9):
        mod_df['e_e_' + str(i)] = transformed_e_electronic[:, i]
    for i in range(9):
        mod_df['e_t_' + str(i)] = transformed_e_total[:, i]
    mod_df['len_a'] = propsFromStruct_arr[:,0]
    mod_df['len_b'] = propsFromStruct_arr[:,1]
    mod_df['len_c'] = propsFromStruct_arr[:,2]
    mod_df['angl_alpha'] = propsFromStruct_arr[:,3]
    mod_df['angl_beta'] = propsFromStruct_arr[:,4]
    mod_df['angl_gamma'] = propsFromStruct_arr[:,5]
    #below are excluded as they are true for all the compounds
#     mod_df['prd_p'] = propsFromStruct_arr[:,6]
#     mod_df['prd_b'] = propsFromStruct_arr[:,7]
#     mod_df['prd_c'] = propsFromStruct_arr[:,8]
    return mod_df

def transform_boolData(df, col_name): #1d array must be given
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])
    
def scale_features(df):
    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(df)
    return scaled_df
    
def normal_split(df, test_size, shuffle):
    X = df.drop('band_gap', axis=1)
    y = df['band_gap']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=shuffle)
    
    return (X_train, X_test, y_train, y_test)

def evaluate_metrics(y_test, y_preds):
    r2 = r2_score(y_test, y_preds)
    mae = mean_absolute_error(y_test, y_preds)
    evs = explained_variance_score(y_test, y_preds)

    print('r2 score is ', r2)
    print('mean absolute error is ', mae)
    print('explained variance score is ', evs)

In [4]:
mod_df = get_mod_df()

mod_df

Unnamed: 0,nsites,space_group,volume,band_gap,n,poly_electronic,poly_total,pot_ferroelectric,e_e_0,e_e_1,...,e_t_5,e_t_6,e_t_7,e_t_8,len_a,len_b,len_c,angl_alpha,angl_beta,angl_gamma
0,3,225,159.501208,1.88,1.86,3.44,6.23,False,3.441158,-3.097000e-05,...,2.481000e-05,-0.000095,2.175000e-05,6.235207,6.087322,6.087322,6.087323,60.000003,60.000002,59.999999
1,3,166,84.298097,3.52,1.78,3.16,6.73,False,3.346884,-4.498543e-02,...,-9.046643e-01,-1.463589,-9.046600e-01,3.945366,6.710024,6.710024,6.710023,34.019895,34.019897,34.019896
2,3,164,108.335875,1.17,2.23,4.97,10.64,False,5.543085,-5.280000e-06,...,4.435100e-04,0.000123,4.420300e-04,4.315681,4.158086,4.158086,7.235270,90.000000,90.000000,120.000001
3,4,186,88.162562,1.12,2.65,7.04,17.99,False,7.093167,7.990000e-06,...,2.064269e-02,-0.008708,1.761772e-02,20.396643,4.132865,4.132865,5.960067,90.000000,90.000000,119.999995
4,6,136,82.826401,2.87,1.53,2.35,7.12,False,2.423962,7.452000e-05,...,1.344158e-02,0.001279,1.441676e-02,7.459124,3.354588,4.968952,4.968952,90.000000,90.000000,90.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,7,111,212.493121,0.87,2.77,7.67,11.76,True,7.748968,0.000000e+00,...,0.000000e+00,0.000000,0.000000e+00,11.562826,5.912075,5.912075,6.079460,90.000000,90.000000,90.000000
1052,8,194,220.041363,3.60,2.00,3.99,7.08,True,4.405044,6.100000e-07,...,-1.000000e-08,0.000000,-1.000000e-08,3.696193,4.137833,4.137856,14.839681,90.000000,90.000000,119.999810
1053,4,216,73.882306,0.14,14.58,212.61,232.60,True,212.607502,-1.843000e-05,...,1.830120e-03,0.002588,1.830120e-03,232.596394,4.709974,4.709973,4.709974,59.999995,59.999999,60.000000
1054,5,221,177.269065,0.21,2.53,6.41,22.44,True,6.405117,0.000000e+00,...,0.000000e+00,0.000000,0.000000e+00,22.438270,5.617516,5.617516,5.617516,90.000000,90.000000,90.000000


In [7]:
X_train, X_test, y_train, y_test = normal_split(mod_df, 0.15, True)

X_train

Unnamed: 0,nsites,space_group,volume,n,poly_electronic,poly_total,pot_ferroelectric,e_e_0,e_e_1,e_e_2,...,e_t_5,e_t_6,e_t_7,e_t_8,len_a,len_b,len_c,angl_alpha,angl_beta,angl_gamma
12,3,225,69.537895,2.23,4.97,8.42,False,4.967592,0.000034,-0.000012,...,-1.039000e-04,-1.559700e-04,-1.038100e-04,8.423227,4.615785,4.615784,4.615784,60.000001,60.000005,60.000003
307,12,189,343.822654,2.73,7.48,12.79,False,6.460945,0.000078,-0.000043,...,-1.641700e-04,-3.729000e-05,-1.133700e-04,16.965765,8.100385,8.100386,6.050517,90.000000,90.000000,120.000000
371,14,15,191.599724,1.65,2.71,5.99,False,2.687373,-0.100676,-0.011132,...,7.393600e-01,6.748203e-01,7.393430e-01,6.151129,5.534123,5.534123,7.312596,68.035217,68.035221,87.064735
914,7,82,189.718519,2.73,7.44,11.06,True,7.430698,0.013192,0.022335,...,7.963423e-02,6.357474e-02,7.962866e-02,11.122676,5.875322,5.875322,6.889534,115.238976,115.238979,89.999999
290,8,176,349.831956,1.49,2.22,2.37,True,2.404893,-0.000066,-0.000035,...,1.439000e-05,-3.330000e-06,8.720000e-06,1.853091,7.000120,7.000121,8.243618,90.000000,90.000000,120.000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,8,194,353.715043,2.84,8.09,13.83,False,7.718620,-0.000011,-0.000034,...,-1.217060e-03,-2.653100e-04,-1.092290e-03,14.939210,6.127727,6.127727,10.877375,90.000000,90.000000,120.000005
466,4,166,54.633956,2.15,4.62,10.55,True,4.970524,0.000000,0.000000,...,4.730000e-06,-1.000000e-08,4.730000e-06,8.637382,6.529411,6.529411,6.529411,28.121970,28.121970,28.121968
121,5,217,87.578887,1.28,1.63,2.17,False,1.630283,0.000035,0.000021,...,-1.544160e-03,-2.505900e-04,-1.514870e-03,2.168502,4.845521,4.845520,4.845520,109.471215,109.471213,109.471224
1044,4,166,139.017820,2.26,5.12,13.09,True,5.473830,-0.056863,-0.338594,...,-1.057886e+00,-1.743245e+00,-1.057886e+00,9.109768,8.403218,8.403219,8.403218,30.925733,30.925731,30.925728


In [10]:
scaled_X_train = scale_features(X_train)
scaled_X_test = scale_features(X_test)
scaled_y_train = scale_features(np.array(y_train).reshape(len(y_train),1))
scaled_y_test = scale_features(np.array(y_test).reshape(len(y_test),1))

scaled_X_train

array([[-1.34077143,  1.21400756, -0.98760568, ..., -0.94192141,
        -0.9435328 , -0.94595742],
       [ 1.31283185,  0.67472327,  1.80302917, ...,  0.36021087,
         0.37764954,  1.28606691],
       [ 1.90252146, -1.93181747,  0.25427882, ..., -0.59315759,
        -0.58966656,  0.06086163],
       ...,
       [-0.75108182,  1.09416661, -0.80405257, ...,  1.20534748,
         1.23515044,  0.89439215],
       [-1.04592662,  0.33018053, -0.28070129, ..., -2.20387287,
        -2.22394692, -2.02753229],
       [-0.45623701, -1.39253317,  0.67848089, ...,  0.36021087,
         0.37764954,  0.17005469]])

In [13]:
pca = PCA(0.95)
X_train_pca = pca.fit_transform(scaled_X_train)
X_test_pca = pca.transform(scaled_X_test)

X_train_pca_df = pd.DataFrame(X_train_pca)
X_test_pca_df = pd.DataFrame(X_test_pca)

X_train_pca_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.595446,-0.012375,1.721708,-1.867611,0.982228,-0.071945,1.185256,-0.073025,0.374253,-0.468896,-0.599953,0.893499,0.103646,-0.047398
1,-0.185910,-0.234598,-0.352385,2.329739,-1.424783,0.097726,0.311534,-0.736688,-0.683869,-0.249724,-1.590697,1.627538,-0.198436,0.214654
2,-1.354690,-3.126489,0.170450,1.877449,0.212134,-0.331521,3.777242,-1.102683,-0.120430,2.171017,1.409099,-0.859238,-0.012029,2.493842
3,-0.082840,-0.781690,-1.577090,0.371034,-0.313362,-0.153453,-0.466647,0.004492,-1.175879,-0.290152,0.358196,-0.518664,0.739889,-0.099329
4,-1.537844,-0.250397,-0.685715,1.361197,-0.955432,-0.038558,-1.082050,0.534714,-0.677695,0.796498,-0.701204,0.686737,0.379184,0.010809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,0.131571,-0.296555,-0.955236,1.553933,-0.871807,0.142892,0.374766,0.253741,0.883011,0.351715,-1.821087,0.946544,0.765862,-0.073324
893,-0.755928,0.360693,4.064398,-0.939980,0.277562,-0.043465,0.031688,0.463353,0.236465,0.087652,0.755843,-0.716375,0.086580,-0.184691
894,-1.432168,-0.420759,-1.462606,-1.328309,0.712814,-0.149853,0.579222,-0.199615,-0.455606,-0.334723,-0.925238,1.793070,0.449111,0.069082
895,-0.711381,5.728354,3.704863,1.001086,0.514875,-1.096504,-1.236766,0.885557,0.607122,-1.057274,0.296485,-0.302939,0.663550,0.456189


In [17]:
regr1 = svm.SVR()
regr1.fit(X_train_pca_df, scaled_y_train)
y_preds1 = regr1.predict(X_test_pca_df)

evaluate_metrics(scaled_y_test, y_preds1)

r2 score is  0.6605718703955389
mean absolute error is  0.449461067137443
explained variance score is  0.6823043988347357


  y = column_or_1d(y, warn=True)


In [18]:
regr2 = DecisionTreeRegressor()
regr2.fit(X_train_pca_df, scaled_y_train)
y_preds2 = regr2.predict(X_test_pca_df)

evaluate_metrics(scaled_y_test, y_preds2)

r2 score is  -0.5690162730919621
mean absolute error is  0.8333704605213927
explained variance score is  -0.3408476448266773


In [19]:
regr3 = RandomForestRegressor()
regr3.fit(X_train_pca_df, scaled_y_train)
y_preds3 = regr3.predict(X_test_pca_df)

evaluate_metrics(scaled_y_test, y_preds3)

  regr3.fit(X_train_pca_df, scaled_y_train)


r2 score is  0.2276816909548145
mean absolute error is  0.6195404331511847
explained variance score is  0.36696252889202197


In [20]:
regr4 = GradientBoostingRegressor()
regr4.fit(X_train_pca_df, scaled_y_train)
y_preds4 = regr4.predict(X_test_pca_df)

evaluate_metrics(scaled_y_test, y_preds4)

  y = column_or_1d(y, warn=True)


r2 score is  0.1557141287667173
mean absolute error is  0.6404294904022528
explained variance score is  0.33733670748338684


In [21]:
regr5 = KNeighborsRegressor()
regr5.fit(X_train_pca_df, scaled_y_train)
y_preds5 = regr5.predict(X_test_pca_df)

evaluate_metrics(scaled_y_test, y_preds5)

r2 score is  0.5326481347685033
mean absolute error is  0.4920216961891385
explained variance score is  0.5358406155541872
