**Q 1.** Using the Jupyter Notebook of Tutorial 18, develop a linear model between ”Atomic Mass” and ”Youngs
Modulus”

**Q 2.** Using the Jupyter Notebook of Tutorial 18 develop a linear model between ”CTE” and ”Youngs Modulus”

**Q 3.** Find how to develop a multilinear regression model and apply such that ”Atomic Mass”, ”CTE”, ”Melting
Temperature”, ”Lattice Constant” and ”Specific Heat” are the ’X’ vectors and ”Youngs Modulus” is the
Y vector. For this problem, show training, test prediction, and combined prediction separately.

Your Jupyter Notebook must show various decorated plots as appropriate for clearly understanding the
problem setup, methodology and model outcome.


In [95]:
import numpy as np
import pymatgen.core as pymat # note that import pymatgen will work but will not find any classes.
import mendeleev as mendel
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from random import shuffle
import matplotlib.pyplot as plt

In [96]:
fcc_elements = ["Ag", "Al", "Au", "Cu", "Ir", "Ni", "Pb", "Pd", "Pt", "Rh", "Th", "Yb"]
bcc_elements = ["Ba", "Cr", "Eu", "Fe", "Li", "Mn", "Mo", "Na", "Nb", "Ta", "V", "W" ]
hcp_elements = ["Be", "Ca", "Cd", "Co", "Dy", "Er", "Gd", "Hf", "Ho", "Lu", "Mg", "Re", 
                "Ru", "Sc", "Tb", "Ti", "Tl", "Tm", "Y", "Zn", "Zr"]
others = ["Sb", "Sm", "Bi", "Ce", "Sn", "Si"]
# Others (Solids): "Sb", "Sm", Bi" and "As" are Rhombohedral; "C" , "Ce" and "Sn" are Allotropic; 
# "Si" and "Ge" are Face-centered diamond-cubic;

elements = fcc_elements + bcc_elements + hcp_elements + others

shuffle(elements) 

data_atomic_mass = []  #1
data_youngs_modulus = [] #2
data_CTE = [] #3

data_lattice_constant = [] #4
data_melting_point = []  #5
data_specific_heat = [] #6


for item in elements:
    
    data_youngs_modulus.append(pymat.Element(item).youngs_modulus)
    data_lattice_constant.append(mendel.element(item).lattice_constant)
    data_melting_point.append(mendel.element(item).melting_point)
    data_specific_heat.append(mendel.element(item).specific_heat)
    data_atomic_mass.append(pymat.Element(item).atomic_mass)
    data_CTE.append(pymat.Element(item).coefficient_of_linear_thermal_expansion)

## For the linear regression between the young's modulus and atomic mass

In [94]:

print('Youngs Modulus')
print(data_youngs_modulus)
print('\n')

print('Atomic Mass')
print(data_atomic_mass)

Youngs Modulus
[69.0, 279.0, 79.0, 68.0, 8.0, 78.0, 528.0, 65.0, 24.0, 4.9, 34.0, 55.0, 74.0, 45.0, 168.0, 130.0, 186.0, 105.0, 18.0, 50.0, 55.0, 83.0, 50.0, 47.0, 329.0, 121.0, 287.0, 32.0, 50.0, 200.0, 463.0, 70.0, 56.0, 70.0, 13.0, 64.0, 209.0, 198.0, 74.0, 10.0, 211.0, 16.0, 447.0, 411.0, 108.0, 116.0, 61.0, 275.0, 20.0, 128.0, 78.0]


Atomic Mass
[174.967, 51.9961, 232.03806, 91.224, 204.3833, 178.49, 192.217, 164.93032, 173.04, 6.941, 140.116, 157.25, 44.955912, 24.305, 195.084, 63.546, 180.94788, 92.90638, 151.964, 150.36, 121.76, 107.8682, 112.411, 28.0855, 95.94, 106.42, 9.012182, 208.9804, 118.71, 58.6934, 186.207, 167.259, 158.92535, 26.9815386, 137.327, 88.90585, 58.933195, 54.938045, 168.93421, 22.98976928, 55.845, 207.2, 101.07, 183.84, 65.409, 47.867, 162.5, 102.9055, 40.078, 50.9415, 196.966569]


In [52]:
# We will divide the test and train data for x = atomic mass and y = Young's modulus
young_train = data_youngs_modulus[:45]    #dividing the data based on 90%train size
young_test = data_youngs_modulus[-6:]     #and 10% test size
# y = young's modulus
young_train = np.array(young_train).reshape(-1,1)
young_test = np.array(young_test).reshape(-1,1)  #creating array columnwise


mass_train = data_atomic_mass[:45]
mass_test = data_atomic_mass[-6:]
# x = atomic mass

mass_train = np.array(mass_train).reshape(-1,1)
mass_test = np.array(mass_test).reshape(-1,1)

In [83]:
def regression(x_train, x_test, y_train, y_test):
    
    # Define the model and train it
    model = linear_model.LinearRegression()
    # fit (or train !) the model using the training data.
    model.fit(x_train, y_train)
    
    #Join train + test data 
    full_x = np.concatenate((x_train, x_test), axis=0)
    full_y = np.concatenate((y_train, y_test), axis=0)
    
    # Use the model to predict the entire set of data
    predictions = model.predict(full_x) # Make it for all values
    
    # Print model and mean squared error and variance score
    print("Linear Equation: %.4e X + (%.4e)"%(model.coef_, model.intercept_))
    print("Mean squared error: %.4e" % (mean_squared_error(full_y, predictions)))
    print('Variance score: %.4f' % r2_score(full_y, predictions))    
    
    return predictions

In [84]:
import plotly 
import plotly.graph_objs as go 
from plotly.offline import iplot 

plotly.offline.init_notebook_mode(connected=True)

def plot(x_train, x_test, y_train, y_test, x_label, y_label, predictions):
    
    
    x_train = x_train.reshape(1,-1).tolist()[0]
    x_test = x_test.reshape(1,-1).tolist()[0]
    y_train = y_train.reshape(1,-1).tolist()[0]
    y_test = y_test.reshape(1,-1).tolist()[0]    
    predictions = predictions.reshape(1,-1).tolist()[0]
    full_x_list = x_train + x_test

    
    layout0= go.Layout(hovermode= 'closest', width = 800, height=600, showlegend=True, 
    xaxis= dict(title=go.layout.xaxis.Title(text=x_label, font=dict(size=24)), zeroline= False, gridwidth= 1, tickfont=dict(size=18)), 
    yaxis= dict(title=go.layout.yaxis.Title(text=y_label, font=dict(size=24)), zeroline= False, gridwidth= 1, tickfont=dict(size=18)), 
    legend=dict(font=dict(size=24))) # Adding a legend
    

    training = go.Scatter(x = x_train, y = y_train, mode = 'markers', 
                          marker= dict(size= 10, color= 'green'), name= "Training Data") 
    
    actual = go.Scatter(x = x_test, y = y_test, mode = 'markers', 
                        marker= dict(size= 10, color= 'red'), name= "Testing Data") 

    prediction = go.Scatter(x = full_x_list, y = predictions, mode = 'lines', 
                            line = dict(color = "blue", width = 1.5),name= "Model") 
    

    data = [training, actual, prediction]
    fig= go.Figure(data, layout=layout0)
    iplot(fig)

In [85]:
# We will divide the test and train data for x = atomic mass and y = Young's modulus
young_train = data_youngs_modulus[:45]    #dividing the data based on 90%train size
young_test = data_youngs_modulus[-6:]     #and 10% test size
# y = young's modulus
young_train = np.array(young_train).reshape(-1,1)
young_test = np.array(young_test).reshape(-1,1)  #creating array columnwise


mass_train = data_atomic_mass[:45]
mass_test = data_atomic_mass[-6:]
# x = atomic mass

mass_train = np.array(mass_train).reshape(-1,1)
mass_test = np.array(mass_test).reshape(-1,1)

In [86]:
predictions = regression(mass_train, mass_test, young_train, young_test) 

plot(mass_train, mass_test, young_train, young_test, "Atomic Mass", "Young's Modulus (GPa)", predictions) 
# A very low R2 score here the score is - ve which signifys that the variables are highly uncorrelated and dont have linear relationship

Linear Equation: 8.1959e-02 X + (1.1557e+02)
Mean squared error: 1.5787e+04
Variance score: -0.0007


# Linear model between CTE and young's modulus
$$ y = \beta{_0} + \beta{_1}x$$

In [87]:
## Now for the linear model between CTE and young's modulus
print('CTE')
print(data_CTE)

CTE
[9.4e-06, 4.5e-06, 2.89e-05, 8.2e-06, 1.1e-05, 6.3e-06, 1.22e-05, 9.9e-06, 2.99e-05, 2.06e-05, 1.18e-05, 1.34e-05, 1.18e-05, 2.17e-05, 4.9e-06, 6.3e-06, 1.33e-05, 1.27e-05, 3.02e-05, 2.6e-06, 1.1e-05, 9.9e-06, 1.12e-05, 1.06e-05, 1.65e-05, 3.5e-05, 8.8e-06, 6.2e-06, 8.6e-06, 2.31e-05, 7.3e-06, 3.08e-05, 7.1e-05, 6.4e-06, 1.89e-05, 5.7e-06, 2.23e-05, 2.63e-05, 1.34e-05, 1.3e-05, 1.42e-05, 2.2e-05, 6.4e-06, 8.4e-06, 4.8e-06, 8.2e-06, 1.13e-05, 1.02e-05, 5.9e-06, 4.6e-05, 1.03e-05]


In [88]:
# x = CTE data 
coefTE_train = data_CTE[:40]    #78 % train size of data
coefTE_test = data_CTE[-11:]     #22 % of the test size 

coefTE_train = np.array(coefTE_train).reshape(-1,1)
coefTE_test = np.array(coefTE_test).reshape(-1,1)

# y = young's modulus
young_train = data_youngs_modulus[:40]    #dividing the data based on 78%train size
young_test = data_youngs_modulus[-11:]     #and 22% test size
# y = young's modulus
young_train = np.array(young_train).reshape(-1,1)
young_test = np.array(young_test).reshape(-1,1)  #creating array columnwise

In [89]:
predictions = regression(coefTE_train, coefTE_test, young_train, young_test) 

plot(coefTE_train, coefTE_test, young_train, young_test, "Coefficient of linear Thermal expansion", "Young's Modulus (GPa)", predictions) 
# A low R2 score signify a low goodness of fit for the data and non linear relatiosnhip between the variables

Linear Equation: -3.6852e+06 X + (1.7094e+02)
Mean squared error: 1.3160e+04
Variance score: 0.1659


## Now for the multi linear regression 
Creating and pandas dataframe and then applying linear regression to the dataframe such that:

$$y = \beta_0 + \beta_1{x_1} +\beta_2{x_2}+\beta_3{x_3}+\beta_4{x_4}+\beta_5{x_5}$$

In [101]:
#creating a dataframe
import pandas as pd
# inputing all variables of x and y
df = pd.DataFrame([data_youngs_modulus,data_specific_heat,data_melting_point,data_atomic_mass,data_CTE,data_lattice_constant])

In [117]:
df1 = df.T
columns =('Youngs_modulus','Specific_heat','Melting_point','atomic_mass','CTE','lattice_constant')
# transposing the dataframe

In [118]:
df1.columns = columns
# adding column name to the dataframe
df1.head()

Unnamed: 0,Youngs_modulus,Specific_heat,Melting_point,atomic_mass,CTE,lattice_constant
0,68.0,0.281,2125.0,91.224,6e-06,3.23
1,50.0,0.18,1350.0,150.36,1.3e-05,9.0
2,105.0,0.268,2741.0,92.90638,7e-06,3.3
3,47.0,0.703,1683.0,28.0855,3e-06,5.43
4,55.0,0.23,1586.0,157.25,9e-06,3.64


In [133]:
X = df1.iloc[:,1:] #choosing X values
y = df1.iloc[:,0]   # choosing y_values


In [136]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1,random_state =42)
#test and train split with 45 variables in train and 6 in test 

In [138]:
# model1 to avoid confusion between previous model
model1 = linear_model.LinearRegression()
model1.fit(X_train,y_train)

LinearRegression()

In [140]:
# predicted train set
predict_train = model1.predict(X_train)
predict_test = model1.predict(X_test)  #predicted test set

full_x = np.concatenate((X_train, X_test), axis=0)   #concatenated train and test values of x
full_y = np.concatenate((y_train, y_test), axis=0)    # concatenated train and test values of y
prediction = model1.predict(full_x)           #overall prediction
 

In [142]:
# for the trained predictions
print('Train predictions are \n', predict_train)
print('\n')
print('Test predictions are \n', predict_test)
print('\n')

Train predictions are 
 [ 14.8872117  -18.42374249 140.61516012 163.01798383  63.16777675
 161.69004416 -34.73478126 127.8391217   40.29975986 103.22138547
 272.2576221  -14.97129854 -12.21263459 140.138454   113.85401625
 121.320216    48.82198662  31.09522005 108.49307979 146.18614613
 114.36478289 183.97533678  80.30304415 130.60504393 145.29826678
 193.83227    201.7949397   40.62748946  61.9550802  219.8234251
 256.69499744 147.57613028 237.22699359  60.0649248  204.26938448
  42.72102655 146.6607385  293.51321063  70.45093899  64.53190157
  35.43765027 340.35999799 196.208563   107.03274139  46.0083939 ]


Test predictions are 
 [ 82.17389073 176.3408813  150.15049689 130.01471422 158.07339914
 315.4095882 ]




In [143]:
#Overall prediction of data 
print('Combined prediction are \n', prediction)

Combined prediction are 
 [ 14.8872117  -18.42374249 140.61516012 163.01798383  63.16777675
 161.69004416 -34.73478126 127.8391217   40.29975986 103.22138547
 272.2576221  -14.97129854 -12.21263459 140.138454   113.85401625
 121.320216    48.82198662  31.09522005 108.49307979 146.18614613
 114.36478289 183.97533678  80.30304415 130.60504393 145.29826678
 193.83227    201.7949397   40.62748946  61.9550802  219.8234251
 256.69499744 147.57613028 237.22699359  60.0649248  204.26938448
  42.72102655 146.6607385  293.51321063  70.45093899  64.53190157
  35.43765027 340.35999799 196.208563   107.03274139  46.0083939
  82.17389073 176.3408813  150.15049689 130.01471422 158.07339914
 315.4095882 ]


In [153]:
print('Actual values of y: \n',full_y)

Actual values of y: 
 [ 50.   16.   47.   74.   45.  211.   32.   70.  108.   55.  329.    8.
  50.   69.   61.   65.    4.9  55.   56.  121.  198.   68.   83.   79.
 168.  275.  128.   34.   50.  528.  105.   64.  447.   18.   78.   13.
 287.  186.   70.   78.   10.  411.  279.  130.   24.   20.  116.  200.
  74.  209.  463. ]


In [148]:
print(model1.coef_) #coefficient of x

[ 4.02160561e-01  1.17540695e-01 -2.78542463e-01  6.35807324e+05
 -3.26938710e+00]


In [150]:
print(model1.intercept_) #intercept beta0

-33.56587192878037


In [154]:
print("Mean squared error: %.4e" % (mean_squared_error(full_y, prediction)))  #MSE
print('Variance score: %.4f' % r2_score(full_y, prediction))   
# only 54 % accuracy or goodness of fit to the data

Mean squared error: 7.1956e+03
Variance score: 0.5439
