In [86]:
import pandas as pd
import numpy as np

In [188]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [189]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel

x_columns = [i for i in train.columns if i not in list(['id','formation_energy_ev_natom','bandgap_energy_ev'])]

In [190]:
label1 = 'formation_energy_ev_natom'
label2 = 'bandgap_energy_ev'

In [191]:
train = train_df.drop(['id','formation_energy_ev_natom','bandgap_energy_ev'],axis=1)

In [192]:
goal = train_df[['formation_energy_ev_natom','bandgap_energy_ev']]

In [193]:
test = test_df.drop(['id'],axis=1)

In [194]:
df_all = pd.concat([train,test])

In [195]:
df_all.head()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893


In [196]:
df_all.shape

(3000, 11)

In [197]:
df_all_log = np.log1p(df_all)

In [198]:
df_all_log.describe()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,4.563154,4.051882,0.305032,0.253965,0.248781,2.302099,2.060738,2.53468,4.513263,4.535542,4.510452
std,1.079344,0.447522,0.195385,0.176064,0.196174,0.422446,0.247189,0.374908,0.013609,0.054381,0.374895
min,2.564949,2.397895,0.0,0.0,0.0,1.395502,1.371764,1.897995,4.427766,4.41451,3.425051
25%,3.526361,3.713572,0.145139,0.089658,0.060625,1.965391,1.921669,2.332222,4.510856,4.51087,4.510846
50%,5.273,4.394449,0.318454,0.247797,0.223144,2.354385,1.999559,2.408862,4.510877,4.510899,4.510864
75%,5.332719,4.394449,0.446287,0.384446,0.384446,2.425592,2.312337,2.732405,4.510928,4.511019,4.795778
max,5.429346,4.394449,0.693147,0.693147,0.693147,3.25476,2.423944,3.271316,4.627223,4.6744,4.796237


In [199]:
from sklearn.preprocessing import StandardScaler

def standardize(data):
    data_columns = data.columns
    data_index = data.index
    norm = StandardScaler()
    norm.fit(data)
    
    data_standard = pd.DataFrame(norm.transform(data), index=data_index)
    data_standard.columns = data_columns
#     data_standard.index = data_index
    return(data_standard,norm)

In [200]:
df_norm, norm = standardize(df_all_log)

In [201]:
df_norm.describe()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1.480667e-16,2.342201e-16,2.6386300000000003e-17,3.6202520000000004e-17,2.26495e-16,5.949408e-17,1.143086e-15,-8.308169e-16,1.963592e-14,5.01954e-15,-6.429563e-16
std,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167,1.000167
min,-1.851622,-3.696492,-1.561445,-1.442704,-1.268376,-2.146425,-2.787696,-1.698528,-6.283236,-2.226014,-2.8957
25%,-0.9607376,-0.7560879,-0.8184866,-0.9333835,-0.9592896,-0.7971784,-0.5626971,-0.5401124,-0.1768527,-0.4537541,0.00105025
50%,0.6577729,0.7656032,0.06870422,-0.03503973,-0.1307098,0.1237898,-0.2475396,-0.3356532,-0.1753183,-0.4532286,0.001097891
75%,0.7131113,0.7656032,0.723077,0.7412217,0.6916665,0.2923775,1.01801,0.5274819,-0.1716036,-0.4510259,0.7612092
max,0.8026499,0.7656032,1.986742,2.494866,2.265537,2.255484,1.469587,1.965172,8.375048,2.553855,0.7624327


In [202]:
df_norm.isnull().sum()

spacegroup                    0
number_of_total_atoms         0
percent_atom_al               0
percent_atom_ga               0
percent_atom_in               0
lattice_vector_1_ang          0
lattice_vector_2_ang          0
lattice_vector_3_ang          0
lattice_angle_alpha_degree    0
lattice_angle_beta_degree     0
lattice_angle_gamma_degree    0
dtype: int64

In [203]:
len(x_columns)
df_norm.shape

(3000, 11)

In [220]:
x = df_norm[:len(train_df)]
y = train_df[[label1,label2]]

In [271]:
x_train, x_valid, y_train, y_valid = train_test_split(x,y,test_size=0.05, random_state=123)

In [272]:
x_train = x_train.as_matrix()
x_valid = x_valid.as_matrix()

In [273]:
y_train_values1 = np.log1p(y_train['formation_energy_ev_natom'].values)
y_train_values2 = np.log1p(y_train['bandgap_energy_ev'].values)
y_valid_values1 = np.log1p(y_valid['formation_energy_ev_natom'].values)
y_valid_values2 = np.log1p(y_valid['bandgap_energy_ev'].values)

In [274]:
clf3 = KernelRidge(kernel ='polynomial', alpha=1.0)
clf4 = KernelRidge(kernel ='polynomial', alpha=1.0)

clf3.fit(x_train,y_train_values1)
clf4.fit(x_train,y_train_values2)

preds1 = clf3.predict(x_valid)
preds2 = clf4.predict(x_valid)

y_pred1 = np.exp(preds1)-1
y_pred2 = np.exp(preds2)-1

rmse_valid1 = np.sqrt(mean_squared_error(y_valid_values1,preds1))
rmse_valid2 = np.sqrt(mean_squared_error(y_valid_values2,preds2))

rmse_total_poly = np.sqrt(rmse_valid1**2+rmse_valid2**2)
print('RSME for formation energy:')
print(rmse_valid1)
print('RSME for band gap:')
print(rmse_valid2)
print('RSME for total:')
print(rmse_total_poly)

RSME for formation energy:
0.0340060932366
RSME for band gap:
0.0966303688861
RSME for total:
0.102439458063


In [275]:
rmse_valid1

0.03400609323655706

In [276]:
clf5 = KernelRidge(kernel='rbf',alpha=1.0)
clf6 = KernelRidge(kernel='rbf',alpha=1.0)

clf5.fit(x_train,y_train_values1)
clf6.fit(x_train,y_train_values2)

preds5 = clf5.predict(x_valid)
preds6 = clf6.predict(x_valid)

y_pred5 = np.expm1(preds5)
y_pred6 = np.expm1(preds6)

rmse_valid5 = np.sqrt(mean_squared_error(y_valid_values1,preds5))
rmse_valid6 = np.sqrt(mean_squared_error(y_valid_values2,preds6))

rmse_total_rbf = np.sqrt(rmse_valid5**2+rmse_valid6**2)
print('RMSE for formation energy')
print(rmse_valid5)
print{'RMSE for band gap'}
print(rmse_valid6)
print('RMSE for total')
print(rmse_total_rbf)

RMSE for formation energy
0.0350110193175
set(['RMSE for band gap'])
0.0956085845593
RMSE for total
0.101817350757


    RMSE for formation energy
    0.0574131968921
    set(['RMSE for band gap'])
    0.112741461516
    RMSE for total
    0.126518426809

In [277]:
clf7 = KernelRidge(kernel='laplacian',alpha=1.0)
clf8 = KernelRidge(kernel='laplacian',alpha=1.0)

clf7.fit(x_train,y_train_values1)
clf8.fit(x_train,y_train_values2)

pred7 = clf7.predict(x_valid)
pred8 = clf8.predict(x_valid)

y_pred7 = np.expm1(pred7)
y_pred8 = np.expm1(pred8)

rmse_valid7 = np.sqrt(mean_squared_error(y_valid_values1,pred7))
rmse_valid8 = np.sqrt(mean_squared_error(y_valid_values2,pred8))

rmse_total_lap = np.sqrt(rmse_valid7**2+rmse_valid8**2)

print('RMSE for formation energy')
print(rmse_valid7)
print('RMSE for band gap')
print(rmse_valid8)
print('RMSE for total')
print(rmse_total_lap)

RMSE for formation energy
0.0324527259723
RMSE for band gap
0.0882929792131
RMSE for total
0.0940682178068


In [278]:
compare_rmse = pd.DataFrame()
compare_rmse['RMSE total'] = [rmse_total_poly,rmse_total_rbf,rmse_total_lap]
compare_rmse['RMSE formation Energy'] = [rmse_valid1,rmse_valid5,rmse_valid7]
compare_rmse['RMSE bandgap'] = [rmse_valid2,rmse_valid6,rmse_valid8]

In [279]:
cobain = compare_rmse

In [280]:
cobain.index = ['Polynomial','RBF','Laplace']

In [281]:
cobain

Unnamed: 0,RMSE total,RMSE formation Energy,RMSE bandgap
Polynomial,0.102439,0.034006,0.09663
RBF,0.101817,0.035011,0.095609
Laplace,0.094068,0.032453,0.088293


In [282]:
print('poly')
print('rbf')
print('lap')
compare_rmse

poly
rbf
lap


Unnamed: 0,RMSE total,RMSE formation Energy,RMSE bandgap
Polynomial,0.102439,0.034006,0.09663
RBF,0.101817,0.035011,0.095609
Laplace,0.094068,0.032453,0.088293


In [283]:
test.isnull().sum()

spacegroup                    0
number_of_total_atoms         0
percent_atom_al               0
percent_atom_ga               0
percent_atom_in               0
lattice_vector_1_ang          0
lattice_vector_2_ang          0
lattice_vector_3_ang          0
lattice_angle_alpha_degree    0
lattice_angle_beta_degree     0
lattice_angle_gamma_degree    0
dtype: int64

In [284]:
# preds1

In [285]:
X_test = df_norm[len(train):]
# X_test = X_test.as_matrix()

preds1 = clf7.predict(X_test)
preds2 = clf8.predict(X_test)
y_pred1 = np.expm1(preds1)
y_pred2 = np.expm1(preds2)

# preds1 = clf3.predict(x_valid)
# preds2 = clf4.predict(x_valid)

# y_pred1 = np.exp(preds1)-1
# y_pred2 = np.exp(preds2)-1


krr = pd.DataFrame()
krr['id'] = test_df['id']
krr['formation_energy_ev_natom'] = y_pred1
krr['bandgap_energy_ev'] = y_pred2
krr.to_csv("krr_sub.csv", index=False)

In [286]:
krr.head()#.sort_values(by='formation_energy_ev_natom',ascending=True).head()

Unnamed: 0,id,formation_energy_ev_natom,bandgap_energy_ev
0,1,0.197683,1.582386
1,2,0.06372,3.873777
2,3,0.153768,3.430592
3,4,0.026269,2.989648
4,5,0.131918,1.571093


In [287]:
# krr.sort_values(by='formation_energy_ev_natom',ascending=True)

In [288]:
# np.log1p(0.0).values()

In [289]:
coba = pd.DataFrame({'value1':y_train_values1,'value2':y_train_values2}).sort_values(by='value1')#.head()

In [290]:
coba.head()

Unnamed: 0,value1,value2
98,0.0,1.097245
2272,0.0,0.650553
1945,0.0007,0.850407
1198,0.001,1.138602
1236,0.001399,1.188636


In [291]:
coba.shape

(2280, 2)

In [292]:
np.log1p(0.9166),np.log1p(1.9959)

(0.65055278292752472, 1.0972446872608046)

In [293]:
train_df.sort_values(by='formation_energy_ev_natom').head()

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
96,97,206,80.0,0.0,0.0,1.0,10.2904,10.2903,10.2907,90.0004,90.0004,89.9994,0.0,0.9166
1943,1944,12,20.0,0.0,1.0,0.0,12.4608,3.0845,5.8773,90.0001,103.6591,90.0001,0.0,1.9959
1647,1648,12,80.0,0.0,0.5625,0.4375,24.8062,6.4139,6.3254,90.0002,104.8531,90.0001,0.0007,1.3406
1967,1968,12,80.0,0.125,0.875,0.0,24.7917,6.1335,5.8516,90.0,103.7028,90.0001,0.001,2.1224
741,742,12,80.0,0.25,0.75,0.0,24.6618,6.098,5.826,89.9998,103.7468,90.0001,0.0014,2.2826


## Another Model

In [294]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [295]:
from sklearn.model_selection import RandomizedSearchCV

In [296]:
decTree_param1 = {'max_depth':np.arange(2,9,1)}

randomdecTree1 = RandomizedSearchCV(DecisionTreeRegressor(random_state=123),param_distributions=decTree_param1,
                                  n_iter=5,cv=5)

decTree_param2 = {'max_depth':np.arange(2,9,1)}

randomdecTree2 = RandomizedSearchCV(DecisionTreeRegressor(random_state=123),
                                   param_distributions=decTree_param2,
                                   n_iter=5,cv=5)

In [297]:
randomdecTree1.fit(x_train,y_train_values1)
randomdecTree2.fit(x_train,y_train_values2)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=123,
           splitter='best'),
          fit_params={}, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'max_depth': array([2, 3, 4, 5, 6, 7, 8])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [298]:
pred_dectree1 = randomdecTree1.predict(x_valid)
pred_dectree2 = randomdecTree2.predict(x_valid)

In [299]:
y_valid_dectree1 = np.expm1(pred_dectree1)
y_valid_dectree2 = np.expm1(pred_dectree2)

In [300]:
rmse_dectree1 = np.sqrt(mean_squared_error(y_valid_values1,pred_dectree1))
rmse_dectree2 = np.sqrt(mean_squared_error(y_valid_values2,pred_dectree2))

In [301]:
rmse_dectree_total = np.sqrt(rmse_dectree1**2+rmse_dectree2**2)

In [302]:
print('RMSE formation',rmse_dectree1)
print('RMSE bandgap',rmse_dectree2)
print('RMSE total dectree',rmse_dectree_total)

('RMSE formation', 0.033941420052817278)
('RMSE bandgap', 0.11547291104710003)
('RMSE total dectree', 0.1203578546705335)


In [303]:
new = [rmse_dectree1,rmse_dectree2,rmse_dectree_total]

In [304]:
compare_rmse = pd.DataFrame()
compare_rmse['RMSE total'] = [rmse_total_poly,rmse_total_rbf,rmse_total_lap,rmse_dectree_total]
compare_rmse['RMSE formation Energy'] = [rmse_valid1,rmse_valid5,rmse_valid7,rmse_dectree1]
compare_rmse['RMSE bandgap'] = [rmse_valid2,rmse_valid6,rmse_valid8,rmse_dectree2]

In [305]:
compare_rmse

Unnamed: 0,RMSE total,RMSE formation Energy,RMSE bandgap
0,0.102439,0.034006,0.09663
1,0.101817,0.035011,0.095609
2,0.094068,0.032453,0.088293
3,0.120358,0.033941,0.115473


In [306]:
from sklearn.ensemble import RandomForestRegressor

In [307]:
randomfor1 = RandomForestRegressor(random_state=123)
randomfor2 = RandomForestRegressor(random_state=123)

In [308]:
hyperparam = {'n_estimators':[100,300,500,1000],
             'min_samples_leaf':[2,5,8],
             'min_samples_split':[2,5,8,10]}

grid_randFor1 = RandomizedSearchCV(randomfor1,
                                  param_distributions=hyperparam,
                                  cv=5,
                                  scoring='neg_mean_squared_error')

grid_randFor2 = RandomizedSearchCV(randomfor2,
                                  param_distributions=hyperparam,
                                  cv=5,
                                  scoring='neg_mean_squared_error')

In [309]:
grid_randFor1.fit(x_train,y_train_values1)
grid_randFor2.fit(x_train,y_train_values2)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=123,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [100, 300, 500, 1000], 'min_samples_split': [2, 5, 8, 10], 'min_samples_leaf': [2, 5, 8]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=0)

In [310]:
pred_randFor1 = grid_randFor1.predict(x_valid)
pred_randFor2 = grid_randFor2.predict(x_valid)

In [311]:
y_valid_randfor1 = np.expm1(pred_randFor1)
y_valid_randfor2 = np.expm1(pred_randFor2)

In [312]:
rmse_randfor1 = np.sqrt(mean_squared_error(y_valid_values1, pred_randFor1))
rmse_randfor2 = np.sqrt(mean_squared_error(y_valid_values2, pred_randFor2))

In [313]:
rmse_randfor_total = np.sqrt(rmse_randfor1**2+rmse_randfor2**2)

In [314]:
print('rmse formation',rmse_randfor1)
print('rmse bandgap',rmse_randfor2)
print('rmse total randfor',rmse_randfor_total)

('rmse formation', 0.030635050483252359)
('rmse bandgap', 0.098857526104662322)
('rmse total randfor', 0.10349549161990301)


In [315]:
compare_rmse = pd.DataFrame()
compare_rmse['RMSE total'] = [rmse_total_poly,rmse_total_rbf,rmse_total_lap,rmse_dectree_total,rmse_randfor_total]
compare_rmse['RMSE formation Energy'] = [rmse_valid1,rmse_valid5,rmse_valid7,rmse_dectree1,rmse_randfor1]
compare_rmse['RMSE bandgap'] = [rmse_valid2,rmse_valid6,rmse_valid8,rmse_dectree2,rmse_randfor2]

In [316]:
compare_rmse

Unnamed: 0,RMSE total,RMSE formation Energy,RMSE bandgap
0,0.102439,0.034006,0.09663
1,0.101817,0.035011,0.095609
2,0.094068,0.032453,0.088293
3,0.120358,0.033941,0.115473
4,0.103495,0.030635,0.098858


In [317]:
compare_rmse.index = ['Ridge Polynomial','Ridge rbf','Ridge Laplace','Decision Tree','Random Forest']

In [318]:
compare_rmse

Unnamed: 0,RMSE total,RMSE formation Energy,RMSE bandgap
Ridge Polynomial,0.102439,0.034006,0.09663
Ridge rbf,0.101817,0.035011,0.095609
Ridge Laplace,0.094068,0.032453,0.088293
Decision Tree,0.120358,0.033941,0.115473
Random Forest,0.103495,0.030635,0.098858


In [319]:
X_test = df_norm[len(train):]
X_test = X_test.as_matrix()

preds1 = grid_randFor1.predict(X_test)
preds2 = grid_randFor2.predict(X_test)
y_pred1 = np.expm1(preds1)
y_pred2 = np.expm1(preds2)

krr = pd.DataFrame()
krr['id'] = test_df['id']
krr['formation_energy_ev_natom'] = y_pred1
krr['bandgap_energy_ev'] = y_pred2
krr.to_csv("Random Forest Normalize.csv", index=False)

In [320]:
# 34 poly
# 56 rbf
# 78 laplace
## 0.0608
X_test = df_norm[len(train):]
X_test = X_test.as_matrix()

preds1 = grid_randFor1.predict(X_test) # random forest
preds2 = clf8.predict(X_test) # laplae
y_pred1 = np.expm1(preds1)
y_pred2 = np.expm1(preds2)

krr = pd.DataFrame()
krr['id'] = test_df['id']
krr['formation_energy_ev_natom'] = y_pred1
krr['bandgap_energy_ev'] = y_pred2
krr.to_csv("Random and Laplace 5.csv", index=False)

In [213]:
# 34 poly
# 56 rbf
# 78 laplace
## 0.0608
X_test = test[x_columns]
X_test = X_test.as_matrix()

preds1 = clf3.predict(X_test) # random forest
preds2 = randomdecTree2.predict(X_test) # laplae
y_pred1 = np.expm1(preds1)
y_pred2 = np.expm1(preds2)

krr = pd.DataFrame()
krr['id'] = test['id']
krr['formation_energy_ev_natom'] = y_pred1
krr['bandgap_energy_ev'] = y_pred2
krr.to_csv("ploy dectree.csv", index=False)