In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training and test tables from the CSV files under ../input/.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
# Uncomment to preview the first rows of the training table:
# df_train.head()

## Data Statistics

In [None]:
# Print the (rows, columns) shape of the training and test tables.
print('Training data: ', df_train.shape)
print('Testing data: ', df_test.shape)

In [None]:
# Column dtypes of the training table.
df_train.dtypes

In [None]:
# Summary statistics of the training table.
df_train.describe()

In [None]:
# Summary statistics of the test table, for comparison with the training table.
df_test.describe()

In [None]:
# Show the original column names (before they are shortened by rename_df).
print(df_train.columns)

In [None]:
# rename columns
def rename_df(df):
    """
    Return a copy of `df` whose long column names are replaced by short aliases.

    Args:
        df - DataFrame carrying the original (long) column names
    Returns:
        New DataFrame with the aliases below applied; columns that are not
        listed keep their name, and `df` itself is left untouched.
    """
    # long original name -> short alias used throughout the notebook
    short_names = {
        'spacegroup': 'sg',
        'number_of_total_atoms': 'Natoms',
        'percent_atom_al': 'x_Al',
        'percent_atom_ga': 'x_Ga',
        'percent_atom_in': 'x_In',
        'lattice_vector_1_ang': 'a',
        'lattice_vector_2_ang': 'b',
        'lattice_vector_3_ang': 'c',
        'lattice_angle_alpha_degree': 'alpha',
        'lattice_angle_beta_degree': 'beta',
        'lattice_angle_gamma_degree': 'gamma',
        'formation_energy_ev_natom': 'Ef',
        'bandgap_energy_ev': 'Eg',
    }
    return df.rename(columns=short_names)

# Apply the short column aliases to both tables, then preview the result.
df_train = rename_df(df_train)
df_test = rename_df(df_test)
df_train.head()

## Data Visualization

In [None]:
# count data for each spacegroup
def plot_sg(df):
    """
    Draw a bar chart of how many rows fall in each space group.

    Args:
        df - DataFrame with an 'sg' column
    Returns:
        None; the figure is shown with plt.show().
    """
    # Sort by space-group number and pass that order explicitly, so that the
    # i-th bar and the i-th count are guaranteed to refer to the same group
    # (value_counts() alone is ordered by count, not by group).
    sg_counts = df['sg'].value_counts().sort_index()
    plt.figure(figsize=(8,6))
    # x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
    ax = sns.barplot(x=sg_counts.index, y=sg_counts.values,
                     order=sg_counts.index, alpha=0.8)
    plt.title('Counts per space group', fontsize=14)
    plt.xlabel('Space group', fontsize=12)
    plt.ylabel('Counts', fontsize=12)
    # Write each count just above its bar.
    for rect, label in zip(ax.patches, sg_counts.values):
        ax.text(rect.get_x() + rect.get_width()/2, rect.get_height()*1.01, label, ha='center', va='bottom')
    plt.show()

In [None]:
# Space-group distribution of the training table.
plot_sg(df_train)

In [None]:
# Space-group distribution of the test table.
plot_sg(df_test)

In [None]:
# correlation plot
def plot_corr(df):
    """
    Show the pairwise correlation matrix of `df` as an annotated heatmap.

    Args:
        df - DataFrame whose columns are correlated with DataFrame.corr()
    Returns:
        None; the figure is shown with plt.show().
    """
    corr = df.corr()
    plt.figure(figsize=(10,8))
    # The mask is all False, i.e. every cell of the matrix is drawn.
    # The builtin `bool` replaces the `np.bool` alias, which newer NumPy no longer provides.
    sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
                cmap=sns.diverging_palette(220,10,as_cmap=True),
                square=True, annot=True, fmt=".2f")
    plt.show()

In [None]:
# Correlations between all columns of the training table (targets included).
plot_corr(df_train)

## Feature Engineering

### Encode categorical features

In [None]:
# One-hot encode the space group: the 'sg' column is replaced by one
# indicator column per distinct value, named 'sg_<value>'.
# NOTE(review): the two tables are encoded independently, so they only end up
# with the same 'sg_*' columns if both contain the same set of space groups
# -- confirm against the data.
df_train = pd.get_dummies(df_train, columns=['sg'])
df_test = pd.get_dummies(df_test, columns=['sg'])

### Add geometry-based features

In [None]:
# read xyz files
def get_xyz(filename):
    """
    Parse a geometry file made of 'atom' and 'lattice_vector' records.

    Args:
        filename - path of the geometry file
    Returns:
        pos - list with one [xyz, label] entry per 'atom' line, where xyz is a
              float array built from fields 1-3 and label is field 4
        lat - 2-D float array with one row per 'lattice_vector' line
              (fields 1-3)
    Lines whose first field is anything else (e.g. comments) and blank lines
    are ignored.
    """
    pos = []   # atomic positions
    lat = []   # lattice constant
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank line: nothing to parse (indexing fields[0] would fail)
                continue
            if fields[0] == 'atom':
                # builtin float replaces the np.float alias removed from newer NumPy
                pos.append([np.array(fields[1:4], dtype=float), fields[4]])
            elif fields[0] == 'lattice_vector':
                lat.append(np.array(fields[1:4], dtype=float))
    return pos, np.array(lat)

In [None]:
# get coordinate numbers of each M atom
def get_coord(df_name, max_bond_len):
    """
    Derive coordination-number and mean-bond features for every structure.

    For each id in the global DataFrame `df_<df_name>`, the file
    ../input/<df_name>/<id>/geometry.xyz is read with get_xyz(). Atoms
    labelled 'O' are treated as oxygen and every other atom as a metal (M).
    An M-O pair counts as bonded when its distance is below `max_bond_len`.

    Args:
        df_name - suffix selecting both the global frame ('df_' + df_name)
                  and the input sub-directory, e.g. 'train' or 'test'
        max_bond_len - distance threshold, in the units of the file coordinates
    Returns:
        df_coord - one row per structure; column 'MO<n>' holds how many M
                   atoms have exactly n bonded O atoms (0 when none)
        df_bond  - one row per structure; columns bond_x/bond_y/bond_z hold
                   the mean of (O - M) over all bonded pairs, NaN when the
                   structure has no bonded pair
    Raises:
        NameError - if no global named 'df_' + df_name exists
    """
    # Look the frame up by name instead of eval()-ing a constructed string:
    # same result for valid names, but no arbitrary expression is executed.
    frame_name = 'df_' + df_name
    try:
        df = globals()[frame_name]
    except KeyError:
        raise NameError("name '{}' is not defined".format(frame_name))

    list_coords = []
    list_bonds = []
    for i in df['id']:
        fn = "../input/" + df_name + "/{}/geometry.xyz".format(i)
        pos, lat = get_xyz(fn)

        # get M, O coordinates
        xyz_M = []
        xyz_O = []
        for atom in pos:
            if atom[1] == 'O':
                xyz_O.append(atom[0])
            else:
                xyz_M.append(atom[0])

        # calculate coordinate numbers
        # NOTE(review): distances use the coordinates exactly as listed in the
        # file; periodic images built from `lat` are not considered -- confirm
        # that this is intended.
        coord_M = []
        bond_MO = []
        for atom_M in xyz_M:
            coord = 0
            for atom_O in xyz_O:
                d = np.linalg.norm(atom_M - atom_O)   # distance between M-O
                if d < max_bond_len:
                    coord = coord + 1
                    bond_MO.append(atom_O - atom_M)
            coord_M.append(coord)
        unique, counts = np.unique(np.array(coord_M), return_counts=True)
        list_coords.append(dict(zip(('MO' + str(n) for n in unique), counts)))
        if bond_MO:
            list_bonds.append(np.mean(np.array(bond_MO), axis=0))
        else:
            # No bonded pair: keep a 3-component row so the bond frame stays
            # rectangular (the mean of an empty list would be a scalar NaN).
            list_bonds.append(np.full(3, np.nan))

    df_coord = pd.DataFrame(list_coords).fillna(0).astype(int)
    df_bond = pd.DataFrame(list_bonds, columns=['bond_x', 'bond_y', 'bond_z'])

    return df_coord, df_bond

In [None]:
# Distance threshold below which an M-O pair is counted as bonded in get_coord().
MAX_BOND_LEN = 2.5   # max bond length
df_coord_train, df_bond_train = get_coord('train', MAX_BOND_LEN)
df_coord_test, df_bond_test = get_coord('test', MAX_BOND_LEN)

# Append the new feature columns side by side. axis=1 aligns rows on the index,
# so this relies on all frames sharing the same row index.
df_train = pd.concat([df_train, df_coord_train, df_bond_train], axis=1)
df_test = pd.concat([df_test, df_coord_test, df_bond_test], axis=1)

In [None]:
# compute cell volume
def get_vol(df):
    """
    Volume of the unit cell described by three edge lengths and three angles.

    Args:
        df - table with columns 'a', 'b', 'c' (edge lengths) and
             'alpha', 'beta', 'gamma' (angles in degrees)
    Returns:
        a*b*c*sqrt(1 + 2*cos(alpha)*cos(beta)*cos(gamma)
                   - cos(alpha)^2 - cos(beta)^2 - cos(gamma)^2),
        evaluated element-wise.
    """
    # cosines of the three angles, after converting degrees to radians
    cos_alpha = np.cos(np.pi*df['alpha']/180)
    cos_beta = np.cos(np.pi*df['beta']/180)
    cos_gamma = np.cos(np.pi*df['gamma']/180)

    angular_term = (1 + 2*cos_alpha*cos_beta*cos_gamma
                    - cos_alpha**2
                    - cos_beta**2
                    - cos_gamma**2)
    return df['a']*df['b']*df['c']*np.sqrt(angular_term)

In [None]:
# Atomic number density: atoms in the cell divided by the cell volume from get_vol().
df_train['density'] = df_train['Natoms']/get_vol(df_train)
df_test['density'] = df_test['Natoms']/get_vol(df_test)

## Feature Selection

In [None]:
# List every column now available, as a basis for choosing model inputs.
print(df_train.columns)

In [None]:
# Columns used as model inputs (the targets 'Ef'/'Eg' and 'id' are left out).
# NOTE(review): the 'MO*' and 'sg_*' names are produced from the data by
# get_coord() and pd.get_dummies(); this list assumes exactly these columns
# exist in both tables -- confirm against the printed column list.
select_features = ['Natoms', 'x_Al', 'x_Ga', 'x_In', 'a', 'b', 'c', 'alpha', 'beta',
       'gamma', 'MO1', 'MO2', 'MO3', 'MO4', 'MO5', 'MO6', 'bond_x',
       'bond_y', 'bond_z', 'density', 'sg_12', 'sg_33', 'sg_167', 'sg_194',
       'sg_206', 'sg_227']

## Data Preparation

In [None]:
# Standardise the selected feature columns of both tables with one shared scaler.
from sklearn.preprocessing import StandardScaler
# NOTE(review): MinMaxScaler is imported but not used in this cell.
from sklearn.preprocessing import MinMaxScaler

# The scaler statistics are computed over training and test rows together.
# NOTE(review): this happens before the train/validation split, so validation
# rows also contribute to the statistics -- confirm that this is acceptable.
df_combined = pd.concat([df_train[select_features], df_test[select_features]])

scaler = StandardScaler()
scaler.fit(df_combined)
# Overwrites the feature columns in place; re-running this cell scales them again.
df_train[select_features] = scaler.transform(df_train[select_features])
df_test[select_features] = scaler.transform(df_test[select_features])

In [None]:
# Model inputs (selected features) and the two regression targets.
X = df_train[select_features]
y = df_train[['Ef', 'Eg']]

# Give the test matrix exactly the same columns, in the same order, as X so a
# model fitted on X can score it. (Dropping only 'id' left the columns in the
# table's own order, which differs from select_features.)
X_test = df_test[select_features]

# train test split
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=123)

# Models are fitted on log(1 + y), so an RMSE computed on these values is an RMSLE.
y_train_log = y_train.apply(np.log1p)
y_valid_log = y_valid.apply(np.log1p)

## Train Models

In [None]:
# metrics
def rmse(h, y):
    """
    Root-mean-square error between predictions and targets.

    Args:
        h - predicted values
        y - reference values, broadcastable against `h`
    Returns:
        sqrt(mean((h - y)^2))
    """
    residual = h - y
    mean_squared = np.square(residual).mean()
    return np.sqrt(mean_squared)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Linear Regression
def runLR(X_train, X_valid, y_train, y_valid):
    """
    Fit a LinearRegression on the training split and score it on the validation split.

    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid). The score is an RMSLE when the targets
        passed in are log1p-transformed, as in this notebook's tuning cells.
    """
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

# Ridge
def runRidge(X_train, X_valid, y_train, y_valid, alpha):
    """
    Fit a Ridge regression with strength `alpha` and score it on the validation split.

    Args:
        alpha - regularisation strength handed to Ridge
    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    model = Ridge(alpha=alpha, random_state=111).fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

# Lasso
def runLasso(X_train, X_valid, y_train, y_valid, alpha):
    """
    Fit a Lasso regression with strength `alpha` and score it on the validation split.

    Args:
        alpha - regularisation strength handed to Lasso
    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    model = Lasso(alpha=alpha, random_state=111).fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

# k-NN
def runKNN(X_train, X_valid, y_train, y_valid, k):
    """
    Fit a k-nearest-neighbours regressor and score it on the validation split.

    Args:
        k - number of neighbours (KNeighborsRegressor's n_neighbors)
    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

# SVM
def runSVR(X_train, X_valid, y_train, y_valid, C):
    """
    Fit a support-vector regressor and score it on the validation split.

    Args:
        C - value handed to SVR's C parameter; all other settings are SVR defaults
    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    model = SVR(C=C).fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

# Random Forest
def runRF(X_train, X_valid, y_train, y_valid, n):
    """
    Fit a random-forest regressor with `n` trees and score it on the validation split.

    Args:
        n - number of trees (RandomForestRegressor's n_estimators)
    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    # Seeded like the Ridge/Lasso/CatBoost helpers so that repeated runs, and
    # therefore the n-vs-error tuning curve, are reproducible.
    model = RandomForestRegressor(n_estimators=n, random_state=111)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmsle = rmse(y_pred, y_valid)
    return model, y_pred, rmsle

# XGBoosting
def runXGB(X_train, X_valid, y_train, y_valid):
    """
    Fit an XGBoost regressor with fixed hyper-parameters and score it on the validation split.

    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    # `n_estimators` is the wrapper's name for the number of boosting rounds;
    # the previous spelling `n_estimator` did not set it, so the intended
    # 1500 rounds were not applied.
    # NOTE(review): 'reg:linear' and `silent` are spellings from older xgboost
    # releases; confirm them against the installed version.
    model = XGBRegressor(max_depth=7,
                         learning_rate=0.03,
                         n_estimators=1500,
                         objective='reg:linear',
                         booster='gbtree',
                         silent=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmsle = rmse(y_pred, y_valid)
    return model, y_pred, rmsle

# CatBoost
def runCB(X_train, X_valid, y_train, y_valid):
    """
    Fit a CatBoost regressor with fixed hyper-parameters and score it on the validation split.

    Returns:
        (model, y_pred, rmsle) - the fitted model, its validation predictions
        and rmse(y_pred, y_valid).
    """
    # fixed configuration, gathered in one place
    cb_params = dict(iterations=1500,
                     learning_rate=0.03,
                     depth=7,
                     loss_function='RMSE',
                     eval_metric='RMSE',
                     random_seed=111,
                     od_type='Iter',
                     od_wait=50)
    model = CatBoostRegressor(**cb_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return model, y_pred, rmse(y_pred, y_valid)

## Tune Models

In [None]:
# Ridge: sweep alpha over 1000 evenly spaced values in [0, 1]. For every alpha
# one model per target (Ef, Eg) is fitted on the log1p targets and its
# validation error recorded.
alpha_list = np.linspace(0,1,1000)
rmsle_Ef_list = []
rmsle_Eg_list = []
for alpha in alpha_list:
    _, _, rmsle_Ef = runRidge(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, alpha)
    _, _, rmsle_Eg = runRidge(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, alpha)
    rmsle_Ef_list.append(rmsle_Ef)
    rmsle_Eg_list.append(rmsle_Eg)
# Left panel: Ef error vs alpha; right panel: Eg error vs alpha.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))
ax1.plot(alpha_list, rmsle_Ef_list)
ax2.plot(alpha_list, rmsle_Eg_list)
ax1.set_title('E_f')
ax1.set_xlabel('alpha')
ax1.set_ylabel('RMSLE')
ax2.set_title('E_g')
ax2.set_xlabel('alpha')
plt.show()

In [None]:
# Lasso: sweep alpha over 1000 evenly spaced values in [0, 1]. For every alpha
# one model per target (Ef, Eg) is fitted on the log1p targets and its
# validation error recorded.
# NOTE(review): the grid starts at alpha = 0; check the scikit-learn Lasso
# documentation on whether alpha = 0 is advisable for this estimator.
alpha_list = np.linspace(0,1,1000)
rmsle_Ef_list = []
rmsle_Eg_list = []
for alpha in alpha_list:
    _, _, rmsle_Ef = runLasso(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, alpha)
    _, _, rmsle_Eg = runLasso(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, alpha)
    rmsle_Ef_list.append(rmsle_Ef)
    rmsle_Eg_list.append(rmsle_Eg)
# Left panel: Ef error vs alpha; right panel: Eg error vs alpha.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))
ax1.plot(alpha_list, rmsle_Ef_list)
ax2.plot(alpha_list, rmsle_Eg_list)
ax1.set_title('E_f')
ax1.set_xlabel('alpha')
ax1.set_ylabel('RMSLE')
ax2.set_title('E_g')
ax2.set_xlabel('alpha')
plt.show()

In [None]:
# KNN: try k = 1..10 neighbours. For every k one model per target (Ef, Eg) is
# fitted on the log1p targets and its validation error recorded.
k_list = range(1,11)
rmsle_Ef_list = []
rmsle_Eg_list = []
for k in k_list:
    _, _, rmsle_Ef = runKNN(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, k)
    _, _, rmsle_Eg = runKNN(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, k)
    rmsle_Ef_list.append(rmsle_Ef)
    rmsle_Eg_list.append(rmsle_Eg)
# Left panel: Ef error vs k; right panel: Eg error vs k.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))
ax1.plot(k_list, rmsle_Ef_list)
ax2.plot(k_list, rmsle_Eg_list)
ax1.set_title('E_f')
ax1.set_xlabel('k')
ax1.set_ylabel('RMSLE')
ax2.set_title('E_g')
ax2.set_xlabel('k')
plt.show()

In [None]:
# SVM: try C = 1..50. For every C one SVR per target (Ef, Eg) is fitted on the
# log1p targets and its validation error recorded.
C_list = range(1,51)
rmsle_Ef_list = []
rmsle_Eg_list = []
for C in C_list:
    _, _, rmsle_Ef = runSVR(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, C)
    _, _, rmsle_Eg = runSVR(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, C)
    rmsle_Ef_list.append(rmsle_Ef)
    rmsle_Eg_list.append(rmsle_Eg)
# Left panel: Ef error vs C; right panel: Eg error vs C.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))
ax1.plot(C_list, rmsle_Ef_list)
ax2.plot(C_list, rmsle_Eg_list)
ax1.set_title('E_f')
ax1.set_xlabel('C')
ax1.set_ylabel('RMSLE')
ax2.set_title('E_g')
ax2.set_xlabel('C')
plt.show()

In [None]:
# Random Forest: try n = 1..100 trees. For every n one forest per target
# (Ef, Eg) is fitted on the log1p targets and its validation error recorded.
n_list = range(1,101)
rmsle_Ef_list = []
rmsle_Eg_list = []
for n in n_list:
    _, _, rmsle_Ef = runRF(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, n)
    _, _, rmsle_Eg = runRF(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, n)
    rmsle_Ef_list.append(rmsle_Ef)
    rmsle_Eg_list.append(rmsle_Eg)
# Left panel: Ef error vs n; right panel: Eg error vs n.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))
ax1.plot(n_list, rmsle_Ef_list)
ax2.plot(n_list, rmsle_Eg_list)
ax1.set_title('E_f')
ax1.set_xlabel('n')
ax1.set_ylabel('RMSLE')
ax2.set_title('E_g')
ax2.set_xlabel('n')
plt.show()

## Compare Models

In [None]:
def compare_models(X_train, X_valid, y_train, y_valid, params):
    """
    Fit every model family once and plot their validation errors side by side.

    Args:
        X_train, X_valid - feature matrices of the training / validation split
        y_train, y_valid - one target column for each split
        params - five tuned values, in this order:
                 [Ridge alpha, Lasso alpha, KNN k, SVR C, RandomForest n]
    Returns:
        None; a bar chart with one bar per model is shown with plt.show().
    """
    _, _, rmsle_lr = runLR(X_train, X_valid, y_train, y_valid)
    _, _, rmsle_ridge = runRidge(X_train, X_valid, y_train, y_valid, params[0])
    _, _, rmsle_lasso = runLasso(X_train, X_valid, y_train, y_valid, params[1])
    _, _, rmsle_knn = runKNN(X_train, X_valid, y_train, y_valid, params[2])
    _, _, rmsle_svr = runSVR(X_train, X_valid, y_train, y_valid, params[3])
    _, _, rmsle_rf = runRF(X_train, X_valid, y_train, y_valid, params[4])
    _, _, rmsle_xgb = runXGB(X_train, X_valid, y_train, y_valid)
    _, _, rmsle_cb = runCB(X_train, X_valid, y_train, y_valid)
    model_names = ['LR', 'Ridge', 'Lasso', 'KNN', 'SVR', 'RandomForest', 'XGBoost', 'CatBoost']
    rmsle = [rmsle_lr, rmsle_ridge, rmsle_lasso, rmsle_knn, rmsle_svr, rmsle_rf, rmsle_xgb, rmsle_cb]
    plt.figure(figsize=(10,6))
    # x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
    ax = sns.barplot(x=model_names, y=rmsle, alpha=0.8)
    plt.title('Validation error per model', fontsize=14)
    plt.ylabel('RMSLE', fontsize=12)
    # Write each score (rounded to 5 decimals) just above its bar.
    for rect, label in zip(ax.patches, rmsle):
        ax.text(rect.get_x() + rect.get_width()/2, rect.get_height()*1.01, round(label,5), ha='center', va='bottom')
    plt.show()

In [None]:
# models for Ef
# params = [Ridge alpha, Lasso alpha, KNN k, SVR C, RandomForest n]
compare_models(X_train, X_valid, y_train_log.Ef, y_valid_log.Ef, [0, 0, 5, 30, 42])

In [None]:
# models for Eg
# params = [Ridge alpha, Lasso alpha, KNN k, SVR C, RandomForest n]
compare_models(X_train, X_valid, y_train_log.Eg, y_valid_log.Eg, [0.6, 0, 5, 1, 34])