In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode so that `py.iplot` can render
# figures inline; `connected=True` loads plotly.js from a CDN rather than
# embedding the library in the notebook.
py.init_notebook_mode(connected=True)

### A First Glance at the Data

In [2]:
# Load the two CSV files used throughout the notebook from the working directory.
train, unique_m = (pd.read_csv(csv_name)
                   for csv_name in ("train.csv", "unique_m.csv"))

In [3]:
# Report (rows, columns) for each table, `train` first.
for frame in (train, unique_m):
    print(frame.shape)

(21263, 82)
(21263, 88)


In [4]:
# Preview the first five rows of the feature table.
train.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [5]:
# Preview the first five rows of the per-element table.
unique_m.head()

Unnamed: 0,H,He,Li,Be,B,C,N,O,F,Ne,...,Au,Hg,Tl,Pb,Bi,Po,At,Rn,critical_temp,material
0,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,29.0,Ba0.2La1.8Cu1O4
1,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,26.0,Ba0.1La1.9Ag0.1Cu0.9O4
2,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,19.0,Ba0.1La1.9Cu1O4
3,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,22.0,Ba0.15La1.85Cu1O4
4,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,23.0,Ba0.3La1.7Cu1O4


In [6]:
def plotly_barchart(x, y):
    """Render a compact bar chart inline in the notebook.

    Parameters
    ----------
    x : sequence
        Bar positions along the horizontal axis.
    y : sequence
        Bar heights, one per entry of ``x``.

    The figure is displayed via ``py.iplot``; nothing is returned.
    """
    bars = go.Bar(x=x, y=y, marker={'color': 'lightseagreen'})
    figure = go.Figure(
        data=[bars],
        layout=go.Layout(
            height=250,
            width=400,
            margin={'l': 40, 'r': 30, 'b': 30, 't': 30},
        ),
    )
    py.iplot(figure, show_link=False)

In [7]:
# Histogram of the critical temperatures. `np.linspace(0, 150, 30)` yields
# 30 edges, i.e. 29 equal-width bins over [0, 150]; values outside that range
# are not counted by `np.histogram`.
bin_edges = np.linspace(0, 150, 30)
counters, bins = np.histogram(train['critical_temp'].to_numpy(), bins=bin_edges)
# Bar positions: the midpoint of every pair of neighbouring edges.
centers = 0.5 * (bins[:-1] + bins[1:])

In [8]:
# Draw the critical-temperature histogram (bin centres vs. counts).
plotly_barchart(centers, counters)

In [9]:
def plotly_boxplot(input_data):
    """Render one box plot per entry of ``input_data`` inline in the notebook.

    Parameters
    ----------
    input_data : mapping
        Iterated by key; each key becomes the name of a box and
        ``input_data[key]`` supplies the values summarised by that box.

    The figure is displayed via ``py.iplot``; nothing is returned.
    """
    boxes = [
        go.Box(y=input_data[key], name=key, marker={'color': 'lightseagreen'})
        for key in input_data
    ]
    figure = go.Figure(
        data=boxes,
        layout=go.Layout(
            height=250,
            width=600,
            margin={'l': 30, 'r': 30, 'b': 20, 't': 30},
            showlegend=False,
        ),
    )
    py.iplot(figure, show_link=False)

In [10]:
# Every column except the trailing `critical_temp` and `material` is an element.
elements = list(unique_m.columns[:-2])

# [element, mean critical temperature] for each element that occurs (amount > 0)
# in at least one material.
mean_temp = []
for symbol in elements:
    temps_with_symbol = unique_m.loc[unique_m[symbol] > 0, 'critical_temp']
    if len(temps_with_symbol) > 0:
        mean_temp.append([symbol, temps_with_symbol.mean(skipna=True)])

# Rank elements from highest to lowest mean critical temperature and keep the
# ten at each end of the ranking.
sorted_data = sorted(mean_temp, key=lambda pair: pair[1], reverse=True)
hot_elements = [pair[0] for pair in sorted_data[:10]]
cold_elements = [pair[0] for pair in sorted_data[-10:]]

# Critical temperatures of all materials containing each selected element,
# keyed by element, ready to be passed to `plotly_boxplot`.
high_temps = {
    symbol: unique_m.loc[unique_m[symbol] > 0, 'critical_temp']
    for symbol in hot_elements
}
low_temps = {
    symbol: unique_m.loc[unique_m[symbol] > 0, 'critical_temp']
    for symbol in cold_elements
}

In [11]:
# Critical-temperature distributions for the ten elements with the highest mean.
plotly_boxplot(high_temps)

In [12]:
# Critical-temperature distributions for the ten elements with the lowest mean.
plotly_boxplot(low_temps)

### Feature Importance

In [13]:
# Features: every column except the last one; target: `critical_temp`
# (the last column of `train`).
X = train.iloc[:, :-1]
Y = train['critical_temp']

# Seed the 80/20 split. The regressor below is already seeded, but without a
# fixed split the reported scores and importances change on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)

In [14]:
# Fit a depth-limited random forest; `fit` returns the estimator itself.
regr = RandomForestRegressor(max_depth=10, random_state=0).fit(X_train, Y_train)

# `score` is the coefficient of determination (R^2) for regressors.
train_score = regr.score(X_train, Y_train)
test_score = regr.score(X_test, Y_test)

print("Train score: {:.3f}".format(train_score))
print("Test score: {:.3f}".format(test_score))

Train score: 0.927
Test score: 0.889


In [15]:
# Pair every importance with the column it was computed for. The names come
# from the frame the model was fitted on, not from `train.columns`: the latter
# also contains the target and only lined up because `zip` silently truncated.
feature_importance = list(zip(X_train.columns.tolist(),
                              regr.feature_importances_))
n_top = 5

# Show the `n_top` most important features, highest importance first.
sorted(feature_importance,
       key=lambda pair: pair[1],
       reverse=True)[:n_top]

[('range_ThermalConductivity', 0.585369951600641),
 ('wtd_gmean_ThermalConductivity', 0.1317450482717823),
 ('wtd_gmean_Valence', 0.02300310230403966),
 ('std_atomic_mass', 0.020774261081292),
 ('wtd_entropy_ThermalConductivity', 0.01503492701847228)]

In [16]:
# Features: the per-element columns (everything except the trailing
# `critical_temp` and `material`); target: `critical_temp`.
X = unique_m.iloc[:, :-2]
Y = unique_m['critical_temp']

# Seed the 80/20 split. The regressor below is already seeded, but without a
# fixed split the reported scores and importances change on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)

In [17]:
# Fit a depth-limited random forest; `fit` returns the estimator itself.
regr = RandomForestRegressor(max_depth=10, random_state=0).fit(X_train, Y_train)

# `score` is the coefficient of determination (R^2) for regressors.
train_score = regr.score(X_train, Y_train)
test_score = regr.score(X_test, Y_test)

print("Train score: {:.3f}".format(train_score))
print("Test score: {:.3f}".format(test_score))

Train score: 0.910
Test score: 0.891


In [18]:
# Pair every importance with the column it was computed for. The names come
# from the frame the model was fitted on, not from `unique_m.columns`: the
# latter also contains `critical_temp` and `material` and only lined up
# because `zip` silently truncated.
feature_importance = list(zip(X_train.columns.tolist(),
                              regr.feature_importances_))
n_top = 5
# Show the `n_top` most important elements, highest importance first.
sorted(feature_importance, key=lambda pair: pair[1], reverse=True)[:n_top]

[('Cu', 0.6974789775415623),
 ('Ca', 0.07977553251784117),
 ('Ba', 0.06367883867861361),
 ('O', 0.04882397423647486),
 ('Pr', 0.0157306983421234)]

### Clustering (t-SNE Embedding)

In [19]:
n_samples = 1000
X_embedded = TSNE(n_components=2). \
                fit_transform(unique_m.sample(n_samples).iloc[:, :-2])

In [20]:
def plotly_scatter(x, y, labels):
    trace = go.Scatter(
        x=x, y=y,
        mode='markers',
        text=labels,
        hoverinfo='text',
        marker=dict(opacity=0.5, symbol='cross-dot', color='lightseagreen')
    )
    MARGIN = dict(l=0, r=30, b=0, t=30)
    layout = go.Layout(height=350, width=400, hovermode='closest', margin=MARGIN)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, show_link=False)

In [21]:
x_emb = [x[0] for x in X_embedded]
y_emb = [x[1] for x in X_embedded]
labels = unique_m['material'].tolist()
plotly_scatter(x_emb, y_emb, labels)