# FoNS Datathon 2021 

In [1]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.rcParams['figure.figsize'] = (8,6)
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn import *

In [2]:
# Training-split feature tables, read from CSV files in the working directory.
train_descriptors, train_mord3d, train_morgan, train_rdk = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_descriptors.csv",
        "train_mord3d.csv",
        "train_morgan.csv",
        "train_rdk.csv",
    )
)

# Crystal-level tables; `train_crystals` is where the regression target is read from.
train_crystals, train_distances, train_centroid_distances = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_crystals.csv",
        "train_distances.csv",
        "train_centroid_distances.csv",
    )
)

In [3]:
# Test-split counterparts of the training feature tables (same four files, `test_` prefix).
test_descriptors, test_mord3d, test_morgan, test_rdk = (
    pd.read_csv(csv_name)
    for csv_name in (
        "test_descriptors.csv",
        "test_mord3d.csv",
        "test_morgan.csv",
        "test_rdk.csv",
    )
)

### Data pre-processing

In [4]:
# Keep only descriptor columns with no missing values in the training set.
# The positional slice discards the first three and the last two columns
# (NOTE(review): presumably identifier/label columns -- confirm against the CSV header).
train_descriptors_full = (
    train_descriptors
    .iloc[:, 3:-2]
    .dropna(axis="columns", how="any")
)
train_descriptors_full.shape

(13449, 984)

In [5]:
# Give the test set exactly the columns that survived on the training side
# (same names, same order). Note that no NaN filtering is applied to the
# test rows/columns here.
test_descriptors_full = test_descriptors.loc[:, train_descriptors_full.columns]
test_descriptors_full.shape

(3363, 984)

In [7]:
# Standardise the descriptors, then project onto the leading principal
# components. A fractional `n_components` asks PCA to keep as many components
# as are needed to explain that share of the variance.
scaler_for_PCA = preprocessing.StandardScaler()
train_PCA = decomposition.PCA(n_components=.95)

# Both transformers are fitted on the training data only ...
train_descriptors_scaled = scaler_for_PCA.fit_transform(train_descriptors_full)
train_descriptors_PCA = train_PCA.fit_transform(train_descriptors_scaled)

# ... and merely applied to the test data.
test_descriptors_scaled = scaler_for_PCA.transform(test_descriptors_full)
test_descriptors_PCA = train_PCA.transform(test_descriptors_scaled)

print(train_descriptors_PCA.shape, test_descriptors_PCA.shape)

(13449, 124) (3363, 124)


In [20]:
# Seed torch's default RNG before the model is built so that the weight
# initialisation is repeatable from run to run.
torch.random.manual_seed(42)

<torch._C.Generator at 0x7f717d185770>

In [21]:
# Crystal property to regress on (a column of `train_crystals`).
target = 'cell_volume'

# Inputs are the PCA-reduced training descriptors; labels are the chosen
# target column. Both are stored as float32 tensors.
x = torch.tensor(train_descriptors_PCA, dtype=torch.float32)
y = torch.tensor(train_crystals[target].to_numpy(), dtype=torch.float32)

In [22]:
# Number of input features per sample (second dimension of `x`).
d_in = x.size(1)
# NOTE: despite its name, this is the length of `y` (first dimension), i.e.
# the number of samples, not a network output width. In the code visible
# here it is only used to reshape `y` into a column.
d_out = y.size(0)

In [23]:
# Store the targets as a (d_out, 1) column so they match the model's
# one-value-per-row output instead of broadcasting against it in the loss.
y = torch.reshape(y, (d_out, 1))

In [24]:
# Test descriptors as a float32 tensor, taken straight from the column-aligned
# frame (no scaling, no PCA).
# NOTE(review): this keeps the full descriptor width rather than the PCA width
# `d_in` that `model_pca` is built for, so it cannot be passed to that model.
x_test = torch.tensor(test_descriptors_full.to_numpy(), dtype=torch.float32)
x_test.size()

torch.Size([3363, 984])

In [25]:
# Small fully connected regressor on the PCA features:
#   d_in -> 2*d_in (ReLU) -> 1 (ReLU)
# The trailing ReLU clamps every prediction to be non-negative.
model_pca = torch.nn.Sequential(
    torch.nn.Linear(in_features=d_in, out_features=2 * d_in),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=2 * d_in, out_features=1),
    torch.nn.ReLU(),
)

In [26]:
# Training stage 1: 500 full-batch Adam steps minimising the mean absolute
# error (L1) between the model output and `y`.
loss_fn = torch.nn.L1Loss()
learning_rate = 1e-2
optimizer = torch.optim.Adam(model_pca.parameters(), lr=learning_rate)
for t in tqdm(range(500)):
    # Forward pass over the whole training set. `x` is the PCA feature tensor
    # the model was sized for; the name used here before (`x_pca`) is never
    # assigned in the visible notebook and fails with NameError on a fresh kernel.
    y_pred = model_pca(x)

    loss = loss_fn(y_pred, y)
    # Report the loss once every 100 steps.
    if t % 100 == 99:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them before
    # computing the new ones and taking an optimiser step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  0%|          | 0/500 [00:00<?, ?it/s]

99 250.53619384765625
199 235.1154327392578
299 229.72683715820312
399 226.78500366210938
499 224.87969970703125


In [27]:
# Training stage 2: 1000 more full-batch steps on the same `model_pca`.
# A new Adam optimiser is constructed here, so its internal moment estimates
# start from scratch; `loss_fn` is reused from the earlier training cell.
learning_rate = 1e-2
optimizer = torch.optim.Adam(model_pca.parameters(), lr=learning_rate)
for t in tqdm(range(1000)):
    # Forward pass over the whole training set. `x` is the PCA feature tensor
    # the model was sized for; the name used here before (`x_pca`) is never
    # assigned in the visible notebook and fails with NameError on a fresh kernel.
    y_pred = model_pca(x)

    loss = loss_fn(y_pred, y)
    # Report the loss once every 100 steps.
    if t % 100 == 99:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them before
    # computing the new ones and taking an optimiser step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

99 223.65011596679688
199 222.5361328125
299 221.5463409423828
399 220.76123046875
499 219.95713806152344
599 219.3131866455078
699 218.7624969482422
799 218.2318572998047
899 217.77935791015625
999 217.29345703125


In [28]:
# Training stage 3: a further 1000 full-batch steps on the same `model_pca`.
# A new Adam optimiser is constructed here, so its internal moment estimates
# start from scratch; `loss_fn` is reused from the earlier training cell.
learning_rate = 1e-2
optimizer = torch.optim.Adam(model_pca.parameters(), lr=learning_rate)
for t in tqdm(range(1000)):
    # Forward pass over the whole training set. `x` is the PCA feature tensor
    # the model was sized for; the name used here before (`x_pca`) is never
    # assigned in the visible notebook and fails with NameError on a fresh kernel.
    y_pred = model_pca(x)

    loss = loss_fn(y_pred, y)
    # Report the loss once every 100 steps.
    if t % 100 == 99:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them before
    # computing the new ones and taking an optimiser step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

99 216.90133666992188
199 216.56312561035156
299 216.06517028808594
399 215.7594451904297
499 215.3126678466797
599 214.90951538085938
699 214.51931762695312
799 214.1055908203125
899 213.79388427734375
999 213.3924102783203


In [29]:
# Score the test set using the PCA-transformed descriptors (same feature
# space the model was trained on), cast to float32.
x_test_pca = torch.tensor(test_descriptors_PCA).float()

# Inference only: run the forward pass without recording an autograd graph.
# The result therefore needs no .detach() before conversion to NumPy.
with torch.no_grad():
    predictions_pca = model_pca(x_test_pca).flatten().numpy()

# Write the flat prediction vector as plain text, one value per line.
np.savetxt("bonus_2_predictions.csv", predictions_pca)