In [1]:
import torch
from torch import nn
from torch.functional import F
import numpy as np
import pytorch_lightning as pl
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import xgboost

  from pandas import MultiIndex, Int64Index


In [2]:
pl.__version__

'1.5.10'

In [3]:
metadata = pd.read_csv("data/metadata.csv")
train_metadata = metadata[metadata.split == "train"]
val_metadata = metadata[metadata.split == "val"]
test_metadata = metadata[metadata.split == "test"]

In [4]:
def get_features(metadata):
    features = []
    pbar = tqdm(total=len(metadata),position=0,leave=True)
    for index, row in metadata.iterrows():
        path = row["features_path"] 
        feature = pd.read_csv(f"data/{path}")
        features.append(feature)
        pbar.update(1)
    return features

In [5]:
class EGAMSDataset(torch.utils.data.Dataset):
    """Dataset for Mass Spectrometry"""

    def __init__(self,metadata, split=None):
        """
        Args:
            metadata: metadata file specific for training
            split (str): 'val' or 'train' or 'test' 
        """
        self.features = get_features(metadata)
        self.labels = pd.read_csv(f"data/{split}_labels.csv").drop("sample_id",axis=1).to_numpy()
        

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        x = torch.Tensor(self.features[idx].to_numpy()).reshape(4,-1)
        y = torch.Tensor(self.labels[idx])
        
        return x,y

class EGAMSTestDataset(torch.utils.data.Dataset):
    """TEST Dataset for Mass Spectrometry | no labels"""

    def __init__(self,metadata):
        """
        Args:
            metadata: metadata file specific for training
        """
        self.features = get_features(metadata)
        

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        x = torch.Tensor(self.features[idx].to_numpy()).reshape(4,-1)
        return x

In [39]:
train_ds = EGAMSDataset(train_metadata, 'train')
val_ds = EGAMSDataset(val_metadata, 'val')
test_ds = EGAMSTestDataset(test_metadata)
train_loader = torch.utils.data.DataLoader(train_ds, shuffle=False, num_workers=0)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=0)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=1, shuffle=False, num_workers=0)

100%|██████████| 766/766 [00:04<00:00, 167.11it/s]
100%|██████████| 293/293 [00:01<00:00, 179.14it/s]
100%|██████████| 511/511 [00:03<00:00, 134.65it/s]


In [229]:
class CNN1DModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        
        self.conv1 = nn.LazyConv1d(64,3)
        self.amp1 = nn.AdaptiveMaxPool1d(128)
        self.bn1 = nn.LazyBatchNorm1d()
        self.conv2 = nn.LazyConv1d(128,4)
        self.amp2 = nn.AdaptiveMaxPool1d(128)
        self.bn2 = nn.LazyBatchNorm1d()
        self.conv3 = nn.LazyConv1d(64,3)
        self.flatten1 = nn.Flatten()
        self.lin1 = nn.LazyLinear(64)
        self.lin2 = nn.LazyLinear(64)
        self.linOut = nn.LazyLinear(10)
        

        
    def forward(self, x):
        
        x = self.conv1(x)
        x = F.relu(x)
        x = self.amp1(x)
        x = self.bn1(x)
        x = F.dropout(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.amp2(x)
        x = self.bn2(x)
        x = F.dropout(x)
        x = self.conv3(x)
        x = F.relu(x)
        x = self.flatten1(x)
        x = self.lin1(x)
        x = F.relu(x)
        x = F.dropout(x)
        x = self.lin2(x)
        x = F.relu(x)
        x = self.linOut(x)
        # x = F.softmax(x
        # x = F.sigmoid(x)
        x = F.relu(x)
        return x
    

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        x = self.forward(x)
        loss = F.mse_loss(y.reshape(-1,1,10), x.reshape(-1,1,10))
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        x = self.forward(x)
        loss = F.mse_loss(y.reshape(-1,1,10), x.reshape(-1,1,10))
        self.log('val_loss', loss, prog_bar=True)
        
    def predict_step(self, x, batch_idx):
        if type(x) == type([1,2]):
            x, y = x
            return self(x).numpy()
        else:
            return self(x).numpy()


In [230]:
model = CNN1DModel()
# model = CNN1DModel().load_from_checkpoint(checkpoint_path="checkpoints/EGAMSPred-epoch=195-val_loss=0.13.ckpt")
# model = CNN1DModel().load_from_checkpoint(checkpoint_path="checkpoints/manual-1.ckpt")



In [231]:
# shape is batch,features,samples
testx = torch.Tensor([np.random.rand(1,4,20)]).reshape(1,4,-1)
model(testx)

tensor([[0.0072, 0.0330, 0.0976, 0.0000, 0.0471, 0.0000, 0.0000, 0.0000, 0.0661,
         0.0836]], grad_fn=<ReluBackward0>)

In [232]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    dirpath="checkpoints/",
    filename="CNN1DV7SoftmaxDropout-{epoch:02d}-{val_loss:.2f}",
    save_top_k=3,
    mode="min",
)
# trainer = pl.Trainer(callbacks=[checkpoint_callback])
trainer = pl.Trainer()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [233]:
trainer.fit(model, train_loader, val_loader)


   | Name     | Type              | Params
------------------------------------------------
0  | conv1    | Conv1d            | 832   
1  | amp1     | AdaptiveMaxPool1d | 0     
2  | bn1      | BatchNorm1d       | 128   
3  | conv2    | Conv1d            | 32.9 K
4  | amp2     | AdaptiveMaxPool1d | 0     
5  | bn2      | BatchNorm1d       | 256   
6  | conv3    | Conv1d            | 24.6 K
7  | flatten1 | Flatten           | 0     
8  | lin1     | Linear            | 516 K 
9  | lin2     | Linear            | 4.2 K 
10 | linOut   | Linear            | 650   
------------------------------------------------
579 K     Trainable params
0         Non-trainable params
579 K     Total params
2.319     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [234]:
# trainer.save_checkpoint("checkpoints/manual-1.ckpt")


In [235]:
a = trainer.validate(model,val_loader)

Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.1539989709854126}
--------------------------------------------------------------------------------


In [236]:
model(train_ds[0][0].reshape(1,4,-1))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], grad_fn=<ReluBackward0>)

In [237]:
pred_val = np.array(trainer.predict(model, dataloaders=val_loader)).reshape(-1,10)
pred_test = np.array(trainer.predict(model, dataloaders=test_loader)).reshape(-1,10)

  rank_zero_warn(


Predicting: 645it [00:00, ?it/s]

Predicting: 645it [00:00, ?it/s]

In [238]:
pred_res_df = np.concatenate((pred_val,pred_test))
pred_res_df.shape

(804, 10)

In [239]:
pred_res_df = pd.DataFrame(pred_res_df, columns=['basalt','carbonate','chloride','iron_oxide','oxalate','oxychlorine','phyllosilicate','silicate','sulfate','sulfide'])
ex_sub = pd.read_csv('data/submission_format.csv').drop(['basalt','carbonate','chloride','iron_oxide','oxalate','oxychlorine','phyllosilicate','silicate','sulfate','sulfide'],axis=1)
pred = ex_sub.join(pred_res_df)

In [240]:
pred

Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,S0767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,S0768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,S0769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,S0770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
799,S1565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
800,S1566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
801,S1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
802,S1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [214]:
pred.to_csv("preds/3_20-1.csv",index=False)