# SCARF: Self-Supervised Contrastive Learning using Random Feature Corruption

![image](SCARF_Schema.png)

This file contains an implementation of SCARF on a publicly available [Darwin dataset](https://archive.ics.uci.edu/dataset/732/darwin). 
The dataset contains 451 features across 174 participants with 2 classes: Alzheimer's disease patients or healthy.
The goal is to  learn a tabular data representation in an unsupervised or self-supervised such that the learnt representations can be good at the classification task.

Adapted from [pytorch implementation of SCARF](https://github.com/clabrugere/pytorch-scarf/tree/master).

In [48]:
# importing packages
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import os
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.metrics import (ConfusionMatrixDisplay, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from scarf import scarf_model, loss

In [9]:
# check the current directory, set it as scarf
if os.getcwd()!= '/home/trips/ContrastiveLearning_Tutorial/scarf':  # replace the path with your local path
    os.chdir('/home/trips/ContrastiveLearning_Tutorial/scarf')
print("Current working directory is : ", os.getcwd())

Current working directory is :  /home/trips/ContrastiveLearning_Tutorial/scarf


In [38]:
# function for initial loading and processing of the dataset
def load_dataset(dataset):
    data_file = os.path.join('../Small_datasets', dataset, dataset + ".csv")
    df = pd.read_csv(data_file)
    
    # convert the target into numeric
    df.loc[df['class']=='P','class']=1
    df.loc[df['class']=='H','class']=0
    
    data, target = df.iloc[:,1:], df.iloc[:,-1]  # first column is the id and last column is the target


    train_data, test_data, train_target, test_target = train_test_split(
        data, 
        target, 
        test_size=0.2, 
        stratify=target, 
        random_state=seed
    )

    # preprocess
    constant_cols = [c for c in train_data.columns if train_data[c].nunique() == 1]
    train_data.drop(columns=constant_cols, inplace=True)
    test_data.drop(columns=constant_cols, inplace=True)

    scaler = StandardScaler()
    train_data = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)
    test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)

    # to torch dataset
    train_ds = scarf_model.ExampleDataset(
        train_data.to_numpy(), 
        train_target.to_numpy(), 
        columns=train_data.columns
    )
    test_ds = scarf_model.ExampleDataset(
        test_data.to_numpy(), 
        test_data.to_numpy(), 
        columns=test_data.columns
    )

    print(f"Train set: {train_ds.shape}")
    print(f"Test set: {test_ds.shape}")
    train_ds.to_dataframe().head()

    return train_ds, train_target, test_ds, test_target
    

In [45]:
# initial setting and the dataset choice

dataset = "Darwin" # dataset name
batch_size=128
epochs = 100
lr=0.001  # learning rate
repr_dims=16  # representation dimension
seed = 100  # random seed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
corruptionRate = 0.6

In [46]:
# loading data
train_data, train_labels, test_data, test_labels = load_dataset(dataset)

# data dimensions 
print(" Training data size :  ", train_data.shape)  # Number of samples * Length of the series * number of features
print(" Test data size :  ", test_data.shape)

Train set: (139, 451)
Test set: (35, 451)
 Training data size :   (139, 451)
 Test data size :   (35, 451)


In [49]:
# Setting up the data laoder and initializing the models
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

model = scarf_model.SCARF(
    input_dim=train_data.shape[1], 
    emb_dim=repr_dims,
    corruption_rate=corruptionRate,
).to(device)
optimizer = Adam(model.parameters(), lr=lr)
ntxent_loss = NTXent()

NameError: name 'NTXent' is not defined