In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import IsolationForest 
from sklearn import svm  
from sklearn.mixture import GaussianMixture 
from sklearn.isotonic import IsotonicRegression
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.auto import tqdm

In [2]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv"))

In [3]:
# Remove the "type" column from both splits; each frame is modified in place.
for frame in (train, test):
    frame.drop(columns=["type"], inplace=True)

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

class Encoder(nn.Module):
    """Small MLP that maps an input vector to a latent code.

    Layout: Linear(input_dim -> hidden_dim), Tanh, Linear(hidden_dim -> z_dim).
    """

    def __init__(self, input_dim, hidden_dim, z_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, z_dim),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("encoder.0.*", "encoder.2.*"), so both are kept as-is.
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the latent code for `x`; the last dimension of `x` must be `input_dim`."""
        latent = self.encoder(x)
        return latent


class Decoder(nn.Module):
    """Small MLP that maps a latent code back to the input space.

    Layout: Linear(z_dim -> hidden_dim), Tanh, Linear(hidden_dim -> output_dim).
    """

    def __init__(self, z_dim, hidden_dim, output_dim):
        super().__init__()
        layers = [
            nn.Linear(z_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, output_dim),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("decoder.0.*", "decoder.2.*"), so both are kept as-is.
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the reconstruction for latent code `x` (last dimension `z_dim`)."""
        reconstruction = self.decoder(x)
        return reconstruction


class Estimation(nn.Module):
    """MLP head that turns a latent code into a probability vector.

    Layout: Linear(z_dim -> hidden_dim), Tanh, Linear(hidden_dim -> output_dim),
    Softmax over dimension 1. Because the softmax is taken over dim=1, the input
    is expected to be 2-D with the batch on dimension 0.
    """

    def __init__(self, z_dim, hidden_dim, output_dim):
        super().__init__()
        layers = [
            nn.Linear(z_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=1),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("estimation.0.*", "estimation.2.*"), so both are kept as-is.
        self.estimation = nn.Sequential(*layers)

    def forward(self, x):
        """Return one row of `output_dim` probabilities (summing to 1) per input row."""
        probabilities = self.estimation(x)
        return probabilities


class DAGMM(nn.Module):
    """Autoencoder with an estimation head attached to the latent code.

    The encoder compresses the input to `z_dim` values, the decoder reconstructs
    the input from that code, and the estimation network maps the same code to
    `n_gmm` softmax outputs. The estimation network receives only the latent
    code; no reconstruction-error features are appended to it.
    """

    def __init__(self, input_dim, hidden_dim, z_dim, n_gmm):
        super().__init__()
        # Sub-modules are created in the order encoder, decoder, estimation so
        # that parameter registration (and seeded initialisation) is unchanged.
        self.encoder = Encoder(input_dim, hidden_dim, z_dim)
        self.decoder = Decoder(z_dim, hidden_dim, input_dim)
        self.estimation = Estimation(z_dim, hidden_dim, n_gmm)

    def forward(self, x):
        """Return `(reconstruction, latent_code, estimation_output)` for `x`."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        membership = self.estimation(latent)
        return reconstruction, latent, membership


def train_dagmm(model, dataset, batch_size, epochs, learning_rate, device,
                checkpoint_path="best_dagmm.pt"):
    """Train `model` on reconstruction error and checkpoint the best epoch.

    Only the mean-squared reconstruction error between the model's first output
    and the input is optimised. The latent code and the estimation-network
    output returned by the model are not part of the loss, so the estimation
    network's parameters receive no gradient here.

    Args:
        model: module whose forward pass returns a 3-tuple with the
            reconstruction first.
        dataset: dataset whose items are tuples with the input tensor at index 0.
        batch_size: mini-batch size for the shuffling DataLoader.
        epochs: number of passes over `dataset`.
        learning_rate: learning rate for the Adam optimiser.
        device: torch device the model and batches are moved to.
        checkpoint_path: file the best `state_dict` is written to. Defaults to
            the previously hard-coded "best_dagmm.pt".

    Returns:
        None. The best summed epoch loss and its epoch index are printed, and
        the corresponding weights are saved to `checkpoint_path`.
    """
    model.to(device)
    # Make training mode explicit in case the caller left the model in eval mode.
    model.train()
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # Start from +inf so the first finite epoch loss always produces a checkpoint,
    # however large it is (a fixed numeric sentinel could be exceeded).
    best_loss = float("inf")
    best_epoch = -1
    for epoch in tqdm(range(epochs), position=0, leave=True):
        # Sum of the per-batch mean losses, each measured before that batch's update.
        train_loss = 0.0
        for batch in dataloader:
            x = batch[0].to(device)
            optimizer.zero_grad()
            x_hat = model(x)[0]
            loss = criterion(x_hat, x)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Keep the weights from the epoch with the lowest summed loss so far.
        if train_loss < best_loss:
            best_loss = train_loss
            best_epoch = epoch
            torch.save(model.state_dict(), checkpoint_path)
    print(f"best loss: {best_loss} | best epoch: {best_epoch}")

# Model and training hyperparameters shared by the cells that follow.
input_dim = 7  # size of each input vector (in_features of the first encoder layer)
hidden_dim = 3  # width of the hidden layer in the encoder, decoder and estimation nets
z_dim = 2  # size of the latent code
n_gmm = 2  # number of softmax outputs of the estimation network
batch_size = 32
epochs = 2000
learning_rate = 1e-3
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



In [5]:
# Seed PyTorch's global RNG before building the model so that weight
# initialisation is repeatable. When the cells run in order, the shuffling
# DataLoader draws from the same RNG, so training becomes repeatable too.
SEED = 42
torch.manual_seed(SEED)
dagmm = DAGMM(input_dim, hidden_dim, z_dim, n_gmm)


In [6]:
# Fit the standardiser on the training split only, then convert the scaled
# features to a float32 tensor wrapped in a TensorDataset.
scaler = StandardScaler().fit(train)
train = torch.tensor(scaler.transform(train), dtype=torch.float32)
train_data = TensorDataset(train)

In [7]:
train_dagmm(
    model=dagmm,
    dataset=train_data,
    batch_size=batch_size,
    epochs=epochs,
    learning_rate=learning_rate,
    device=device,
)


  0%|          | 0/2000 [00:00<?, ?it/s]

best loss: 0.13425777381053194 | best epoch: 1989


In [8]:
# Scale the test split with the scaler fitted on the training data, then serve
# it one row at a time in its original order.
test = torch.tensor(scaler.transform(test), dtype=torch.float32)
test_data = TensorDataset(test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(dataset=test_data, batch_size=1, sampler=test_sampler)

In [10]:
# Rebuild the architecture and restore the weights saved during training.
best_dagmm = DAGMM(input_dim, hidden_dim, z_dim, n_gmm)
# map_location lets a checkpoint written on a CUDA device load on a CPU-only machine.
checkpoint = torch.load("best_dagmm.pt", map_location=device)
print(best_dagmm.load_state_dict(checkpoint))
best_dagmm.to(device)

# reduction="none" keeps the element-wise squared errors so they can be averaged
# per row below; the scores then stay per-sample for any DataLoader batch size.
criterion = nn.MSELoss(reduction="none")

best_dagmm.eval()

# One reconstruction MSE per test row, in DataLoader order.
test_mse_scores = []

with torch.no_grad():
    # Iterate the DataLoader directly (no enumerate) so tqdm can show the total.
    for batch in tqdm(test_dataloader, position=0, leave=True):
        x = batch[0].to(device)
        x_hat = best_dagmm(x)[0]
        # Assumes batches are 2-D (rows, features): average the error over features.
        per_sample_mse = criterion(x_hat, x).mean(dim=1)
        test_mse_scores.extend(per_sample_mse.tolist())

<All keys matched successfully>


0it [00:00, ?it/s]

In [16]:
# Cut-off applied to the scores returned by mad_score: values above it are labelled as outliers.
gamma = 10 

def mad_score(points):
    """Return the MAD-based modified z-score of every value in `points`.

    Each score is ``0.6745 * |x - median| / MAD``, where MAD is the median of
    the absolute deviations from the median.

    Args:
        points: array-like of numbers (a list or a NumPy array).

    Returns:
        Float NumPy array of scores with one entry per input value. Empty input
        gives an empty array. When MAD is 0 (more than half of the values equal
        the median), values equal to the median score 0.0 and all others score
        ``inf``, so no NaN is produced.
    """
    values = np.asarray(points, dtype=float)
    if values.size == 0:
        # np.median of an empty array would warn and return NaN.
        return values
    m = np.median(values)
    ad = np.abs(values - m)
    mad = np.median(ad)
    if mad == 0:
        # Avoid 0/0 -> NaN and x/0 runtime warnings; this is the limit of the formula.
        return np.where(ad == 0, 0.0, np.inf)
    return 0.6745 * ad / mad

# Modified z-score of each test sample's reconstruction error.
z_scores = mad_score(test_mse_scores)

In [17]:
# Mark samples whose score exceeds the threshold with 1 (others 0) and place
# the flags in the "label" column of the submission template.
outliers = (z_scores > gamma).astype(int)
submission = pd.read_csv("answer_sample.csv").assign(label=outliers)

In [22]:
# Write the labelled submission to disk without the DataFrame index column.
submission.to_csv("dagmm__.csv",index=False) 

In [19]:
# Number of test samples flagged as outliers (label == 1), counted in one
# vectorised call instead of a Python loop over the array.
cnt = int(np.count_nonzero(outliers == 1))

cnt

443

In [20]:
# Load a second labels file for comparison with the labels produced above.
# NOTE(review): presumably predictions from an earlier autoencoder run (inferred from the file name) - confirm.
df = pd.read_csv("autoencoders_3_5_.csv")

In [21]:
# Number of rows labelled 1 in the comparison file, counted with a vectorised
# comparison instead of re-reading df["label"].values on every loop iteration.
cnt = int((df["label"] == 1).sum())

cnt

386