In [56]:
import numpy as np
import pandas as pd
from scipy.stats import zscore 
from sklearn.cluster import KMeans 
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler 
from tqdm.auto import tqdm

In [21]:
train = pd.read_csv("train_data.csv") 
test = pd.read_csv("test_data.csv") 

In [37]:
pipeline = Pipeline([("normalizer", Normalizer()), 
                     ("scaler", MinMaxScaler())]) 

In [38]:
pipeline.fit(train)

Pipeline(steps=[('normalizer', Normalizer()), ('scaler', MinMaxScaler())])

In [39]:
X_train_transformed = pipeline.transform(train) 

In [42]:
X_train_transformed.shape

(2463, 8)

In [46]:
class AutoEncoder(nn.Module): 
    def __init__(self, input_dim): 
        self.input_dim = input_dim
        super(AutoEncoder, self).__init__() 
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim, 6), 
            nn.ELU(), 
            nn.Linear(6, 4), 
            nn.ELU(), 
            nn.Linear(4,2), 
            nn.ELU()
        ) 
        self.decoder = nn.Sequential(
            nn.Linear(2, 4), 
            nn.ELU(), 
            nn.Linear(4, 6), 
            nn.ELU(), 
            nn.Linear(6, self.input_dim)
        ) 
    def forward(self, x):
        encoded = self.encoder(x) 
        decoded = self.decoder(encoded) 
        return decoded

In [48]:
model = AutoEncoder(X_train_transformed.shape[1]) 
model

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=8, out_features=6, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=6, out_features=4, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=4, out_features=2, bias=True)
    (5): ELU(alpha=1.0)
  )
  (decoder): Sequential(
    (0): Linear(in_features=2, out_features=4, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=4, out_features=6, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=6, out_features=8, bias=True)
  )
)

In [53]:
X_train_transformed = torch.tensor(X_train_transformed).float() 

X_train_transformed.shape

torch.Size([2463, 8])

In [54]:
train_data = TensorDataset(X_train_transformed) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32) 

In [74]:
device = torch.device("cuda") 
model = model.to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss() 

for epoch in tqdm(range(1, 2001), position=0, leave=True, desc="Epochs"):
    train_loss = 0 
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch) 
        b_inputs = batch[0] 
        decoded = model(b_inputs)
        loss = criterion(decoded, b_inputs) 
        train_loss += loss.item() 
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()  
    if epoch > 0 and epoch % 100 == 0: 
        print(f"epoch : {epoch} | train loss : {train_loss}") 
    

Epochs:   0%|          | 0/2000 [00:00<?, ?it/s]

epoch : 100 | train loss : 0.3221131565514952
epoch : 200 | train loss : 0.31695394427515566
epoch : 300 | train loss : 0.3018198012141511
epoch : 400 | train loss : 0.24394843191839755
epoch : 500 | train loss : 0.040167459985241294
epoch : 600 | train loss : 0.038549523713300005
epoch : 700 | train loss : 0.03713708431314444
epoch : 800 | train loss : 0.03650548389850883
epoch : 900 | train loss : 0.036080470024899114
epoch : 1000 | train loss : 0.0353011174047424
epoch : 1100 | train loss : 0.03510962744621793
epoch : 1200 | train loss : 0.035686783343408024
epoch : 1300 | train loss : 0.03437821094485116
epoch : 1400 | train loss : 0.034239507807797054
epoch : 1500 | train loss : 0.034702274129813304
epoch : 1600 | train loss : 0.03413312944758218
epoch : 1700 | train loss : 0.03415448812302202
epoch : 1800 | train loss : 0.034417207149090245
epoch : 1900 | train loss : 0.034078514167049434
epoch : 2000 | train loss : 0.03393981237968546


In [75]:
torch.save(model.state_dict(), "autoencoder_chkpt.pt")

In [82]:
X_test_transformed = pipeline.transform(test) 


In [86]:
X_test_transformed = torch.tensor(X_test_transformed).float() 

test_data = TensorDataset(X_test_transformed) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1) 

  X_test_transformed = torch.tensor(X_test_transformed).float()


In [87]:
test_mse_scores = [] 

model.eval() 

with torch.no_grad(): 
    for step, batch in tqdm(enumerate(test_dataloader), position=0, leave=True, desc="Reconstructing test dataset"): 
        batch = tuple(t.to(device) for t in batch) 
        b_inputs = batch[0] 
        decoded = model(b_inputs) 
        loss = criterion(decoded, b_inputs)
        test_mse_scores.append(loss.item()) 

Reconstructing test dataset: 0it [00:00, ?it/s]

In [91]:
# using the median absolute deviation method to define outliers 

gamma = 3 

def mad_score(points): 
    m = np.median(points)
    ad = np.abs(points - m) 
    mad = np.median(ad) 
    return 0.6745 * ad / mad 


z_scores = mad_score(test_mse_scores) 

In [93]:
outliers = z_scores > gamma 

outliers

array([False, False, False, ..., False, False, False])

In [95]:
outliers = outliers.astype(int) 
outliers

array([0, 0, 0, ..., 0, 0, 0])

In [99]:
# 정상: 0, 이상: 1 
submission = pd.read_csv("answer_sample.csv") 
submission["label"] = outliers 

In [101]:
submission.to_csv("autoencoders.csv", index=False) 