In [61]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, PowerTransformer 
from sklearn.ensemble import IsolationForest 
from sklearn import svm  
from sklearn.mixture import GaussianMixture 
from sklearn.isotonic import IsotonicRegression
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim

In [62]:
train = pd.read_csv("train_data.csv") 
test = pd.read_csv("test_data.csv")  

In [63]:
# shuffle dataframe 
train = train.sample(frac=1, random_state=42).reset_index(drop=True) 

In [64]:
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,3.04,60.24,0.7,39.34,3219.0,75.77,3.79,5
1,3.15,61.63,0.7,40.7,3330.0,77.0,3.85,7
2,2.12,48.04,0.7,27.41,2243.0,64.92,3.25,4
3,0.88,56.38,0.7,14.0,3150.0,72.0,3.05,2
4,2.53,53.49,0.7,32.74,2679.0,69.77,3.49,6


In [65]:
scaler = StandardScaler() 

X_continuous = train[["air_inflow", "air_end_temp", "out_pressure", "motor_current", "motor_rpm", "motor_temp", "motor_vibe"]] 
X_categorical = train["type"] 

X_continuous = scaler.fit_transform(X_continuous)  

In [66]:
X_categorical_onehot = [] 

num_classes = len(np.unique(X_categorical)) 

for i in range(len(X_categorical)): 
    arr = [0 for _ in range(8)] 
    arr[X_categorical[i]] = 1 
    X_categorical_onehot.append(arr)  

In [67]:
class AutoEncoder(nn.Module): 
    def __init__(self, input_dim, n_categories, embedding_dim): 
        super(AutoEncoder, self).__init__() 
        self.input_dim = input_dim
        self.n_categories = n_categories 
        self.embedding_dim = embedding_dim 
        self.embedding = nn.Embedding(n_categories, embedding_dim) 
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim + self.embedding_dim, 10), 
            nn.ELU(), 
            nn.Linear(10, 5)
        ) 
        self.decoder = nn.Sequential(
            nn.Linear(5, 10), 
            nn.ELU(), 
            nn.Linear(10, self.input_dim + self.embedding_dim)
        ) 
    def forward(self, x_continuous, x_categorical): 
        x_embedded = self.embedding(x_categorical) 
        x_embedded = torch.mean(x_embedded, dim=1) 
        x = torch.cat([x_continuous, x_embedded], dim=1) 
        encoded = self.encoder(x) 
        decoded = self.decoder(encoded) 
        return x, decoded

In [68]:
batch_size = 32 

X_continuous = torch.tensor(X_continuous).float() 
X_categorical_onehot = torch.tensor(X_categorical_onehot, dtype=int) 

X_categorical_onehot.shape

torch.Size([2463, 8])

In [69]:
train_data = TensorDataset(X_continuous, X_categorical_onehot) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)  

In [70]:
device = torch.device("cuda") 
model = AutoEncoder(input_dim=7, n_categories=8, embedding_dim=16) 
model = model.to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss() 
best_loss = 99999999999 
best_epoch = -1 

for epoch in tqdm(range(2000), position=0, leave=True, desc="Epochs"):
    train_loss = 0 
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch) 
        b_continuous, b_categorical = batch  
        b_inputs, decoded = model(b_continuous, b_categorical)
        loss = criterion(decoded, b_inputs) 
        train_loss += loss.item() 
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()  
    if train_loss < best_loss: 
        best_loss = train_loss 
        best_epoch = epoch 
        torch.save(model.state_dict(), "Best_AE_Categorical.pt")  
    
    avg_train_loss = train_loss / len(train_dataloader) 
    if epoch%200 == 0 and epoch > 0: 
        print(f"average train loss : {avg_train_loss}") 

print(f"best loss : {best_loss} | best epoch: {best_epoch}")

Epochs:   0%|          | 0/2000 [00:00<?, ?it/s]

average train loss : 8.129753877693037e-05
average train loss : 1.4757343416766327e-05
average train loss : 3.5069344682136496e-06
average train loss : 1.6267119408380555e-05
average train loss : 1.0293354534951504e-05
average train loss : 3.0102474338550505e-06
average train loss : 5.468058240588213e-06
average train loss : 1.1089075923183527e-05
average train loss : 2.344090823392541e-06
best loss : 0.0001380180854084756 | best epoch: 1989


In [71]:
X_test_continuous = test[["air_inflow", "air_end_temp", "out_pressure", "motor_current", "motor_rpm", "motor_temp", "motor_vibe"]] 
X_test_categorical = test["type"]

X_test_continuous = scaler.transform(X_test_continuous)  

In [72]:
X_test_categorical_onehot = [] 

for i in range(len(X_test_categorical)): 
    arr = [0 for _ in range(num_classes)] 
    arr[X_test_categorical[i]] = 1 
    X_test_categorical_onehot.append(arr)  

In [78]:
batch_size = 1 
X_test_continuous = torch.tensor(X_test_continuous).float() 
X_test_categorical_onehot = torch.tensor(X_test_categorical_onehot, dtype=int) 

X_test_continuous.shape, X_test_categorical_onehot.shape

  X_test_continuous = torch.tensor(X_test_continuous).float()
  X_test_categorical_onehot = torch.tensor(X_test_categorical_onehot, dtype=int)


(torch.Size([7389, 7]), torch.Size([7389, 8]))

In [79]:
test_data = TensorDataset(X_test_continuous, X_test_categorical_onehot) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)  

In [80]:
best_model = AutoEncoder(input_dim=7, n_categories=8, embedding_dim=16) 
checkpoint = torch.load("Best_AE_Categorical.pt") 
print(best_model.load_state_dict(checkpoint)) 
best_model.to(device)

<All keys matched successfully>


AutoEncoder(
  (embedding): Embedding(8, 16)
  (encoder): Sequential(
    (0): Linear(in_features=23, out_features=10, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=10, out_features=5, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=10, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=10, out_features=23, bias=True)
  )
)

In [81]:
criterion = nn.MSELoss() 

best_model.eval() 

test_mse_scores = [] 

with torch.no_grad():
    for step, batch in tqdm(enumerate(test_dataloader), position=0, leave=True): 
        batch = tuple(t.to(device) for t in batch) 
        b_continuous, b_categorical = batch  
        b_inputs, decoded = model(b_continuous, b_categorical)
        loss = criterion(decoded, b_inputs) 
        test_mse_scores.append(loss.item())  

0it [00:00, ?it/s]

In [82]:
def mad_score(points): 
    m = np.median(points) 
    ad = np.abs(points - m) 
    mad = np.median(ad) 
    return 0.6745 * ad / mad 

z_scores = mad_score(test_mse_scores)

In [93]:
gammas = [50.5, 51.5, 60.5, 70.5, 80.5, 90.5, 100.5, 110.5, 120.5, 130.5, 140.5, 150.5, 160.5, 170.5, 180.5, 190.5] 

for gamma in gammas: 
    outliers = z_scores > gamma 
    outliers = outliers.astype(int) 
    submission = pd.read_csv("answer_sample.csv") 
    submission["label"] = outliers
    
    cnt = 0 
    for i in range(len(outliers)): 
        if outliers[i] == 1: 
            cnt += 1 

    submission.to_csv(f"AE_Categorical_{gamma}_cnt_{cnt}.csv",index=False)  
    
    print(f"gamma:{gamma}, cnt:{cnt}") 


gamma:50.5, cnt:399
gamma:51.5, cnt:393
gamma:60.5, cnt:359
gamma:70.5, cnt:350
gamma:80.5, cnt:345
gamma:90.5, cnt:344
gamma:100.5, cnt:344
gamma:110.5, cnt:343
gamma:120.5, cnt:343
gamma:130.5, cnt:343
gamma:140.5, cnt:343
gamma:150.5, cnt:343
gamma:160.5, cnt:343
gamma:170.5, cnt:343
gamma:180.5, cnt:343
gamma:190.5, cnt:343
