In [8]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, PowerTransformer 
from sklearn.ensemble import IsolationForest 
from sklearn import svm  
from sklearn.mixture import GaussianMixture 
from sklearn.isotonic import IsotonicRegression
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA 

In [24]:
train = pd.read_csv("train_data.csv") 
test = pd.read_csv("test_data.csv")  

In [25]:
# shuffle dataframe 
train = train.sample(frac=1, random_state=42).reset_index(drop=True) 

In [11]:
train["specific_power"] = train["motor_current"] / train["air_inflow"] 
train["compression_ratio"] = train["out_pressure"] / train["air_end_temp"] 
train["motor_efficiency"] = train["motor_rpm"] / train["motor_current"]  
train["temperature_difference"] = train["air_end_temp"] - train["motor_temp"] 
train["pr_ratio"] = train["out_pressure"] / train["motor_rpm"] 

In [13]:
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,specific_power,compression_ratio,motor_efficiency,temperature_difference,pr_ratio
0,3.04,60.24,0.7,39.34,3219.0,75.77,3.79,5,12.940789,0.01162,81.825114,-15.53,0.000217
1,3.15,61.63,0.7,40.7,3330.0,77.0,3.85,7,12.920635,0.011358,81.818182,-15.37,0.00021
2,2.12,48.04,0.7,27.41,2243.0,64.92,3.25,4,12.929245,0.014571,81.831448,-16.88,0.000312
3,0.88,56.38,0.7,14.0,3150.0,72.0,3.05,2,15.909091,0.012416,225.0,-15.62,0.000222
4,2.53,53.49,0.7,32.74,2679.0,69.77,3.49,6,12.940711,0.013087,81.826512,-16.28,0.000261


In [26]:
test["specific_power"] = test["motor_current"] / test["air_inflow"] 
test["compression_ratio"] = test["out_pressure"] / test["air_end_temp"] 
test["motor_efficiency"] = test["motor_rpm"] / test["motor_current"]  
test["temperature_difference"] = test["air_end_temp"] - test["motor_temp"] 
test["pr_ratio"] = test["out_pressure"] / test["motor_rpm"] 

In [14]:
scaler = StandardScaler() 

X_continuous = train[["air_inflow", "air_end_temp", "out_pressure", "motor_current", "motor_rpm", "motor_temp", "motor_vibe", "specific_power", "compression_ratio", "motor_efficiency", "temperature_difference", "pr_ratio"]] 
X_categorical = train["type"] 

X_continuous = scaler.fit_transform(X_continuous)  

In [30]:
X_continuous.shape

torch.Size([2463, 12])

In [16]:
X_categorical_onehot = [] 

num_classes = len(np.unique(X_categorical)) 

for i in range(len(X_categorical)): 
    arr = [0 for _ in range(num_classes)] 
    arr[X_categorical[i]] = 1 
    X_categorical_onehot.append(arr)  

In [17]:
class AutoEncoder(nn.Module): 
    def __init__(self, input_dim, n_categories, embedding_dim): 
        super(AutoEncoder, self).__init__() 
        self.input_dim = input_dim
        self.n_categories = n_categories 
        self.embedding_dim = embedding_dim 
        self.embedding = nn.Embedding(n_categories, embedding_dim) 
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim + self.embedding_dim, 10), 
            nn.ELU(), 
            nn.Linear(10, 5)
        ) 
        self.decoder = nn.Sequential(
            nn.Linear(5, 10), 
            nn.ELU(), 
            nn.Linear(10, self.input_dim + self.embedding_dim)
        ) 
    def forward(self, x_continuous, x_categorical): 
        x_embedded = self.embedding(x_categorical) 
        x_embedded = torch.mean(x_embedded, dim=1) 
        x = torch.cat([x_continuous, x_embedded], dim=1) 
        encoded = self.encoder(x) 
        decoded = self.decoder(encoded) 
        return x, decoded

In [18]:
batch_size = 32 

X_continuous = torch.tensor(X_continuous).float() 
X_categorical_onehot = torch.tensor(X_categorical_onehot, dtype=int) 

X_categorical_onehot.shape

torch.Size([2463, 8])

In [19]:
train_data = TensorDataset(X_continuous, X_categorical_onehot) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)  

In [22]:
device = torch.device("cuda") 
model = AutoEncoder(input_dim=X_continuous.shape[1], n_categories=8, embedding_dim=16) 
model = model.to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss() 
best_loss = 99999999999 
best_epoch = -1 

epochs = 2000 

for epoch in tqdm(range(epochs), position=0, leave=True, desc="Epochs"):
    train_loss = 0 
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch) 
        b_continuous, b_categorical = batch  
        b_inputs, decoded = model(b_continuous, b_categorical)
        loss = criterion(decoded, b_inputs) 
        train_loss += loss.item() 
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()  
    avg_train_loss = train_loss / len(train_dataloader) 
    
    if avg_train_loss < best_loss: 
        best_loss = avg_train_loss 
        best_epoch = epoch 
        torch.save(model.state_dict(), "AE_Categorical_FE_GPT_.pt")  
    
    if epoch%200 == 0 and epoch > 0: 
        print(f"average train loss : {avg_train_loss}") 

print(f"best loss : {best_loss} | best epoch: {best_epoch}")

Epochs:   0%|          | 0/2000 [00:00<?, ?it/s]

average train loss : 0.00010685088754935468
average train loss : 4.324807496113981e-05
average train loss : 2.856063623436388e-05
average train loss : 5.5026583424194825e-05
average train loss : 2.3380810489470605e-05
average train loss : 2.0753952483502944e-05
average train loss : 0.0001237795471185221
average train loss : 2.0080460813773385e-05
average train loss : 1.5156714334216894e-05
best loss : 1.2946776548363101e-05 | best epoch: 1992


In [27]:
X_test_continuous = test[["air_inflow", "air_end_temp", "out_pressure", "motor_current", "motor_rpm", "motor_temp", "motor_vibe", "specific_power", "compression_ratio", "motor_efficiency", "temperature_difference", "pr_ratio"]] 
X_test_categorical = test["type"]

X_test_continuous = scaler.transform(X_test_continuous)  

In [28]:
X_test_categorical_onehot = [] 

for i in range(len(X_test_categorical)): 
    arr = [0 for _ in range(num_classes)] 
    arr[X_test_categorical[i]] = 1 
    X_test_categorical_onehot.append(arr)  

In [29]:
batch_size = 1 
X_test_continuous = torch.tensor(X_test_continuous).float() 
X_test_categorical_onehot = torch.tensor(X_test_categorical_onehot, dtype=int) 

X_test_continuous.shape, X_test_categorical_onehot.shape

(torch.Size([7389, 12]), torch.Size([7389, 8]))

In [31]:
test_data = TensorDataset(X_test_continuous, X_test_categorical_onehot) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)  

In [34]:
best_model = AutoEncoder(input_dim=12, n_categories=8, embedding_dim=16) 
checkpoint = torch.load("AE_Categorical_FE_GPT_.pt") 
print(best_model.load_state_dict(checkpoint)) 
best_model.to(device)

<All keys matched successfully>


AutoEncoder(
  (embedding): Embedding(8, 16)
  (encoder): Sequential(
    (0): Linear(in_features=28, out_features=10, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=10, out_features=5, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=10, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=10, out_features=28, bias=True)
  )
)

In [35]:
criterion = nn.MSELoss() 

best_model.eval() 

test_mse_scores = [] 

with torch.no_grad():
    for step, batch in tqdm(enumerate(test_dataloader), position=0, leave=True): 
        batch = tuple(t.to(device) for t in batch) 
        b_continuous, b_categorical = batch  
        b_inputs, decoded = model(b_continuous, b_categorical)
        loss = criterion(decoded, b_inputs) 
        test_mse_scores.append(loss.item())  

0it [00:00, ?it/s]

In [36]:
def mad_score(points): 
    m = np.median(points) 
    ad = np.abs(points - m) 
    mad = np.median(ad) 
    return 0.6745 * ad / mad 

z_scores = mad_score(test_mse_scores)

In [37]:
gammas = [3.5, 5.5, 10.5, 20.5, 30.5, 40.5, 50.5, 60.5, 70.5, 80.5, 90.5, 100.5, 110.5, 120.5, 130.5, 140.5, 150.5, 160.5, 170.5, 180.5, 190.5, 200.5] 

for gamma in gammas: 
    outliers = z_scores > gamma 
    outliers = outliers.astype(int) 
    submission = pd.read_csv("answer_sample.csv") 
    submission["label"] = outliers
    
    cnt = 0 
    for i in range(len(outliers)): 
        if outliers[i] == 1: 
            cnt += 1 
    
    if gamma == 200.5:
        submission.to_csv(f"GPT_AE_Categorical_{gamma}_cnt_{cnt}.csv",index=False)  
    
    print(f"gamma:{gamma}, cnt:{cnt}") 


gamma:3.5, cnt:758
gamma:5.5, cnt:523
gamma:10.5, cnt:386
gamma:20.5, cnt:351
gamma:30.5, cnt:345
gamma:40.5, cnt:344
gamma:50.5, cnt:344
gamma:60.5, cnt:344
gamma:70.5, cnt:344
gamma:80.5, cnt:344
gamma:90.5, cnt:344
gamma:100.5, cnt:344
gamma:110.5, cnt:344
gamma:120.5, cnt:344
gamma:130.5, cnt:344
gamma:140.5, cnt:344
gamma:150.5, cnt:344
gamma:160.5, cnt:344
gamma:170.5, cnt:344
gamma:180.5, cnt:344
gamma:190.5, cnt:344
gamma:200.5, cnt:344
