In [2]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler, RobustScaler
from sklearn.ensemble import IsolationForest 
from sklearn import svm  
from sklearn.mixture import GaussianMixture 
from sklearn.isotonic import IsotonicRegression
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
from transformers import *
import time 
import datetime
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt



In [6]:
# NOTE(review): `train` is not assigned anywhere above this cell in the visible
# notebook (it is loaded by the read_csv cell that follows), so in file order
# this cell raises NameError on a fresh kernel.
train.columns

Index(['air_inflow', 'air_end_temp', 'out_pressure', 'motor_current',
       'motor_rpm', 'motor_temp', 'motor_vibe', 'type'],
      dtype='object')

In [3]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="train_data.csv")
test = pd.read_csv(filepath_or_buffer="test_data.csv")

In [3]:
# Randomly permute all training rows with a fixed seed, then renumber the
# index from 0 so that positions and labels agree again.
train = (
    train
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# Show the column names of the loaded training frame.
train.columns

Index(['air_inflow', 'air_end_temp', 'out_pressure', 'motor_current',
       'motor_rpm', 'motor_temp', 'motor_vibe', 'type'],
      dtype='object')

In [5]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,3.04,60.24,0.7,39.34,3219.0,75.77,3.79,5
1,3.15,61.63,0.7,40.7,3330.0,77.0,3.85,7


In [6]:
# --- Feature engineering on the training frame -------------------------------
# Every derived column is computed from the raw columns only, and the columns
# are appended in a fixed order (the scaler later consumes them positionally).

# log1p-transformed copies of five raw readings.
for raw_col in ("air_inflow", "air_end_temp", "motor_current", "motor_rpm", "motor_vibe"):
    train[f"log_{raw_col}"] = np.log1p(train[raw_col].values)

# Ratios, differences and products between pairs of raw readings.
train["specific_power"] = train["motor_current"].div(train["air_inflow"])
train["compression_ratio"] = train["out_pressure"].div(train["air_end_temp"])
train["motor_efficiency"] = train["motor_rpm"].div(train["motor_current"])
train["temperature_difference"] = train["air_end_temp"].sub(train["motor_temp"])
train["pr_ratio"] = train["out_pressure"].div(train["motor_rpm"])
train["power_consumption"] = train["motor_current"].mul(train["motor_rpm"])
train["vr_ratio"] = train["motor_vibe"].div(train["motor_rpm"])
train["fp_ratio"] = train["air_inflow"].div(train["out_pressure"])
train["tm_ratio"] = train["air_end_temp"].div(train["motor_temp"])
train["motor_load_factor"] = train["motor_current"].div(train["motor_temp"])
train["pv_ratio"] = train["out_pressure"].div(train["motor_vibe"])
train["temperature_change_rate"] = (
    train["air_end_temp"].sub(train["motor_temp"]).div(train["air_inflow"])
)

# get horsepower features
# Lookup table from the `type` code to its horsepower value; a type code that
# is missing from the table raises KeyError.
hp = {
    0: 30,
    1: 20,
    2: 10,
    3: 50,
    4: 30,
    5: 30,
    6: 30,
    7: 30,
}
types = train["type"].values
hp_values = [hp[type_code] for type_code in types]

train["hp"] = hp_values
train["horsepower_efficiency"] = train["motor_current"].div(train["hp"])

In [7]:
# --- Feature engineering on the test frame -----------------------------------
# Mirrors the training-frame cell: same derived columns, same order, so the
# fitted scaler sees the test columns in the positions it was fitted on.

# log1p-transformed copies of five raw readings.
for raw_col in ("air_inflow", "air_end_temp", "motor_current", "motor_rpm", "motor_vibe"):
    test[f"log_{raw_col}"] = np.log1p(test[raw_col].values)

# Ratios, differences and products between pairs of raw readings.
test["specific_power"] = test["motor_current"].div(test["air_inflow"])
test["compression_ratio"] = test["out_pressure"].div(test["air_end_temp"])
test["motor_efficiency"] = test["motor_rpm"].div(test["motor_current"])
test["temperature_difference"] = test["air_end_temp"].sub(test["motor_temp"])
test["pr_ratio"] = test["out_pressure"].div(test["motor_rpm"])
test["power_consumption"] = test["motor_current"].mul(test["motor_rpm"])
test["vr_ratio"] = test["motor_vibe"].div(test["motor_rpm"])
test["fp_ratio"] = test["air_inflow"].div(test["out_pressure"])
test["tm_ratio"] = test["air_end_temp"].div(test["motor_temp"])
test["motor_load_factor"] = test["motor_current"].div(test["motor_temp"])
test["pv_ratio"] = test["out_pressure"].div(test["motor_vibe"])
test["temperature_change_rate"] = (
    test["air_end_temp"].sub(test["motor_temp"]).div(test["air_inflow"])
)

# Horsepower per row, looked up in the `hp` table defined in the training
# feature cell; a type code that is missing from the table raises KeyError.
test_types = test["type"].values
test_hp_values = [hp[type_code] for type_code in test_types]

test["hp"] = test_hp_values
test["horsepower_efficiency"] = test["motor_current"].div(test["hp"])

In [10]:
# Standardise every column of the training frame (raw readings, engineered
# features, and the `type` / `hp` columns alike). The fitted scaler is kept
# so the test frame can be transformed with the same statistics later.
scaler = StandardScaler()
scaler.fit(train)
X_train = scaler.transform(train)

In [11]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input -> hidden -> latent -> hidden -> input.

    Both halves are ``Linear -> Tanh -> Linear`` stacks held in ``nn.Sequential``
    containers named ``encoder`` and ``decoder``. That layout (and therefore the
    ``state_dict`` key names) is the same as before, so existing checkpoints
    trained with the default widths still load.

    Args:
        input_dim: Number of features per input row; also the output width.
        hidden_dim: Width of the intermediate layer in the encoder and the
            decoder. Defaults to 10, the previously hard-coded value.
        latent_dim: Width of the bottleneck produced by the encoder.
            Defaults to 5, the previously hard-coded value.
    """

    def __init__(self, input_dim, hidden_dim=10, latent_dim=5):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        # Layers are created in the same order as before so that, for a given
        # RNG state, the initial weights are unchanged.
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.Tanh(),
            nn.Linear(self.hidden_dim, self.latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.Tanh(),
            nn.Linear(self.hidden_dim, self.input_dim),
        )

    def forward(self, x):
        """Encode ``x`` to the latent width and return its reconstruction.

        The returned tensor has the same trailing dimension (``input_dim``)
        as ``x``.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [12]:
# Mini-batch size used when building the training loader.
batch_size = 64
# Convert the scaled training matrix to a float32 tensor.
# NOTE(review): the name X_train is reused, so running this cell a second time
# passes a Tensor (not an array) to torch.tensor.
X_train = torch.tensor(X_train).to(torch.float32)
X_train.shape

torch.Size([2463, 27])

In [13]:
# Wrap the training tensor in a dataset and draw mini-batches in random order.
train_data = TensorDataset(X_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

In [16]:
# Train the autoencoder to reconstruct the scaled training rows and save the
# weights of the epoch with the lowest mean training loss to "FE_CLEAN.pt".

# Seed torch before the model is built so weight initialisation and batch
# shuffling repeat between runs (GPU kernels may still add small differences).
torch.manual_seed(42)

# Use the GPU when one is present; otherwise run on the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[1]
# NOTE(review): the next four settings are not read anywhere in this cell.
d_model = 12
nhead = 4
num_layers = 2
dim_feedforward = 32

model = AutoEncoder(input_dim)
model = model.to(device)
# torch.optim.AdamW replaces the AdamW that the `transformers` star import
# used to supply (deprecated there in favour of the PyTorch optimizer).
# eps and weight_decay are set to what are believed to be that optimizer's
# defaults - TODO confirm against the transformers version originally used.
optimizer = optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)
loss_func = nn.MSELoss()
best_loss = float("inf")
best_epoch = -1
epochs = 4000
# One scheduler step is taken per optimizer step, so the learning rate decays
# linearly to zero over the whole run (no warm-up).
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

model.zero_grad()
for epoch in tqdm(range(epochs), position=0, leave=True, desc="Epochs", total=epochs):
    train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_inputs = batch[0]
        decoded = model(b_inputs)
        # Reconstruction objective: the input is its own target.
        loss = loss_func(decoded, b_inputs)
        train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
    avg_train_loss = train_loss / len(train_dataloader)
    # Checkpoint on every improvement of the mean training loss; there is no
    # validation split in this cell, so "best" means best on the training data.
    if avg_train_loss < best_loss:
        best_loss = avg_train_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "FE_CLEAN.pt")

    if epoch%200 == 0 and epoch > 0:
        print(f"current average loss : {avg_train_loss} | current best loss : {best_loss}")

print(f"best loss: {best_loss} | best epoch: {best_epoch}")

Epochs:   0%|          | 0/4000 [00:00<?, ?it/s]

current average loss : 0.0018335529916489928 | current best loss : 0.0018335529916489928
current average loss : 0.0008782510644195076 | current best loss : 0.0008722459807848701
current average loss : 0.00040704290791510197 | current best loss : 0.00039637133811946766
current average loss : 0.00031946823582984507 | current best loss : 0.00031472397229383484
current average loss : 0.0002981141823021552 | current best loss : 0.0002823344339282276
current average loss : 0.00026359785130271356 | current best loss : 0.00026359785130271356
current average loss : 0.0002673309649346778 | current best loss : 0.00024997892204481055
current average loss : 0.0002530474396338925 | current best loss : 0.00024007402326111705
current average loss : 0.00023695571163000586 | current best loss : 0.0002336747883162342
current average loss : 0.0002280120228160507 | current best loss : 0.0002280120228160507
current average loss : 0.00022574130278558304 | current best loss : 0.00022339784444715732
current av

In [17]:
# Scale the test frame with the statistics fitted on the training frame and
# expose it as a sequential loader that yields one row per batch.
X_test = scaler.transform(test)
X_test = torch.tensor(X_test).float()
test_data = TensorDataset(X_test)
test_sampler = SequentialSampler(test_data)
# The batch size is passed literally; the global `batch_size` that the
# training loader cell reads is deliberately left untouched, so re-running
# that cell afterwards still builds 64-row batches.
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)

In [18]:
# Rebuild the network and restore the best weights saved during training.
# Use the GPU when one is present; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_test.shape[1]
best_model = AutoEncoder(input_dim)
# map_location remaps tensors saved from a CUDA model so the checkpoint also
# loads on a machine without a GPU.
checkpoint = torch.load("FE_CLEAN.pt", map_location=device)
print(best_model.load_state_dict(checkpoint))
# The restored model is only used for scoring, so put it in inference mode.
best_model.eval()
best_model.to(device)

<All keys matched successfully>


AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=27, out_features=10, bias=True)
    (1): Tanh()
    (2): Linear(in_features=10, out_features=5, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=10, bias=True)
    (1): Tanh()
    (2): Linear(in_features=10, out_features=27, bias=True)
  )
)

In [22]:
# Reconstruction error of every training row under the restored autoencoder.
# The scoring loader gets its own names so the shuffled mini-batch
# `train_dataloader` used for training is not overwritten by this cell.
train_eval_data = TensorDataset(X_train)
train_eval_sampler = SequentialSampler(train_eval_data)
train_eval_dataloader = DataLoader(train_eval_data, sampler=train_eval_sampler, batch_size=1)

# Inference mode; set here so this cell does not rely on another cell for it.
best_model.eval()

train_mse_scores = []

with torch.no_grad():
    # tqdm wraps the loader itself (not enumerate) so the bar knows the total.
    for step, batch in enumerate(tqdm(train_eval_dataloader, position=0, leave=True)):
        batch = tuple(t.to(device) for t in batch)
        b_inputs = batch[0]
        decoded = best_model(b_inputs)
        # One row per batch, so the batch MSE is that row's score.
        loss = loss_func(decoded, b_inputs)
        train_mse_scores.append(loss.item())

0it [00:00, ?it/s]

In [19]:
# Reconstruction error of every test row under the restored autoencoder.
loss_func = nn.MSELoss()

# Inference mode for the scoring pass.
best_model.eval()

test_mse_scores = []

with torch.no_grad():
    # tqdm wraps the loader itself (not enumerate) so the bar knows the total.
    for step, batch in enumerate(tqdm(test_dataloader, position=0, leave=True)):
        batch = tuple(t.to(device) for t in batch)
        b_inputs = batch[0]
        decoded = best_model(b_inputs)
        # The loader yields one row per batch, so the batch MSE is that row's score.
        loss = loss_func(decoded, b_inputs)
        test_mse_scores.append(loss.item())

0it [00:00, ?it/s]

In [23]:
# Training scores first, test scores after them, in one flat list.
full_mse_scores = [*train_mse_scores, *test_mse_scores]
len(full_mse_scores)

9852

In [25]:
# Number of training scores, i.e. the offset at which test scores start in
# the concatenated score list.
len(train_mse_scores)

2463

In [24]:
def mad_score(points):
    """Return MAD-based scores: 0.6745 * |x - median| / MAD.

    MAD is the median of the absolute deviations from the median. The 0.6745
    factor is the usual scaling constant of the "modified z-score".

    Args:
        points: Sequence or array of numeric values (a plain list is accepted).

    Returns:
        numpy.ndarray of floats with one score per input value.
        * Empty input returns an empty array.
        * If MAD is zero (at least half of the values equal the median), the
          division is skipped: values equal to the median score 0.0 and all
          others score ``inf``. This avoids the NaN that 0/0 would produce.
    """
    values = np.asarray(points, dtype=float)
    if values.size == 0:
        return values

    m = np.median(values)
    ad = np.abs(values - m)
    mad = np.median(ad)
    if mad == 0:
        # Degenerate spread: anything off the median is infinitely far away
        # in MAD units, anything on it is at distance zero.
        return np.where(ad == 0, 0.0, np.inf)
    return 0.6745 * ad / mad

# Score the combined train + test reconstruction errors, so the median and MAD
# are taken over both sets together.
z_scores = mad_score(full_mse_scores)

In [28]:
# Sweep the outlier threshold and report how many test rows each value flags.
gammas = [3.5, 5.5, 10.5, 20.5, 30.5, 40.5, 50.5, 60.5, 70.5, 80.5, 90.5, 100.5, 110.5, 120.5, 130.5, 140.5, 150.5, 160.5, 170.5, 180.5, 190.5, 200.5, 210.5, 220.5, 230.5, 240.5, 300.5, 1000.5]

# z_scores holds the training rows first and the test rows after them, so the
# test part starts at the number of training scores (no hard-coded row count).
n_train_scores = len(train_mse_scores)

# The submission template does not depend on gamma; read it once.
submission = pd.read_csv("answer_sample.csv")

for gamma in gammas:
    # 1 = flagged as outlier, 0 = not flagged; test rows only.
    outliers = (z_scores > gamma).astype(int)[n_train_scores:]
    submission["label"] = outliers

    cnt = int(outliers.sum())

    # Only the gamma == 300.5 labelling is written to disk.
    if gamma == 300.5:
        submission.to_csv(f"FEAE_{gamma}_cnt_{cnt}.csv",index=False)

    print(f"gamma:{gamma}, cnt:{cnt}")


gamma:3.5, cnt:1442
gamma:5.5, cnt:933
gamma:10.5, cnt:462
gamma:20.5, cnt:355
gamma:30.5, cnt:344
gamma:40.5, cnt:344
gamma:50.5, cnt:344
gamma:60.5, cnt:344
gamma:70.5, cnt:344
gamma:80.5, cnt:344
gamma:90.5, cnt:344
gamma:100.5, cnt:344
gamma:110.5, cnt:344
gamma:120.5, cnt:344
gamma:130.5, cnt:344
gamma:140.5, cnt:344
gamma:150.5, cnt:344
gamma:160.5, cnt:344
gamma:170.5, cnt:344
gamma:180.5, cnt:344
gamma:190.5, cnt:344
gamma:200.5, cnt:344
gamma:210.5, cnt:344
gamma:220.5, cnt:344
gamma:230.5, cnt:344
gamma:240.5, cnt:344
gamma:300.5, cnt:344
gamma:1000.5, cnt:340


In [17]:
# Reload the saved submission and collect the positions of the rows labelled 1.
# NOTE(review): the file name embeds the outlier count (344) of one particular
# run; a run that flags a different number of rows writes a different file.
prev = pd.read_csv("FEAE_300.5_cnt_344.csv")
prev_labels = prev["label"].values
labels = [row_pos for row_pos, flag in enumerate(prev_labels) if flag == 1]

In [20]:
# Select, by position, the test rows that were labelled as outliers.
outlier_df = test.iloc[labels] 


In [21]:
# Print how many flagged rows fall under each `type` code 0..7, one per line.
for type_code in range(8):
    rows_of_type = outlier_df[outlier_df["type"] == type_code]
    print(len(rows_of_type))

143
0
27
26
13
30
101
4
