In [None]:
!pip install torchsummary
!pip install mlencoders

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import tqdm.auto
from mlencoders.target_encoder import TargetEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary

warnings.filterwarnings("ignore")

# Загрузка данных

In [None]:
# Location of the auction dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'

# `sep=','` is the read_csv default and the `verbose` keyword is deprecated in
# recent pandas releases, so neither is passed.
raw_data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first two rows of the freshly loaded frame.
raw_data.head(2)

# Целевая переменная 

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or 0 when the denominator ``d`` is falsy (0, None, ...)."""
    return n / d if d else 0

# Target variable: CPM = total_revenue * 100 / measurable_impressions * 1000.
# Computed on whole columns instead of a Python-level row-wise `apply`; rows
# whose impression count is exactly 0 get a CPM of 0, as before.
impressions = raw_data['measurable_impressions']
raw_data['CPM'] = (
    ((raw_data['total_revenue'] * 100) / impressions * 1000)
    .where(impressions != 0, 0)
)

# Удаление ненужных признаков

In [None]:
def features_for_drop(df, cols):
    """Find columns that carry fewer than two distinct values.

    Parameters
    ----------
    df : DataFrame to inspect.
    cols : iterable of column labels of ``df`` to check.

    Returns
    -------
    dict mapping each column with fewer than 2 unique values to that count.
    """
    distinct_counts = {col: len(df[col].unique()) for col in cols}
    return {col: count for col, count in distinct_counts.items() if count < 2}

# List every column of the raw frame that has fewer than two distinct values.
features_for_drop(raw_data, raw_data.columns)

In [None]:
# 'total_revenue' and 'measurable_impressions' are the inputs CPM was computed
# from, so they must not remain as features. The other two columns are
# presumably the constant ones reported by features_for_drop -- verify.
columns_to_remove = [
    'revenue_share_percent',
    'integration_type_id',
    'total_revenue',
    'measurable_impressions',
]
clean_data = raw_data.drop(columns=columns_to_remove)

# Деление выборки на тренировочную и тестовую

In [None]:
# Time-based split on the 'date' column: 1-21 June for training, 22-30 June for test.
# NOTE(review): 'date' is compared against string bounds -- assumes ISO-formatted strings.
X_train = clean_data[clean_data['date'].between('2019-06-01 00:00:00', '2019-06-21 00:00:00')]
X_test = clean_data[clean_data['date'].between('2019-06-22 00:00:00', '2019-06-30 00:00:00')]

# 95th-percentile CPM of each period, used below as the outlier cut-off.
X_train_95_per = X_train['CPM'].quantile(0.95)
X_test_95_per = X_test['CPM'].quantile(0.95)

# NOTE(review): the test cut-off is derived from the test period itself, and only
# the test set is additionally restricted to CPM >= 0 -- confirm both are intended.
X_test = X_test[(X_test['CPM'] >= 0) & (X_test['CPM'] < X_test_95_per)]

# Hold out 10% of the training period for validation (fixed seed).
X_valid = X_train.sample(frac=0.1, random_state=1234, replace=False)
# Remove the held-out rows by index label; unlike a set difference this keeps
# the remaining rows in their original order.
X_train = X_train.drop(index=X_valid.index)

# Trim CPM outliers from both parts with the training-period threshold.
X_train = X_train[X_train['CPM'] < X_train_95_per]
X_valid = X_valid[X_valid['CPM'] < X_train_95_per]

# Кодирование категориальных признаков 

In [None]:
# Columns handed to the target encoder.
# NOTE(review): 'total_impressions' and 'viewable_impressions' look like counts
# rather than identifiers -- confirm that encoding them is intended.
target_encoded_cols = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'total_impressions',
    'viewable_impressions',
]

enc = TargetEncoder(cols=target_encoded_cols)

In [None]:
# Fit the encoder on the training rows with their CPM values and encode them.
X_train_enc = enc.fit_transform(X_train, X_train['CPM'])
# Validation and test rows are only transformed with the already fitted encoder.
X_valid_enc = enc.transform(X_valid)
X_test_enc = enc.transform(X_test)

In [None]:
# Preview the first two encoded training rows.
X_train_enc.head(2)

In [None]:
def _features_and_target(frame):
    """Return ``(features, target)`` arrays: everything but CPM/date, and CPM."""
    features = frame.drop(['CPM', 'date'], axis=1).values
    target = frame['CPM'].values
    return features, target


X_train, y_train = _features_and_target(X_train_enc)
X_val, y_val = _features_and_target(X_valid_enc)
X_test, y_test = _features_and_target(X_test_enc)

# Масштабирование признаков

In [None]:
# Learn the min/max scaling on the training features only, then apply the same
# fitted scaler to all three splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_sc, X_val_sc, X_test_sc = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Создание нейросетевой модели (полносвязная сеть)

In [None]:
# Tensors and the model are placed on this device; the notebook runs on CPU.
device = torch.device('cpu')
device

In [None]:
class RegressionDataset(Dataset):
    """Pairs a feature container with a target container of the same length.

    Parameters
    ----------
    X_data : indexable, sized container of per-sample features.
    y_data : indexable, sized container of per-sample targets.

    Raises
    ------
    ValueError
        If ``X_data`` and ``y_data`` differ in length, which would otherwise
        only surface later as an IndexError or as silently ignored targets.
    """

    def __init__(self, X_data, y_data):
        if len(X_data) != len(y_data):
            raise ValueError(
                f"X_data and y_data must have the same length, "
                f"got {len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

    
def _float_tensor(array):
    """Wrap a NumPy array as a float32 torch tensor."""
    return torch.from_numpy(array).float()


train_dataset = RegressionDataset(_float_tensor(X_train_sc), _float_tensor(y_train))
val_dataset = RegressionDataset(_float_tensor(X_val_sc), _float_tensor(y_val))
test_dataset = RegressionDataset(_float_tensor(X_test_sc), _float_tensor(y_test))

In [None]:
# Training hyperparameters.
EPOCHS = 30
BATCH_SIZE = 1024
LEARNING_RATE = 0.001

# Seed torch's global RNG so weight initialisation and batch shuffling are
# repeatable across runs; same value as the random_state of the validation sample.
SEED = 1234
torch.manual_seed(SEED)

# Width of the network input: one unit per scaled feature column.
NUM_FEATURES = X_train_sc.shape[1]

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation gains nothing from shuffling, and a fixed order keeps the
# per-epoch validation metric deterministic.
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Test loader yields one sample per batch, in dataset order.
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [None]:
class MultipleRegression(nn.Module):
    """Fully connected regressor: num_features -> 30 -> 15 -> 10 -> 1.

    Each hidden linear layer is followed by batch normalisation and ReLU;
    the output layer is a plain linear layer producing one value per sample.
    """

    def __init__(self, num_features):
        super().__init__()

        # Hidden stack; attribute names and creation order are kept so that
        # state-dict keys and seeded initialisation stay the same.
        self.layer_1 = nn.Linear(num_features, 30)
        self.bn1 = nn.BatchNorm1d(num_features=30)
        self.layer_2 = nn.Linear(30, 15)
        self.bn2 = nn.BatchNorm1d(num_features=15)
        self.layer_3 = nn.Linear(15, 10)
        self.bn3 = nn.BatchNorm1d(num_features=10)
        # Regression head and the shared activation.
        self.layer_out = nn.Linear(10, 1)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Run ``inputs`` through the three hidden stages and the output layer."""
        hidden_stages = (
            (self.layer_1, self.bn1),
            (self.layer_2, self.bn2),
            (self.layer_3, self.bn3),
        )
        activations = inputs
        for linear, batch_norm in hidden_stages:
            activations = self.relu(batch_norm(linear(activations)))
        return self.layer_out(activations)

In [None]:
# Build the network for the scaled feature width and move it to the chosen
# device; the trailing expression displays the module structure.
model = MultipleRegression(num_features=NUM_FEATURES)
model.to(device)

In [None]:
# Mean-squared-error objective, minimised with Adam over all model parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# torchsummary report of the model for inputs of shape (NUM_FEATURES,), built on CPU.
summary(model=model, input_size=(NUM_FEATURES, ), device='cpu')

In [None]:
# Per-epoch loss history, one list per data split.
loss_stats = dict(train=[], val=[])

In [None]:
print("Start training")

# `tqdm.tqdm_notebook` is deprecated; `tqdm.auto.tqdm` uses the notebook widget
# when it is available and falls back to the console bar otherwise.
for e in tqdm.auto.tqdm(range(1, EPOCHS + 1)):

    # TRAINING
    model.train()
    train_sq_err, train_seen = 0.0, 0
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        y_train_pred = model(X_train_batch)
        # Targets are 1-D; add a trailing axis to match the model output.
        train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
        train_loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so a smaller final batch
        # does not skew the epoch average (assumes criterion uses mean
        # reduction, the nn.MSELoss default).
        batch_len = X_train_batch.size(0)
        train_sq_err += train_loss.item() * batch_len
        train_seen += batch_len

    # VALIDATION
    model.eval()
    val_sq_err, val_seen = 0.0, 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = model(X_val_batch)
            val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
            batch_len = X_val_batch.size(0)
            val_sq_err += val_loss.item() * batch_len
            val_seen += batch_len

    # Per-sample MSE over the whole epoch for each split.
    train_mse = train_sq_err / train_seen
    val_mse = val_sq_err / val_seen
    loss_stats['train'].append(train_mse)
    loss_stats['val'].append(val_mse)

    print(f'Epoch {e:03}: | Train MSE: {train_mse:.5f} | Val MSE: {val_mse:.5f}')

print("Finish training")

# Оценка качества модели на тестовой выборке

In [None]:
# Collect the model output for every test batch.
y_pred_list = []

with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())

# Stack the per-batch arrays and flatten to one prediction per sample. Unlike
# squeezing each batch separately, this gives a flat list for any batch size.
y_pred_list = np.concatenate(y_pred_list, axis=0).ravel().tolist() if y_pred_list else []

In [None]:
# Mean squared error between the true test CPM values and the predictions.
test_mse = mean_squared_error(y_test, y_pred_list)
print('MSE on test data = {}'.format(test_mse))