1. Для датасета животных обучить MLP.
2. Использовать Custom Dataset, Sampler, collate_fn
3. Сделать различную предобработку фичей
4. Подключить для логирования tensorboard и/или mlflow
5. Не забыть разделить выборку на train и valid
6. Получить точность не ниже 65%.

In [145]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from tqdm.notebook import tqdm

import torch

print(torch.__version__)

import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

from torch.utils.data import DataLoader, Dataset, Sampler
from torch.utils.data.dataloader import default_collate
from torch.utils.tensorboard import SummaryWriter
from pytorch_lightning.metrics import Accuracy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

1.6.0+cpu


In [130]:
class CustomDataset(Dataset):
    # Конструктор, где считаем датасет
    def __init__(self, X, target):

        weekday_columns = ['Weekday_0', 'Weekday_1', 'Weekday_2',
                           'Weekday_3', 'Weekday_4', 'Weekday_5', 'Weekday_6']
        weekdays = np.argmax(X[weekday_columns].values, axis=1)

        X.drop(weekday_columns, axis=1, inplace=True)

        
        X['Weekday_cos'] = np.cos(2 * np.pi / 7.) * weekdays
        X['Weekday_sin'] = np.sin(2 * np.pi / 7.) * weekdays
        X['Weekday_tan'] = np.tan(2 * np.pi / 7.) * weekdays

        X['Hour_cos'] = np.cos(2 * np.pi / 24.) * X['Hour'].values
        X['Hour_sin'] = np.sin(2 * np.pi / 24.) * X['Hour'].values
        X['Hour_tan'] = np.tan(2 * np.pi / 24.) * X['Hour'].values

        X['Month_cos'] = np.cos(2 * np.pi / 12.) * X['Month'].values
        X['Month_sin'] = np.sin(2 * np.pi / 12.) * X['Month'].values
        X['Month_tan'] = np.tan(2 * np.pi / 12.) * X['Month'].values

        X['Gender'] = np.argmax(X[['Sex_Female', 'Sex_Male', 'Sex_Unknown']].values, axis=1)
        X['SexStatus'] = np.argmax(X[['SexStatus_Flawed', 'SexStatus_Intact', 'SexStatus_Unknown']].values, axis=1)
        X['Weekday'] = weekdays
        X['Breed'] = np.argmax(X[['Breed_Chihuahua Shorthair Mix', 'Breed_Domestic Medium Hair Mix',
                                'Breed_Domestic Shorthair Mix', 'Breed_German Shepherd Mix', 'Breed_Labrador Retriever Mix',
                                 'Breed_Pit Bull Mix', 'Breed_Rare']].values, axis=1)
        X['Hair'] = np.argmax(X[['Shorthair', 'Longhair']].values, axis=1)

        X.drop(['Sex_Female', 'Sex_Male', 'Sex_Unknown', 'SexStatus_Flawed', 'SexStatus_Intact', 'SexStatus_Unknown',
               'Breed_Chihuahua Shorthair Mix', 'Breed_Domestic Medium Hair Mix', 'Breed_Domestic Shorthair Mix',
                'Breed_German Shepherd Mix', 'Breed_Labrador Retriever Mix', 'Breed_Pit Bull Mix', 'Shorthair', 'Longhair'], 
               axis=1, inplace=True)

        target = target.iloc[:, :].values
        target[target == 'Died'] = 'Euthanasia'

        le = LabelEncoder()
        self.y = le.fit_transform(target)

        self.X = X.values

        self.columns = X.columns.values

        self.embedding_columns = ['Gender', 'SexStatus', 'Weekday', 'Breed', 'Hair']
        self.nrof_emb_categories = 22
        self.numeric_columns = ['IsDog', 'Age', 'HasName', 'NameLength', 'NameFreq', 'MixColor', 'ColorFreqAsIs',
                                'ColorFreqBase', 'TabbyColor', 'MixBreed', 'Domestic', 
                                'Year', 'Day', 'Weekday_cos', 'Weekday_sin', 'Weekday_tan',
                                'Hour_cos', 'Hour_sin', 'Hour_tan', 'Month_cos', 'Month_sin', 'Month_tan']

        return

    def __len__(self):
        return len(self.X)

    # Переопределяем метод,
    # который достает по индексу наблюдение из датасет
    def __getitem__(self, idx):

        row = self.X[idx, :]

        row = {col: torch.tensor(row[i]) for i, col in enumerate(self.columns)}

        return row, self.y[idx]

In [64]:
class CustomSampler(Sampler):

    # Конструктор, где инициализируем индексы элементов
    def __init__(self, data):
        self.data_indices = np.arange(len(data))

        shuffled_indices = np.random.permutation(len(self.data_indices))

        self.data_indices = np.ascontiguousarray(self.data_indices)[shuffled_indices]

        return

    def __len__(self):
        return len(self.data_indices)

    # Возращает итератор,
    # который будет возвращать индексы из перемешанного датасета
    def __iter__(self):
        return iter(self.data_indices)

In [None]:
'Gender': tensor(1., dtype=torch.float64), 
'SexStatus': tensor(0., dtype=torch.float64),
'Weekday': tensor(5., dtype=torch.float64),
'Breed': tensor(2., dtype=torch.float64),
'Hair': tensor(0., dtype=torch.float64)

In [140]:
def collate(batch):
    for i in range(len(batch)):
        data = batch[i][0]
        data['SexStatus'] += 3
        data['Weekday'] += 6
        data['Breed'] += 13
        data['Hair'] += 20
        batch[i] = (data, batch[i][1])
    return default_collate(batch)

In [71]:
def create_data_loader(train_dataset, valid_dataset, test_dataset, 
                       train_sampler, valid_sampler, test_sampler):
    
    train_loader = DataLoader(dataset=train_dataset, sampler=train_sampler,
                              batch_size=BATCH_SIZE, collate_fn=collate,
                              shuffle=False)
    
    valid_loader = DataLoader(dataset=valid_dataset, sampler=valid_sampler,
                              batch_size=BATCH_SIZE, collate_fn=collate,
                              shuffle=False)

    test_loader = DataLoader(dataset=test_dataset, sampler=test_sampler,
                             batch_size=BATCH_SIZE, collate_fn=collate,
                             shuffle=False)

    return train_loader, valid_loader, test_loader

In [133]:
class MLPNet(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, nrof_cat, emb_dim,
                 emb_columns, numeric_columns):
        super(MLPNet, self).__init__()
        self.emb_columns = emb_columns
        self.numeric_columns = numeric_columns

        self.emb_layer = torch.nn.Embedding(nrof_cat, emb_dim)

        self.feature_bn = torch.nn.BatchNorm1d(input_size)

        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear1.apply(self.init_weights)
        self.bn1 = torch.nn.BatchNorm1d(hidden_size)

        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear2.apply(self.init_weights)
        self.bn2 = torch.nn.BatchNorm1d(hidden_size)

        self.linear3 = torch.nn.Linear(hidden_size, output_size)

    def init_weights(self, m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform(m.weight)
            # m.bias.data.fill_(0.001)

    def forward(self, x):
        emb_output = self.emb_layer(torch.tensor(pd.DataFrame(x)[self.emb_columns].values, dtype=torch.int64))
        emb_output = emb_output.permute((0, 2, 1)).mean(-1)
        numeric_feats = torch.tensor(pd.DataFrame(x)[self.numeric_columns].values, dtype=torch.float32)
        
        concat_input = torch.cat([numeric_feats, emb_output], dim=1)
        output = self.feature_bn(concat_input)

        output = self.linear1(output)
        output = self.bn1(output)
        output = torch.relu(output)

        output = self.linear2(output)
        output = self.bn2(output)
        output = torch.relu(output)

        output = self.linear3(output)
        predictions = torch.softmax(output, dim=1)

        return predictions

In [83]:
train_writer = SummaryWriter('./logs/train')
valid_writer = SummaryWriter('./logs/valid')

In [156]:
def pipeline(HIDDEN_SIZE, OUTPUT_SIZE, LEARNING_RATE, EPOCHS,
            BATCH_SIZE, EMBEDDING_SIZE):
    ###
    def train_inference():
    
        train_writer.add_text('LEARNING_RATE', str(LEARNING_RATE))
        train_writer.add_text('INPUT_SIZE', str(INPUT_SIZE))
        train_writer.add_text('HIDDEN_SIZE', str(HIDDEN_SIZE))
        train_writer.add_text('OUTPUT_SIZE', str(OUTPUT_SIZE))
        train_writer.add_text('EMBEDDING_SIZE', str(EMBEDDING_SIZE))
        train_writer.add_text('BATCH_SIZE', str(BATCH_SIZE))
        train_writer.add_text('EPOCHS', str(EPOCHS))
        
        valid_writer.add_text('LEARNING_RATE', str(LEARNING_RATE))
        valid_writer.add_text('INPUT_SIZE', str(INPUT_SIZE))
        valid_writer.add_text('HIDDEN_SIZE', str(HIDDEN_SIZE))
        valid_writer.add_text('OUTPUT_SIZE', str(OUTPUT_SIZE))
        valid_writer.add_text('EMBEDDING_SIZE', str(EMBEDDING_SIZE))
        valid_writer.add_text('BATCH_SIZE', str(BATCH_SIZE))
        valid_writer.add_text('EPOCHS', str(EPOCHS))
        
        step = 0
        for epoch in tqdm(range(EPOCHS)):
            model.train()

            for features, label in train_loader:
                # Reset gradients
                optimizer.zero_grad()
                
                label = label.long()
                output = model(features)
                # Calculate error and backpropagate
                loss = criterion(output, label)
                loss.backward()
                acc = accuracy(output, label).item()
                
                train_writer.add_scalar('CrossEntropyLoss', loss, step)
                train_writer.add_scalar('Accuracy', acc, step)
                train_writer.add_histogram('hidden_layer_1', model.linear1.weight.data, step)
                train_writer.add_histogram('hidden_layer_2', model.linear2.weight.data, step)
                train_writer.add_histogram('hidden_layer_3', model.linear3.weight.data, step)

                # Update weights with gradients
                optimizer.step()
                
            model.eval()

            for features, label in valid_loader:
                
                label = label.long()
                output = model(features)
                # Calculate error and backpropagate
                loss = criterion(output, label)
                acc = accuracy(output, label).item()
                
                valid_writer.add_scalar('CrossEntropyLoss', loss, step)
                valid_writer.add_scalar('Accuracy', acc, step)

            step += 1

            if step % 20 == 0:
                print('EPOCH %d STEP %d : valid_loss: %f valid_acc: %f' %
                      (epoch, step, loss.item(), acc))
    
    ###
    def test_inference():
        
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():

            for features, label in tqdm(test_loader):

                output = model(features)
                output = output.argmax(dim=1)
                
                total += len(label)
                correct += (output == label).sum().item()
                
        return "\n Accuracy: " + str(correct / total * 100)

                    
    ###
    X = pd.read_csv(r"C:\Users\Redmi\Documents\GitHub\SiriusDL\week06\data\X_cat.csv", sep='\t', index_col=0)
    target = pd.read_csv(r"C:\Users\Redmi\Documents\GitHub\SiriusDL\week06\data\y_cat.csv", 
                         sep='\t', index_col=0, names=['status'])  # header=-1,
    
    X_train, X_test, y_train, y_test = train_test_split(X.values, target,
                                                        test_size=0.2, stratify=target, random_state=42)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                        test_size=0.2, stratify=y_train, random_state=42)
    
    train_dataset = CustomDataset(pd.DataFrame(X_train, columns=X.columns), y_train)
    valid_dataset = CustomDataset(pd.DataFrame(X_val, columns=X.columns), y_val)
    test_dataset = CustomDataset(pd.DataFrame(X_test, columns=X.columns), y_test)
    
    train_sampler = CustomSampler(train_dataset.X)
    valid_sampler = CustomSampler(valid_dataset.X)
    test_sampler = CustomSampler(test_dataset.X)
    
    train_loader, valid_loader, test_loader = create_data_loader(train_dataset, valid_dataset, test_dataset,
                                                                train_sampler, valid_sampler, test_sampler)
    
    INPUT_SIZE = EMBEDDING_SIZE + len(train_dataset.numeric_columns)

    model = MLPNet(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, train_dataset.nrof_emb_categories,
                   EMBEDDING_SIZE,
                   train_dataset.embedding_columns, train_dataset.numeric_columns)

    criterion = nn.CrossEntropyLoss()
    accuracy = Accuracy()

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        
    train_inference()
    return test_inference()

In [157]:
OUTPUT_SIZE = 5
LEARNING_RATE = 1e-2
EPOCHS = 100
BATCH_SIZE = 256
EMBEDDING_SIZE = 5

In [158]:
pipeline(HIDDEN_SIZE, OUTPUT_SIZE, LEARNING_RATE, EPOCHS, BATCH_SIZE, EMBEDDING_SIZE)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

EPOCH 19 STEP 20 : valid_loss: 1.210879 valid_acc: 0.701657
EPOCH 39 STEP 40 : valid_loss: 1.227641 valid_acc: 0.668508
EPOCH 59 STEP 60 : valid_loss: 1.204950 valid_acc: 0.696133
EPOCH 79 STEP 80 : valid_loss: 1.184435 valid_acc: 0.718232
EPOCH 99 STEP 100 : valid_loss: 1.182559 valid_acc: 0.723757



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




'\n Accuracy: 65.86232697343809'