# Network model (MLP)

In [None]:
% matplotlib inline
% load_ext autoreload
% autoreload 2

## Imports

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from src.data.dataset import MovieDataset
from src.utils.const import DATA_DIR, SEED

### Useful paths to data

In [None]:
# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a direct sub-folder of the repo — confirm.
ROOT_DIR = os.path.join(os.getcwd(), '..')
# Folder holding the processed data files that are loaded further down.
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

### Repeatability

In [None]:
# Seed NumPy's global RNG and the PyTorch RNGs (CPU and CUDA) with the shared project seed.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask PyTorch to use deterministic algorithms only, rather than silently
# running a nondeterministic one.
torch.use_deterministic_algorithms(True)

## Import final dataset

In [None]:
# Load the processed dataset from `final.parquet` under PROCESSED_DIR.
df = pd.read_parquet(os.path.join(PROCESSED_DIR, 'final.parquet'))

## Work with Dataset

In [None]:
# Wrap the DataFrame in the project's MovieDataset; later cells read its
# `X`, `y` and `map_columns` attributes.
dataset = MovieDataset(df)

In [None]:
# Stratified 80/20 split into (train + validation) / test, followed by a
# stratified 90/10 split of the first part into train / validation.
# `random_state` is pinned so the partitions do not depend on how much of the
# global NumPy RNG stream has already been consumed: re-running this cell, or
# running cells out of order, always yields the same indices.
samples = np.arange(len(dataset))
train_idx_tmp, test_idx = train_test_split(
    samples,
    test_size=0.2,
    stratify=dataset.y,
    random_state=SEED,
)
train_idx, val_idx = train_test_split(
    train_idx_tmp,
    test_size=0.1,
    stratify=dataset.y[train_idx_tmp],
    random_state=SEED,
)

### Scaler

In [None]:
def overview(data):
    """Scatter-plot `year` against `title_length` for each data split.

    Rows are picked from ``data.X`` with the notebook-level ``train_idx``,
    ``test_idx`` and ``val_idx`` index arrays and drawn with ``plt.scatter``
    in red, green and blue respectively.

    Args:
        data: Object exposing a 2-D indexable feature matrix ``X`` and a
            ``map_columns`` mapping from column name to column position
            (the notebook passes its ``dataset``).
    """
    # Resolve the column positions through the argument rather than the global
    # `dataset`, so the plot stays correct for whichever object is passed in.
    year_col = data.map_columns['year']
    title_col = data.map_columns['title_length']

    splits = (
        ('train', train_idx, 'r'),
        ('test', test_idx, 'g'),
        ('validation', val_idx, 'b'),
    )
    for label, idx, colour in splits:
        subset = data.X[idx]
        plt.scatter(subset[:, year_col], subset[:, title_col], c=colour, label=label)

    # Label the figure so it can be read on its own when the notebook is skimmed.
    plt.xlabel('year')
    plt.ylabel('title_length')
    plt.legend()

In [None]:
# Baseline view of the three splits, drawn before the scaling cell below runs.
overview(dataset)

In [None]:
# Column positions of the numeric features that get rescaled.
features = [
    dataset.map_columns[name]
    for name in ('year', 'title_length', 'runtime', 'rating_count')
]

# Min-max scaling is the active choice. To standardise instead, swap in
# `StandardScaler()` here (it is not imported in this notebook's import cell).
scaler = MinMaxScaler()

dataset.scale(train_idx, test_idx, val_idx, scaler, features)

In [None]:
# Same scatter again, now that `dataset.scale` has been applied.
overview(dataset)

### Normalization

In [None]:
def overview_norm(data):
    """Scatter-plot `year` against `title_length` for each split after normalization.

    This draws exactly the same train/test/validation scatter as ``overview``,
    so it delegates to that function instead of keeping a second copy of the
    plotting code that could drift out of sync.

    Args:
        data: Object exposing ``X`` and ``map_columns`` (the notebook passes
            its ``dataset``).
    """
    overview(data)

In [None]:
# Normalize through the project's `MovieDataset.normalize` with norm='max';
# all three index sets are handed over.
# NOTE(review): presumably the statistics come from the training rows only — confirm in src/data/dataset.py.
dataset.normalize(train_idx, test_idx, val_idx, norm='max')

In [None]:
# Scatter of the splits after the normalization cell above.
overview_norm(dataset)

In [3]:
% load_ext tensorboard

In [4]:
% tensorboard --logdir../ src / models / network / logs

Launching TensorBoard...

In [48]:
import itertools

from src.models.config import param_layers, param_grid_mlp

# Full Cartesian grid of layer settings and training settings. Each entry is a
# tuple whose fields follow the argument order below.
# `itertools.product` returns a one-shot iterator, so it is materialised into a
# list here. Otherwise the first `get_set_params` call would exhaust it and
# every later call (or a re-run of that cell) would silently return [].
hyper_parameters_model = list(itertools.product(
    param_layers['input_act'],
    param_layers['hidden_act'],
    param_layers['hidden_size'],
    param_layers['num_hidden_layers'],
    param_layers['dropout'],
    param_layers['batch_norm'],
    param_layers['output_fn'],
    param_grid_mlp['starting_lr'],
    param_grid_mlp['num_epochs'],
    param_grid_mlp['batch_size'],
    param_grid_mlp['optim'],
    param_grid_mlp['momentum'],
    param_grid_mlp['weight_decay'],
))

In [45]:
def get_set_params(prod, num_sets: int, selected_set: int):
    """Return one interleaved chunk of an index-tagged parameter grid.

    Every tuple in ``prod`` is prefixed with its position in the full grid,
    the tagged tuples are dealt round-robin into ``num_sets`` chunks
    (chunk ``i`` holds items ``i, i + num_sets, i + 2 * num_sets, ...``) and
    the chunk at ``selected_set`` is returned.

    Args:
        prod: Iterable of tuples, e.g. an ``itertools.product`` result. It is
            consumed once, so a one-shot iterator is exhausted afterwards.
        num_sets: Number of chunks to deal the grid into; must be at least 1.
        selected_set: Index of the chunk to return. Values at or above
            ``num_sets`` fall back to the last chunk.

    Returns:
        List of ``(grid_index, *params)`` tuples belonging to the chosen chunk.

    Raises:
        IndexError: If ``num_sets`` is smaller than 1, since no chunk exists.
    """
    # Valid chunk indices are 0 .. num_sets - 1, so an index equal to num_sets
    # is already out of range and has to be clamped as well.
    if selected_set >= num_sets:
        selected_set = num_sets - 1

    tagged_list = [(index,) + element for index, element in enumerate(prod)]
    chunks = [tagged_list[start::num_sets] for start in range(num_sets)]
    return chunks[selected_set]

In [49]:
# Show chunk 0 of the tagged grid when it is dealt into 2 interleaved chunks.
get_set_params(hyper_parameters_model, 2, 0)

[(0,
  ReLU(),
  LeakyReLU(negative_slope=0.01),
  512,
  5,
  0.2,
  False,
  None,
  0.001,
  200,
  128,
  torch.optim.adam.Adam,
  0.9,
  1e-06)]