# Model training

## Install and import dependencies

In [None]:
!pip -q install "numpy==1.26.4" "tensorflow==2.18.0"

In [2]:
import pandas as pd

## What does the dataset look like?

### These are the features we have available in the dataset
**name** - (String) The name of the furniture.  
**category** - (String) What type of furniture it is. This will be converted to multiple columns with 1 or 0 if it's that category or not (to make it easy for the AI to read).  
**price** - (Float) The current price of the product.  
**old_price** - (Float) A previous price of the product, same as the current price in many cases.  
**sellable_online** - (Bool) If the item is sellable online, this will be True.  
**other_colors** - (Bool) If the item has other color variants, this will be True.  
**width** - (Float) The width of the item, if applicable.  
**depth** - (Float) The depth of the item, if applicable.  
**height** - (Float) The height of the item, if applicable.  
**discounted** - (Int) Takes 1 if the item is discounted, 0 otherwise.  
**width_d** - (Int) Takes 1 if it had a width or 0 if it was NaN and was assigned a value through interpolation.  
**height_d** -  (Int) Takes 1 if it had a height or 0 if it was NaN and was assigned a value through interpolation.  
**depth_d** -  (Int) Takes 1 if it had a depth or 0 if it was NaN and was assigned a value through interpolation.  
**discount_amount** - (Float) The amount by which the item is discounted.  
**size** - (Float) The total size of the item, 1 in case none of the shapes were given.  

In the cell below, you can see some examples of what this looks like.

In [None]:
# Load the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/clean_OMEA_dataset.csv")
df.head(n=5)

## Time to pick your features!
Choose the features you want to include in your training.  
We need to choose input features as well as an output feature (we can't use Category as output).  

NOTE: You are not allowed to include both *discount_amount* and *old_price* in INPUT_FEATURES.  
Quiz: Why would this be a pointless exercise?

In [4]:
# Columns fed to the model as inputs.
# Comment a line out to exclude that column, or uncomment it to include it.
INPUT_FEATURES = [
    # "name", # The model can't handle freeform names
    "category",
    # "price",
    # "old_price",
    "sellable_online",
    "other_colors",
    # "short_description", # The model can't handle freeform descriptions
    "width",
    "height",
    "depth",
    "width_d",
    "height_d",
    "depth_d",
    "discounted",
    "discount_amount",
    "size",  # trailing comma keeps the list safe to extend or toggle line by line
]

# Column the model learns to predict.
PREDICT_TARGET = "price"

# Folder that receives the trained model and the preprocessing artifacts.
SAVE_FOLDER = "new_model"

## Create the training and test dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras import utils
import tensorflow as tf
import tensorflow.compat.v1 as tf1

import numpy as np
import pickle
from pathlib import Path

Let's start by processing our dataset.

In [None]:
# Preprocess the dataset: keep only the selected columns, scale the numeric ones,
# label-encode the boolean ones and one-hot encode the categorical ones.
# NOTE: this cell overwrites `df`; reload the dataset before running it a second time.
items_to_scale = ['price', 'old_price', 'width', 'height', 'depth', 'discount_amount', 'size']
items_to_encode = ['sellable_online', 'other_colors']
items_to_one_hot_encode = ['category']

# Free-text columns are always removed, even if they are listed in INPUT_FEATURES.
unsupported_columns = {"name", "short_description"}

# Everything the user asked for: the chosen inputs plus the prediction target.
selected_columns = (set(INPUT_FEATURES) | {PREDICT_TARGET}) - unsupported_columns

# Fail early, before any scaler is (re)fitted, when a selected column is absent,
# e.g. because of a typo or because this cell already transformed `df`.
missing_columns = sorted(selected_columns - set(df.columns))
if missing_columns:
    raise KeyError(
        f"Selected columns not found in the dataset: {missing_columns}. "
        "If this cell has already been run, reload the dataset before running it again."
    )

scalers = {}
encoders = {}

# Drop every column that was not selected. This also covers columns that need no
# transformation (e.g. the *_d flags), so leaving them out of INPUT_FEATURES
# really keeps them away from the model.
df = df.drop(columns=[column for column in df.columns if column not in selected_columns])

# Scale the numeric columns, keeping each fitted scaler so it can be saved later.
for column in items_to_scale:
    if column in selected_columns:
        scalers[column] = MinMaxScaler()
        df[[column]] = scalers[column].fit_transform(df[[column]])

# Encode the boolean columns as integers, keeping each fitted encoder.
for column in items_to_encode:
    if column in selected_columns:
        encoders[column] = LabelEncoder()
        df[column] = encoders[column].fit_transform(df[column])

# Expand each categorical column into one indicator column per category.
one_hot_columns = [column for column in items_to_one_hot_encode if column in selected_columns]
if one_hot_columns:
    df = pd.get_dummies(df, columns=one_hot_columns)

df.head()

## Train and test split

Now we can split our dataset into training and testing.  
The training will be what we train our model on, and the test dataset will be what we later run the tests on.  
We purposefully don't show the test data to the model during training so that it can be used to see how well our model performs on unseen data, i.e. how well it can generalize.

In [7]:
# Shuffle the rows and hold out 30% of them as the test set (seeded split).
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Create and train the model

In [8]:
# Seed the random number generators before the model is created and trained.
tf.keras.utils.set_random_seed(seed=112)

In [9]:
def create_model(my_learning_rate, n_features=None):
    """Build and compile a small fully connected regression network.

    Args:
        my_learning_rate: Learning rate passed to the Adam optimizer.
        n_features: Number of input features. When omitted, it is derived from
            the notebook-level ``df_train`` as its column count minus one
            (all columns except the prediction target).

    Returns:
        A compiled ``tf.keras.Sequential`` model with two ReLU hidden layers
        (20 and 10 units) and a single-unit output layer, using mean squared
        error as both loss and metric.
    """
    if n_features is None:
        # Default keeps the original behaviour: one column of df_train is the label.
        n_features = len(df_train.columns) - 1

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(20, activation='relu', name='Hidden1'),
        tf.keras.layers.Dense(10, activation='relu', name='Hidden2'),
        tf.keras.layers.Dense(1, name='Output')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=my_learning_rate),
                  loss='mean_squared_error',
                  metrics=[tf.keras.metrics.MeanSquaredError()])
    return model

In [None]:
# Hyperparameters
learning_rate = 0.01
epochs = 20
batch_size = 2

# Create the model
model = create_model(learning_rate)

# Separate the input columns from the column we want to predict.
features = df_train.drop(PREDICT_TARGET, axis=1)
labels = df_train[[PREDICT_TARGET]]

# Fit the model
# Cast to float32 explicitly: a frame that mixes boolean one-hot columns with
# float columns converts to an object-dtype array, which Keras cannot train on.
history = model.fit(
    features.to_numpy(dtype="float32"),
    labels.to_numpy(dtype="float32"),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
import matplotlib.pyplot as plt

# Draw the loss recorded for each training epoch on a single set of axes.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Model Training Loss')
ax.legend()
plt.show()

## Save the artifacts

In [12]:
# Persist everything needed to reuse the model: the network itself, the fitted
# scalers/encoders and the train/test splits.
output_dir = Path(SAVE_FOLDER)
output_dir.mkdir(parents=True, exist_ok=True)

model.save(f"{SAVE_FOLDER}/model.keras")

# Fitted preprocessing objects, one pickle file each.
for pickle_name, artifact in (("scalers.pkl", scalers), ("encoders.pkl", encoders)):
    with open(f"{SAVE_FOLDER}/{pickle_name}", "wb") as handle:
        pickle.dump(artifact, handle)

# Train and test frames (both still contain the target column).
for split_name, frame in (("X_train", df_train), ("X_test", df_test)):
    frame.to_parquet(f"{SAVE_FOLDER}/{split_name}.parquet")