## Importing packages and functions

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import copy
warnings.simplefilter("ignore")

## Preprocessing the data
### removing unused features

In [84]:
# Load the feature table and the targets into DataFrames.
# Y1.csv has no header row, so its single column is named while loading.
X1 = pd.read_csv("data/X1.csv")
Y1 = pd.read_csv("data/Y1.csv", names=['revenue'], header=None)

# Remove the columns that the preprocessing functions of this notebook never read.
X = X1.drop(columns=['Unnamed: 0', 'img_url', 'description', 'is_adult'])

### Preprocessing functions for each used feature

In [85]:
# Start from an empty DataFrame and record the number of rows of X,
# which the feature builders below use to size their arrays.
data = pd.DataFrame()
n_samples = len(X)

In [86]:
def get_directrly_usable_features(df):
    """Keep the directly usable features.

    Assigns the "ratings" and "n_votes" columns of the global ``X`` to ``df``
    (which is modified in place) and returns ``df``.
    """
    for column in ("ratings", "n_votes"):
        df[column] = X[column]
    return df

In [87]:
def get_prod_year_feature(df, params):
    """Encode the "production_year" column of the global ``X`` into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame receiving the encoded column(s). It is modified in place.
    params : dict
        ``params["production_year_style"]`` selects the encoding:

        * ``"no_period"``: the raw year is stored in a "production_year" column;
        * ``"per_quantile"``: the years are one-hot encoded into
          ``params["n_year_period"]`` columns ("period 0", "period 1", ...)
          whose boundaries are quantiles of the years;
        * ``"per_period_length"``: same one-hot encoding, with boundaries
          equally spaced between the smallest and the largest year.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.

    Raises
    ------
    ValueError
        If the style is not one of the three above, or if ``n_year_period``
        is smaller than 1.
    """
    style = params["production_year_style"]
    if style not in ("per_quantile", "per_period_length", "no_period"):
        # Previously an unknown style silently produced no year feature at all.
        raise ValueError(
            "unknown production_year_style {!r}: expected 'per_quantile', "
            "'per_period_length' or 'no_period'".format(style)
        )

    # Remove the column(s) computed by a previous call on the same frame.
    stale = [
        feature for feature in df.columns
        if len(feature) >= 8 and (feature[:6] == "period" or feature == "production_year")
    ]
    df.drop(columns=stale, inplace=True)

    prod_year = X["production_year"].copy()
    if style == "no_period":
        df["production_year"] = prod_year
        return df

    n_year_period = int(params["n_year_period"])
    if n_year_period < 1:
        raise ValueError("n_year_period must be at least 1, got {}".format(n_year_period))

    # Interior boundaries: n_year_period - 1 thresholds split the years into periods.
    fractions = np.arange(1, n_year_period) / n_year_period
    if style == "per_quantile":
        thresholds = prod_year.quantile(fractions).to_numpy()
    else:
        lowest, highest = np.min(prod_year), np.max(prod_year)
        thresholds = lowest + (highest - lowest) * fractions
    thresholds = np.asarray(thresholds, dtype=float)

    years = prod_year.to_numpy()
    # at_least[k] flags the samples whose year is >= the k-th boundary
    # (row 0 has no lower boundary, so it is all ones).
    at_least = np.ones((n_year_period, len(years)))
    at_least[1:] = years[np.newaxis, :] >= thresholds[:, np.newaxis]
    # A sample belongs to period k when it reaches boundary k but not boundary k + 1.
    categories = at_least.copy()
    categories[:-1] -= at_least[1:]

    for period in range(n_year_period):
        df["period {}".format(period)] = categories[period]
    return df

In [88]:
def get_runtime_feature(df, params):
    """Add a numeric "runtime" column to ``df`` from the global ``X``.

    Missing runtimes are stored in ``X["runtime"]`` as the string ``"\\N"``.
    They are replaced according to ``params["runtime_replace_type"]``:
    ``"zero"`` (0), ``"mean"`` or ``"median"`` (statistic of the known
    runtimes). ``df`` is modified in place and returned.

    Raises
    ------
    ValueError
        If the replace type is not "zero", "mean" or "median".
    """
    # TODO: consider smarter imputation strategies.
    replace_type = params["runtime_replace_type"]
    if replace_type not in ("zero", "mean", "median"):
        # Fail early with a clear message instead of a float-conversion error
        # on "\\N" (or a silent pass-through when nothing is missing).
        raise ValueError(
            "unknown runtime_replace_type {!r}: expected 'zero', 'mean' or 'median'".format(replace_type)
        )

    runtime = X["runtime"].copy()
    missing = runtime == "\\N"
    if replace_type == "zero":
        fill_value = 0
    else:
        known = runtime[~missing].astype(float)
        fill_value = np.mean(known) if replace_type == "mean" else np.median(known)
    runtime[missing] = fill_value

    df["runtime"] = runtime.astype(float)
    return df

In [89]:
def get_studio_feature(df, params):
    """Encode the "studio" column of the global ``X`` into ``df``.

    The studios are first one-hot encoded. Then:

    * if ``params["studio_use_PCA"]`` is true, the one-hot matrix is reduced
      to ``params["studio_PCA_dim"]`` principal components;
    * otherwise every studio that appears more than 5 times keeps its own
      column and all the other studios are merged into one last column.

    The result is stored in columns "studio_PC_0", "studio_PC_1", ...
    ``df`` is modified in place and returned.
    """
    use_PCA = params["studio_use_PCA"]
    min_count = 5  # studios seen at most this many times are merged together

    # Remove the columns computed by a previous call on the same frame.
    stale = [feature for feature in df.columns if len(feature) >= 10 and feature[:10] == "studio_PC_"]
    df.drop(columns=stale, inplace=True)

    # One-hot encoding: one row per studio label, one column per sample.
    studio = X["studio"]
    studio_labels = np.unique(studio)
    studio_features = np.zeros((len(studio_labels), len(studio)))
    for i, label in enumerate(studio_labels):
        studio_features[i] = (studio == label).astype(int)

    if use_PCA:
        dim = params["studio_PCA_dim"]
        # random_state makes the projection reproducible when scikit-learn
        # picks a randomized SVD solver.
        pca = PCA(n_components=dim, random_state=0)
        out = pca.fit_transform(studio_features.T)
    else:
        frequent = np.count_nonzero(studio_features, axis=1) > min_count
        normals = studio_features[frequent].T
        # A sample has a 1 here when its studio is one of the rare ones.
        outliers = studio_features[~frequent].sum(axis=0)
        out = np.column_stack([normals, outliers])
        dim = out.shape[1]

    df[["studio_PC_{}".format(i) for i in range(dim)]] = out
    return df

# Lots of warnings are raised when dim is too large or when PCA is not used.
# Not sure how to fix this: switching to pd.concat makes it run extremely slowly.

In [90]:
def get_genre_feature(df):
    """One-hot encode the "genres" column of the global ``X`` into ``df``.

    Each entry of ``X["genres"]`` is a comma-separated list of genres; the
    missing marker ``"\\N"`` is treated as the single genre "Others". One 0/1
    column per distinct genre is added to ``df``, which is modified in place
    and returned. ``X`` itself is left unchanged.
    """
    # Relabel the missing marker on a copy instead of writing into the shared X.
    all_genres = X["genres"].mask(X["genres"] == "\\N", "Others")

    # Distinct genres, in order of first appearance among the sorted unique
    # entries (dict keys keep insertion order).
    diff_genres = dict.fromkeys(
        genre for genres in np.unique(all_genres) for genre in genres.split(",")
    )

    # Split every entry once rather than once per genre.
    genres_per_sample = [genres.split(",") for genres in all_genres]
    for genre in diff_genres:
        df[genre] = [1 if genre in genres else 0 for genres in genres_per_sample]
    return df

In [91]:
def get_text_embedding_feature(df, params):
    """Add a PCA projection of the "text_embeddings" column of the global ``X``.

    Each entry of ``X["text_embeddings"]`` is a string holding comma-separated
    floats between one leading and one trailing delimiter character. The
    parsed vectors are reduced to ``params["text_embedding_PCA_dim"]``
    principal components, stored in columns "text_embedding_PC_0", ...
    ``df`` is modified in place and returned.

    Raises
    ------
    ValueError
        If the embeddings cannot be parsed into a 2-D array (for instance
        when they do not all have the same length).
    """
    output_dim = params["text_embedding_PCA_dim"]  # output dimension of the PCA

    # Remove the columns computed by a previous call on the same frame.
    stale = [feature for feature in df.columns if len(feature) >= 18 and feature[:18] == "text_embedding_PC_"]
    df.drop(columns=stale, inplace=True)

    # The embedding size is taken from the data instead of being hard-coded.
    embeddings = np.array(
        [[float(value) for value in text_embedding[1:-1].split(",")]
         for text_embedding in X["text_embeddings"]],
        dtype=float,
    )
    if embeddings.ndim != 2:
        raise ValueError("text embeddings could not be parsed into a 2-D array")

    # random_state makes the projection reproducible when scikit-learn picks
    # a randomized SVD solver.
    pca = PCA(n_components=output_dim, random_state=0)
    output = pca.fit_transform(embeddings)

    df[["text_embedding_PC_{}".format(i) for i in range(output_dim)]] = output
    return df

In [92]:
def get_img_embedding_feature(df, params):
    """Add a PCA projection of the "img_embeddings" column of the global ``X``.

    Each entry of ``X["img_embeddings"]`` is a string holding comma-separated
    floats between one leading and one trailing delimiter character. The
    parsed vectors are reduced to ``params["img_embedding_PCA_dim"]``
    principal components, stored in columns "img_embedding_PC_0", ...
    ``df`` is modified in place and returned.

    Raises
    ------
    ValueError
        If the embeddings cannot be parsed into a 2-D array (for instance
        when they do not all have the same length).
    """
    output_dim = params["img_embedding_PCA_dim"]  # output dimension of the PCA

    # Remove the columns computed by a previous call on the same frame.
    stale = [feature for feature in df.columns if len(feature) >= 17 and feature[:17] == "img_embedding_PC_"]
    df.drop(columns=stale, inplace=True)

    # The embedding size is taken from the data instead of being hard-coded.
    embeddings = np.array(
        [[float(value) for value in img_embedding[1:-1].split(",")]
         for img_embedding in X["img_embeddings"]],
        dtype=float,
    )
    if embeddings.ndim != 2:
        raise ValueError("image embeddings could not be parsed into a 2-D array")

    # random_state makes the projection reproducible when scikit-learn picks
    # a randomized SVD solver.
    pca = PCA(n_components=output_dim, random_state=0)
    output = pca.fit_transform(embeddings)

    df[["img_embedding_PC_{}".format(i) for i in range(output_dim)]] = output
    return df

## complete preprocessing functions

In [93]:
def create_preprocessed(params):
    """Build the full preprocessed DataFrame for one parameter set.

    The feature builders are applied to a fresh DataFrame in a fixed order:
    genres, directly usable features, production year, studio, runtime,
    text embeddings, image embeddings.
    """
    # The first two builders take no parameters.
    df = get_directrly_usable_features(get_genre_feature(pd.DataFrame()))

    parametrised_builders = (
        get_prod_year_feature,
        get_studio_feature,
        get_runtime_feature,
        get_text_embedding_feature,
        get_img_embedding_feature,
    )
    for builder in parametrised_builders:
        df = builder(df, params)
    return df

In [94]:
def create_preprocessed_dict(params):
    """Precompute each feature group for every candidate parameter value.

    ``params`` maps each preprocessing parameter to a list of candidate
    values. The returned dict has the keys "year", "studio", "runtime",
    "text" and "img" (each a dict from a parameter key to the DataFrame built
    for it) plus "genre" and "base" (a single DataFrame each).
    """
    def build(builder, **settings):
        # Each feature group is built on its own fresh DataFrame.
        return builder(pd.DataFrame(), settings)

    final = {}
    final["year"] = {
        style + str(n_periods): build(
            get_prod_year_feature, production_year_style=style, n_year_period=n_periods
        )
        for style in params["production_year_style"]
        for n_periods in params["n_year_period"]
    }
    final["studio"] = {
        str(use_pca) + str(dim): build(
            get_studio_feature, studio_use_PCA=use_pca, studio_PCA_dim=dim
        )
        for use_pca in params["studio_use_PCA"]
        for dim in params["studio_PCA_dim"]
    }
    final["runtime"] = {
        replace_type: build(get_runtime_feature, runtime_replace_type=replace_type)
        for replace_type in params["runtime_replace_type"]
    }
    final["text"] = {
        dim: build(get_text_embedding_feature, text_embedding_PCA_dim=dim)
        for dim in params["text_embedding_PCA_dim"]
    }
    final["img"] = {
        dim: build(get_img_embedding_feature, img_embedding_PCA_dim=dim)
        for dim in params["img_embedding_PCA_dim"]
    }
    final["genre"] = get_genre_feature(pd.DataFrame())
    final["base"] = get_directrly_usable_features(pd.DataFrame())
    return final

In [95]:
def create_preprocessed_from_dict(params_dict, params):
    """Assemble a preprocessed DataFrame from precomputed feature groups.

    ``params_dict`` is a dict shaped like the output of
    ``create_preprocessed_dict`` and ``params`` holds one value per
    preprocessing parameter. The selected frames are concatenated
    column-wise in the order genre, base, year, studio, runtime, text, img.
    """
    selected = [
        params_dict["genre"],
        params_dict["base"],
        params_dict["year"][params["production_year_style"] + str(params["n_year_period"])],
        params_dict["studio"][str(params["studio_use_PCA"]) + str(params["studio_PCA_dim"])],
        params_dict["runtime"][params["runtime_replace_type"]],
        params_dict["text"][params["text_embedding_PCA_dim"]],
        params_dict["img"][params["img_embedding_PCA_dim"]],
    ]
    return pd.concat(selected, axis=1)

## Examples of preprocessing

In [96]:
# Example: build one preprocessed dataset directly from a single parameter set.
params = dict(
    production_year_style="per_quantile",  # "per_quantile" / "per_period_length" / "no_period"
    n_year_period=5,
    runtime_replace_type="mean",  # "mean" / "zero" / "median"
    studio_use_PCA=True,
    studio_PCA_dim=50,
    text_embedding_PCA_dim=50,
    img_embedding_PCA_dim=50,
)

preprocessed_data = create_preprocessed(params)

In [97]:
# Using the preprocessed dictionary -> useful to avoid recomputing the whole preprocessing at each iteration of the local search later

"""
all_params = {
    "production_year_style" : ["per_quantile", "per_period_length", "no_period"], # "per_quantile" / "per_period_length" / "no_period"
    "n_year_period" : [3, 5, 10],
    "runtime_replace_type" : ["mean", "zero"], # "mean" / "zero" / "median"
    "studio_use_PCA" : [True],
    "studio_PCA_dim" : [1, 10],
    "text_embedding_PCA_dim" : [1, 10],
    "img_embedding_PCA_dim" : [1, 10]
}

params = {
    "production_year_style" : "per_quantile", # "per_quantile" / "per_period_length" / "no_period"
    "n_year_period" : 5,
    "runtime_replace_type" : "mean", # "mean" / "zero"
    "studio_use_PCA" : True,
    "studio_PCA_dim" : 1,
    "text_embedding_PCA_dim" : 10,
    "img_embedding_PCA_dim" : 10
}

preprocessed_dict = create_preprocessed_dict(all_params)
preprocessed_data = create_preprocessed_from_dict(preprocessed_dict, params)
preprocessed_data_2 = create_preprocessed(params)
"""

'\nall_params = {\n    "production_year_style" : ["per_quantile", "per_period_length", "no_period"], # "per_quantile" / "per_period_length" / "no_period"\n    "n_year_period" : [3, 5, 10],\n    "runtime_replace_type" : ["mean", "zero"], # "mean" / "zero" / "median"\n    "studio_use_PCA" : [True],\n    "studio_PCA_dim" : [1, 10],\n    "text_embedding_PCA_dim" : [1, 10],\n    "img_embedding_PCA_dim" : [1, 10]\n}\n\nparams = {\n    "production_year_style" : "per_quantile", # "per_quantile" / "per_period_length" / "no_period"\n    "n_year_period" : 5,\n    "runtime_replace_type" : "mean", # "mean" / "zero"\n    "studio_use_PCA" : True,\n    "studio_PCA_dim" : 1,\n    "text_embedding_PCA_dim" : 10,\n    "img_embedding_PCA_dim" : 10\n}\n\npreprocessed_dict = create_preprocessed_dict(all_params)\npreprocessed_data = create_preprocessed_from_dict(preprocessed_dict, params)\npreprocessed_data_2 = create_preprocessed(params)\n'

In [98]:
# preprocessed_data.head()

In [99]:
# preprocessed_data_2.head()

# Defining our loss function


In [100]:
# Score computation : Root Mean Square Error

def compute_rmse(predict, target):
    """Return the negated root mean squared error between two arrays.

    The result is negated so that a larger value means a smaller error.
    The two arguments are passed to ``mean_squared_error`` in the order
    received.
    """
    # Take the square root explicitly instead of passing squared=False:
    # that argument is deprecated, then removed, in recent scikit-learn
    # releases, where the former call raises a TypeError.
    return -np.sqrt(mean_squared_error(predict, target))

def compute_rmse2(predict, target):
    """Return the negated root mean squared error, computed with NumPy only.

    Parameters
    ----------
    predict, target : array-like
        Values to compare. 2-D inputs are squeezed first, so a column of
        shape (n, 1) can be compared with a vector of shape (n,).

    Returns
    -------
    float
        ``-sqrt(mean((target - predict) ** 2))``.

    Raises
    ------
    ValueError
        If the difference still has more than one dimension after squeezing.
    """
    target = np.asarray(target, dtype=float)
    predict = np.asarray(predict, dtype=float)
    if target.ndim == 2:
        target = target.squeeze()
    if predict.ndim == 2:
        predict = predict.squeeze()

    # atleast_1d covers the single-sample case, where squeezing yields a 0-d array.
    diff = np.atleast_1d(target - predict)
    if diff.ndim != 1:
        raise ValueError("compute_rmse2 expects single-output predictions and targets")

    # The mean of squares gives a scalar directly; calling float() on a 1x1
    # matrix product is deprecated in recent NumPy versions.
    return -float(np.sqrt(np.mean(np.square(diff))))

# Scorer used for model selection. compute_rmse returns a negated RMSE,
# so a larger score corresponds to a smaller error.
custom_scorer = make_scorer(compute_rmse)

# Optimizing the model parameters

## Our homemade mixed localsearch + randomizedsearch algorithm for the preprocessing parameters

To look for the best regressor, we use a local search algorithm that explores the preprocessing parameters defined above. For each candidate preprocessing, the algorithm performs a RandomizedSearch with 5-fold cross-validation to select the best parameters of the given regressor. The possible values for each preprocessing and regressor parameter have to be given as input to the algorithm.

NB: You might want to lower the threshold on `no_upgrade` in the `compute` method (by choosing a small value of `max_no_upgrade`) when running the search on a regressor that is time-consuming to fit, such as an MLP.

In [101]:
class Solution:
    """Outcome of evaluating one preprocessing configuration.

    Stores the preprocessing parameters (``params``), the regressor
    parameters (``grid_params``), the score and the search object
    (``RSCV``) exactly as they were passed in.
    """

    def __init__(self, params, grid_params, score, RSCV):
        self.grid_params, self.params, self.score, self.RSCV = grid_params, params, score, RSCV

class LocalSearch:
    """Local search over the preprocessing parameters.

    Every visited preprocessing configuration is scored by a
    ``RandomizedSearchCV`` (5-fold, ``custom_scorer``) over ``grid_params``
    for the given regressor. Scores follow the "larger is better" convention
    of ``custom_scorer``.

    Parameters
    ----------
    regressor : estimator
        Regressor handed to ``RandomizedSearchCV``.
    params : dict
        Maps each preprocessing parameter to its list of candidate values.
    grid_params : dict
        Regressor parameter distributions handed to ``RandomizedSearchCV``.
    max_no_upgrade : int
        The search stops once the number of consecutive iterations without
        improvement of the best score exceeds this value.

    Attributes
    ----------
    curr_score, best_score : list
        Score of the current / best solution after each iteration.
    visited : list of str
        Signatures of the configurations evaluated so far.
    """

    def __init__(self, regressor, params, grid_params, max_no_upgrade):
        self.curr_score = []
        self.best_score = []
        self.regressor = regressor
        self.params = params
        # All feature groups are precomputed once, so that evaluating a
        # configuration only needs a column-wise concatenation.
        self.preprocessed_dict = create_preprocessed_dict(params)
        self.grid_params = grid_params
        self.visited = []
        self.max_no_upgrade = max_no_upgrade

    def clean(self, params):
        """Return a copy of ``params`` without the parameters that have no effect.

        "n_year_period" is dropped when the year style is "no_period" and
        "studio_PCA_dim" is dropped when PCA is not used for the studio.
        """
        copied = copy.deepcopy(params)
        if copied["production_year_style"] == "no_period":
            copied.pop("n_year_period")
        if not copied["studio_use_PCA"]:
            copied.pop("studio_PCA_dim")
        return copied

    def is_visited(self, params):
        """Tell whether an equivalent configuration has already been evaluated."""
        return str(self.clean(params)) in self.visited

    def visit(self, params):
        """Record the signature of ``params`` as evaluated."""
        self.visited.append(str(self.clean(params)))

    def get_solution(self, params):
        """Evaluate one preprocessing configuration and mark it as visited."""
        preprocessed_data = copy.deepcopy(create_preprocessed_from_dict(self.preprocessed_dict, params))
        # Use the regressor given to this instance; the former code read a
        # global variable that only happened to hold the same object.
        RSCV = RandomizedSearchCV(self.regressor, self.grid_params, cv=5, scoring=custom_scorer, n_jobs=-1, random_state=0)
        RSCV.fit(preprocessed_data, Y1)
        current_score = np.mean(RSCV.best_score_)
        self.visit(params)
        return Solution(params, RSCV.best_params_, current_score, RSCV)

    def initiate(self):
        """Evaluate a random starting configuration."""
        current_params = {}
        for param, candidates in self.params.items():
            # Index into the list so that the stored value keeps its native
            # Python type. np.random.choice would return NumPy scalars, whose
            # repr can differ from the native one and make the same
            # configuration look unvisited.
            current_params[param] = candidates[np.random.randint(len(candidates))]
        self.current_solution = self.get_solution(current_params)
        self.best_solution = copy.deepcopy(self.current_solution)

    def transitions(self):
        """Return the unvisited neighbours of the current solution.

        For each parameter, the candidate values are tried in random order
        and the first one leading to an unvisited configuration is kept, so
        at most one neighbour per parameter is proposed.
        """
        neighbors = []
        current_params = self.current_solution.params
        for param in self.params.keys():
            copied = copy.deepcopy(self.params[param])
            np.random.shuffle(copied)
            for value in copied:
                new_params = copy.deepcopy(current_params)
                new_params[param] = value
                if not self.is_visited(new_params):
                    neighbors.append(new_params)
                    break
        return neighbors

    def choose(self, neighbors, n):
        """Return at most ``n`` neighbours, picked at random without replacement."""
        if len(neighbors) <= n:
            return neighbors
        picked = np.random.choice(len(neighbors), size=n, replace=False)
        # Always return a plain list, whatever the number of neighbours.
        return [neighbors[index] for index in picked]

    def get_difference(self, other):
        """Return (name, current value, other value) for the first differing parameter.

        Returns None when ``other`` matches the current solution on every
        parameter.
        """
        first = self.current_solution.params
        for param in first.keys():
            if first[param] != other[param]:
                return param, first[param], other[param]
        return None

    def compute(self):
        """Run the local search.

        The result is left in ``self.best_solution``; ``self.curr_score`` and
        ``self.best_score`` record the score history.
        """
        self.initiate()
        print("INITIALISATION ENDED : initial score of {:.2e} $".format(self.best_solution.score))
        no_upgrade = 0
        while no_upgrade <= self.max_no_upgrade:
            neighbors = self.transitions()
            if len(neighbors) == 0:
                break
            neighbors = self.choose(neighbors, 5)
            curr_best_sol = Solution(None, None, -float("inf"), None)
            for neighbor in neighbors:
                solution = self.get_solution(neighbor)
                if solution.score > curr_best_sol.score:
                    curr_best_sol = solution
            if curr_best_sol.params is None:
                # No neighbour produced a comparable score (e.g. all NaN):
                # there is nothing to move to.
                break
            # Always move to the best neighbour, even if it is worse than the
            # current solution.
            self.current_solution = copy.deepcopy(curr_best_sol)
            if curr_best_sol.score > self.best_solution.score:
                self.best_solution = copy.deepcopy(curr_best_sol)
                print("SOLUTION UPGRADED : new score of {:.2e} $".format(self.best_solution.score))
                no_upgrade = 0
            else:
                # Only iterations without improvement count; previously the
                # counter was also incremented right after being reset.
                no_upgrade += 1

            self.curr_score.append(self.current_solution.score)
            self.best_score.append(self.best_solution.score)

## Some nice printing and plotting functions for the results

In [102]:
def _print_table(title, mapping):
    """Print ``mapping`` as a left-aligned PARAMETER / VALUE table under ``title``."""
    rows = [["PARAMETER", "VALUE"]] + [[key, value] for key, value in mapping.items()]
    col_widths = [max(len(str(row[i])) for row in rows) for i in range(len(rows[0]))]
    print(title)
    for row in rows:
        print(' '.join(str(cell).ljust(col_widths[i]) for i, cell in enumerate(row)))


def print_solution(regressor, solution):
    """Print a summary of a local-search solution.

    Parameters
    ----------
    regressor : str
        Name of the regressor, used in the first printed line.
    solution : Solution
        Provides ``score`` (a negated RMSE), ``RSCV`` (the fitted search),
        ``params`` (preprocessing parameters) and ``grid_params`` (regressor
        parameters).
    """
    print(f"The {regressor} Regressor reaches mean RMSE of {-solution.score:.4e} $.")
    # 'std_test_score' is a standard deviation, so it is reported as such
    # (it used to be labelled "variance").
    std = solution.RSCV.cv_results_['std_test_score'][solution.RSCV.best_index_]
    print(f"This RMSE has a standard deviation of {std:.4e}")
    _print_table("\nPreprocessing :", solution.params)
    _print_table("\nRegressor :", solution.grid_params)

In [103]:
def print_evolution(LS, name):
    """Plot the score history of a local search and save it as ``LS{name}.png``.

    The stored scores are negated before plotting, so the curves show the
    RMSE of the current solution (dashed blue) and of the best solution (red)
    at each iteration.
    """
    current_rmse = -np.array(LS.curr_score)
    best_rmse = -np.array(LS.best_score)
    iterations = np.arange(1, len(current_rmse) + 1)

    plt.plot(iterations, current_rmse, linestyle="--", color="blue", label="current solution")
    plt.plot(iterations, best_rmse, color="red", label="best solution")

    plt.title("Local search score evolution", fontsize=20)
    plt.xlabel("iter")
    plt.ylabel("avg RMSE on 5-fold")
    plt.legend()
    plt.grid()

    plt.savefig(f"LS{name}.png")
    plt.show()

def print_last_RSCV_evolution(solution, name):
    """Plot the mean test score of each randomized-search candidate.

    The scores stored in ``solution.RSCV`` are negated before plotting and
    the figure is saved as ``RSCV{name}.png``.
    """
    mean_scores = solution.RSCV.cv_results_['mean_test_score']
    candidates = range(1, len(mean_scores) + 1)

    # One point per candidate of the search, in the order they are stored.
    plt.plot(candidates, -mean_scores)
    plt.title("Randomized search score evolution")
    plt.ylabel('RMSE')
    plt.xlabel('iter')
    plt.grid()

    plt.savefig(f"RSCV{name}.png")
    plt.show()

## Choosing our preprocessing parameters possibilities for the optimization

In [104]:
# Candidate values explored by the local search for each preprocessing parameter.
preprocessing_params = {
    # "per_quantile" / "per_period_length" / "no_period"
    "production_year_style": ["per_quantile", "per_period_length", "no_period"],
    "n_year_period": [3, 5, 10],
    # "mean" / "zero" / "median"
    "runtime_replace_type": ["mean", "zero", "median"],
    "studio_use_PCA": [False],
    # This parameter has no effect, since PCA is finally not used on the studio feature.
    "studio_PCA_dim": [10],
    "text_embedding_PCA_dim": [1, 10, 20, 50, 100],
    "img_embedding_PCA_dim": [1, 10, 20, 50, 100],
}

## Optimizing an OLS Regressor

In [None]:
# OLS has no regressor parameter to tune here, but the randomized search
# needs a non-empty grid, so a single fixed value is supplied.
regressor_params = {"fit_intercept": [False]}

regressor = LinearRegression()
max_no_upgrade = 20

LSOLS = LocalSearch(regressor, preprocessing_params, regressor_params, max_no_upgrade)
LSOLS.compute()

INITIALISATION ENDED : initial score of -5.76e+07 $
SOLUTION UPGRADED : new score of -5.67e+07 $


In [None]:
# Report the best configuration found by the OLS local search.
solution = LSOLS.best_solution
print_solution("OLS", solution)

In [None]:
# Score history of the OLS local search (saved as LSOLS.png).
print_evolution(LSOLS, "OLS")
# print_last_RSCV_evolution(solution, "OLS") # this plot is useless since no parameter optimisation has to be done for OLS

## Optimizing a Ridge Regressor

In [None]:
# Candidate regularisation strengths for the Ridge regressor.
regressor_params = {"alpha": [1e-1, 5e-1, 1, 5, 10, 20]}

max_no_upgrade = 20
regressor = Ridge()

LSRidge = LocalSearch(regressor, preprocessing_params, regressor_params, max_no_upgrade)
LSRidge.compute()

In [None]:
# Report the best configuration found by the Ridge local search.
solution = LSRidge.best_solution
# Raw best score of the randomized search (a negated RMSE).
print(solution.RSCV.best_score_)
print_solution("Ridge", solution)

In [None]:
# Score history of the local search and of the randomized search behind the best solution.
print_evolution(LSRidge, "Ridge")
print_last_RSCV_evolution(solution, "Ridge")

## Optimizing a KNN regressor

In [None]:
# Candidate neighbourhood sizes and weighting schemes for the KNN regressor.
regressor_params = {
    'n_neighbors': [2, 5, 10, 20, 50],
    'weights': ['uniform', 'distance'],
}

regressor = KNeighborsRegressor()
max_no_upgrade = 20

LSKNN = LocalSearch(regressor, preprocessing_params, regressor_params, max_no_upgrade)
LSKNN.compute()

In [None]:
# Report the best configuration found by the KNN local search.
solution = LSKNN.best_solution
print_solution("KNN", solution)

In [None]:
# Score history of the local search and of the randomized search behind the best solution.
# print_evolution requires the name used for the saved figure; omitting it raises a TypeError.
print_evolution(LSKNN, "KNN")
print_last_RSCV_evolution(solution, "KNN")

## Optimizing a MLP regressor

In [None]:
# Candidate architectures and activations for the MLP regressor;
# the solver and the iteration budget are fixed.
regressor_params = {
    'hidden_layer_sizes': [(50, 50), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'max_iter': [500],
}

regressor = MLPRegressor(random_state=0)
max_no_upgrade = 10

LSMLP = LocalSearch(regressor, preprocessing_params, regressor_params, max_no_upgrade)
LSMLP.compute()

In [None]:
# Report the best configuration found by the MLP local search.
solution = LSMLP.best_solution
print_solution("MLP", solution)

In [None]:
# Score history of the local search and of the randomized search behind the best solution.
print_evolution(LSMLP, "MLP")
print_last_RSCV_evolution(solution, "MLP")

## Optimizing a Random Forest regressor

In [None]:
regressor_params = {
    "n_estimators": [50, 100, 200],
    "criterion": ["squared_error"],  # ["squared_error", "absolute_error", "poisson"]
    # A float value is a fraction of the samples: 1.0 required a node to hold
    # every sample before it could be split, which all but disables
    # splitting. 2 is the smallest valid integer (1 is rejected by scikit-learn).
    "min_samples_split": [2, 3],
    "max_features": ["sqrt", "log2"],  # ["sqrt", "log2", None]
}

max_no_upgrade = 10
regressor = RandomForestRegressor(n_jobs=-1, random_state=0)
LSRF = LocalSearch(regressor, preprocessing_params, regressor_params, max_no_upgrade)
LSRF.compute()

In [None]:
# Report the best configuration found by the Random Forest local search.
solution = LSRF.best_solution
print_solution("Random Forest", solution)

In [None]:
# Score history of the local search and of the randomized search behind the best solution.
print_evolution(LSRF, "RF")
print_last_RSCV_evolution(solution, "RF")

## Our own MLP implementation with torch

In [None]:
# Reuse the preprocessing selected by the OLS local search. It is stored under
# its own name so that the search grid `preprocessing_params` is not
# overwritten and the cells above can still be re-run.
best_preprocessing_params = LSOLS.best_solution.params

# A fixed random_state keeps the train/test split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    create_preprocessed(best_preprocessing_params), Y1, test_size=0.2, random_state=0
)

In [None]:
import torch

input_size = len(X_train.columns)
output_size = 1

# NOTE(review): num_hidden_layers is not used below; the network has a single hidden layer.
num_hidden_layers = 2
hidden_layer_size = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One hidden layer with ReLU activation, in double precision to match the
# float64 tensors built from the DataFrames.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, hidden_layer_size),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_layer_size, output_size)
).to(device)
model = model.double()
loss_fn = torch.nn.MSELoss()

lr = 1e-3
betas = (0.9, 0.999)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)

num_epochs = 10

inputs = torch.from_numpy(X_train.to_numpy()).to(device).double()
targets = torch.from_numpy(y_train.to_numpy()).to(device).double()


for epoch in range(num_epochs):
    train_loss = []
    # One optimisation step per sample.
    for i in range(0, len(inputs)):
        batch_inputs = inputs[i]
        batch_targets = targets[i]
        # Call the module itself rather than .forward() so that hooks are honoured.
        outputs = model(batch_inputs)
        loss = loss_fn(outputs, batch_targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Store a plain float: keeping the tensor would accumulate one
        # autograd-tracked tensor per sample.
        train_loss.append(loss.item())

    # Print the root of the average squared error for the current epoch
    print(f"Epoch {epoch+1}: Train Loss = {np.sqrt(np.mean(train_loss))}")

In [None]:
inputs_test = torch.from_numpy(X_test.to_numpy()).to(device).double()
targets_test = torch.from_numpy(y_test.to_numpy()).to(device).double()

# Gradients are not needed for evaluation.
with torch.no_grad():
    outputs_test = model(inputs_test)
outputs_test = outputs_test.cpu().numpy()

# compute_rmse returns the negated RMSE (score convention), so it is negated
# back to report a positive error.
print("{:.2e} is the final error".format(-compute_rmse(outputs_test, y_test)))