## Import data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Load the feature table and the target column with pandas.
# Y1.csv has no header row, so the column name is supplied at load time.
X1 = pd.read_csv("data/X1.csv")
Y1 = pd.read_csv("data/Y1.csv", names=["revenue"], header=None)

# Drop the "Unnamed: 0" column: its usefulness is unclear and it is not listed
# among the features in the PDF.
X = X1.drop(columns=["Unnamed: 0"])

# Build a training/validation split from X1 (X2 is kept for the real tests).
x_train, x_test, y_train, y_test = train_test_split(
    X, Y1, test_size=0.2, random_state=1
)

X.head()

Unnamed: 0,title,img_url,description,ratings,n_votes,is_adult,production_year,runtime,genres,release_year,studio,img_embeddings,text_embeddings
0,Doc Hollywood,https://m.media-amazon.com/images/M/MV5BNTcwMj...,Doc Hollywood: Directed by Michael Caton-Jones...,6.2,33519.0,0,1991,104,"Comedy,Drama,Romance",1991.0,WB,"[0.47755364, 0.8723433, 1.2153144, 0.3793078, ...","[-0.6110763, 0.33045605, 0.99949366, -0.975092..."
1,Ballets Russes,https://m.media-amazon.com/images/M/MV5BMTIyND...,"Ballets Russes: Directed by Daniel Geller, Day...",7.7,647.0,0,2005,118,"Documentary,History,Music",2005.0,Zeit.,"[0.26525393, 2.4802532, 0.84571683, 0.21649377...","[-0.7825726, 0.43968397, 0.99991065, -0.990713..."
2,Cirque du Freak: The Vampire's Assistant,https://m.media-amazon.com/images/M/MV5BMTI1ND...,Cirque du Freak: The Vampire's Assistant: Dire...,5.8,43497.0,0,2009,119,"Action,Adventure,Fantasy",2009.0,Uni.,"[0.06563655, 1.2579643, 0.25442713, 0.3307045,...","[-0.5942408, 0.25803727, 0.999134, -0.9643456,..."
3,Deadfall,https://m.media-amazon.com/images/M/MV5BMTg2OD...,Deadfall: Directed by Jerry Hopper. With Van J...,7.7,16.0,0,1959,30,Western,1993.0,Trim.,"[0.23510928, 4.0279293, 0.6071904, 0.2641873, ...","[-0.6360042, 0.3744305, 0.99964553, -0.9814805..."
4,Nostradamus,https://m.media-amazon.com/images/M/MV5BZGJkZD...,"Nostradamus: Directed by Richard Ross, John Ti...",5.7,7.0,0,2003,\N,"Documentary,History,Mystery",1994.0,OrionC,"[0.3245862, 1.5333436, 0.24276002, 0.055327218...","[-0.74996245, 0.42981246, 0.9998112, -0.990223..."


### Compute the Root Mean Square Error

In [3]:
def compute_rmse(predict, target):
    """Return the root mean square error between predictions and targets.

    Both inputs are converted to flat float NumPy arrays before comparison, so
    flat vectors of shape (n,), column vectors of shape (n, 1), pandas Series,
    single-column DataFrames and plain lists are all accepted. Converting first
    also means pandas inputs are compared by position and are never re-aligned
    on their index labels (which would silently introduce NaNs).

    Parameters
    ----------
    predict : array-like
        Predicted values.
    target : array-like
        Ground-truth values, with as many elements as ``predict``.

    Returns
    -------
    float
        ``sqrt(mean((target - predict) ** 2))``.
    """
    target = np.asarray(target, dtype=float).ravel()
    predict = np.asarray(predict, dtype=float).ravel()
    diff = target - predict
    # np.mean returns a NumPy scalar, so float() never has to convert an
    # array with ndim > 0 (deprecated since NumPy 1.25).
    return float(np.sqrt(np.mean(diff ** 2)))

## Preprocessing the data

In [4]:
"""
Creating empty DataFrame to start
"""
# Number of rows in the raw feature table; reused by the encoding cells below.
n_samples = len(X)
# Engineered features are accumulated column by column into this frame.
data = pd.DataFrame()

In [5]:
"""
Keeping the directly usable features
"""

# Columns copied from X into the feature table without any transformation.
directly_usable_features = ["ratings", "n_votes", "is_adult"]
for column_name in directly_usable_features:
    data[column_name] = X[column_name]

In [6]:
"""
Dealing with the "production_year" feature
"""

# Number of one-hot "period" columns created when a period style is selected.
n_year_period = 5
# Binning strategy for the production year:
#   "per_quantile"      -> thresholds at the i / n_year_period quantiles
#   "per_period_length" -> thresholds evenly spaced between the min and max year
#   "no_period"         -> keep the raw year as a single column
style = "per_quantile"
# style = "per_period_length"
# style = "no_period"


# Remove the column(s) a previous run of this cell produced for "production_year".
stale_columns = [
    name
    for name in data.columns
    if len(name) >= 8 and (name.startswith("period") or name == "production_year")
]
data.drop(columns=stale_columns, inplace=True)

# Create the new column(s) for the "production_year" feature.
prod_year = X["production_year"].copy()
if style in ("per_quantile", "per_period_length"):
    if style == "per_quantile":
        thresholds = prod_year.quantile(np.arange(1, n_year_period) / n_year_period)
    else:
        year_min = np.min(prod_year)
        year_max = np.max(prod_year)
        thresholds = year_min + (year_max - year_min) * np.arange(1, n_year_period) / n_year_period

    # reached[0] is all ones; reached[k] flags the samples whose year is at or
    # above the k-th threshold.
    reached = np.ones((n_year_period, n_samples))
    for row, threshold in enumerate(thresholds, start=1):
        reached[row] = (prod_year >= threshold).astype(int)

    # A sample belongs to period k when it reached threshold k but not k + 1.
    categories = reached.copy()
    categories[:-1] -= reached[1:]

    for period in range(n_year_period):
        data["period {}".format(period)] = categories[period]
elif style == "no_period":
    data["production_year"] = prod_year

In [7]:
"""
Dealing with the "runtime" feature

The problem here is that some values are missing, so we have to handle them.
"""

# TODO: consider smarter imputation strategies.

# replace_type = "zero"
replace_type = "mean"

runtime = X["runtime"].copy()
# Entries equal to the string "\N" are the missing runtimes.
is_missing = runtime == "\\N"
if replace_type == "zero":
    runtime[is_missing] = 0
if replace_type == "mean":
    # Mean over the entries that are actually present.
    mean = runtime[~is_missing].astype(float).mean()
    runtime[is_missing] = mean
data["runtime"] = runtime.astype(float)

In [8]:
"""
Dealing with the "studio" feature


Adding every studio as its own feature seemed a bit heavy (there are 509 of them), so PCA is applied to the one-hot encoding.
Not sure whether this is standard practice? (The dimensionality-reduction algorithm can also be swapped if we want.)
"""

use_PCA = True
dim = 20  # output dimension of the PCA projection (only used when use_PCA is True)


# Remove the column(s) a previous run of this cell produced for "studio".
stale_columns = [name for name in data.columns if name.startswith("studio_PC_")]
data = data.drop(columns=stale_columns)

# One-hot encode the studio labels: one row per label, one column per sample.
studio = X["studio"].copy()
studio_labels = np.unique(studio)
studio_features = np.zeros((len(studio_labels), n_samples))
for i, label in enumerate(studio_labels):
    studio_features[i] = (studio == label).astype(int)

# Optionally project the one-hot encoding onto its first `dim` principal components.
if use_PCA:
    # A fixed seed keeps the projection identical across runs when scikit-learn
    # selects its randomized SVD solver.
    pca = PCA(n_components=dim, random_state=1)
    out = pca.fit_transform(studio_features.T)
else:
    out = studio_features.T

# Name the columns after what was actually produced: without PCA there is one
# column per studio label, not `dim` columns.
studio_columns = ["studio_PC_{}".format(i) for i in range(out.shape[1])]

# Append all columns in a single concat; inserting hundreds of columns one at a
# time fragments the frame and emits a PerformanceWarning for each insert.
data = pd.concat(
    [data, pd.DataFrame(out, columns=studio_columns, index=X.index)],
    axis=1,
)

In [9]:
"""
Dealing with the "genres" feature

One feature is simply added per genre; hopefully that is enough? -> maybe apply dimensionality reduction here as well?
Some films have no assigned genre (genre = "\\N"),
and the code below treats that as a genre in its own right; maybe it could be handled differently?
"""

all_genres = X["genres"].copy()

# Distinct genre tags, in order of first appearance across the sorted unique
# genre strings (dict.fromkeys de-duplicates while preserving order).
diff_genres = list(
    dict.fromkeys(
        tag for combo in np.unique(all_genres) for tag in combo.split(",")
    )
)

# Split every row once, then emit one 0/1 indicator column per genre tag.
genre_sets = [set(combo.split(",")) for combo in all_genres]
for genre in diff_genres:
    data[genre] = [int(genre in tags) for tags in genre_sets]

In [10]:
"""
Dealing with the "text_embeddings" feature


The embedding space is too high-dimensional -> dimensionality reduction
Only PCA is used here for now as well
"""

output_dim = 20  # output dimension of PCA


# Remove the column(s) a previous run of this cell produced for "text_embeddings".
stale_columns = [
    name for name in data.columns if name.startswith("text_embedding_PC_")
]
data.drop(columns=stale_columns, inplace=True)

# Each entry is a string of the form "[v0, v1, ...]": strip the brackets and
# parse the comma-separated values into one float row per sample.
text_embeddings = X["text_embeddings"]
embeddings = np.array(
    [
        [float(value) for value in text_embedding[1:-1].split(",")]
        for text_embedding in text_embeddings
    ],
    dtype=float,
)
# Width of the embedding space, read from the data instead of being hard-coded.
input_dim = embeddings.shape[1]

# Apply PCA. A fixed seed keeps the projection identical across runs when
# scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=output_dim, random_state=1)
output = pca.fit_transform(embeddings)

data[["text_embedding_PC_{}".format(i) for i in range(output_dim)]] = output

In [11]:
"""
Dealing with the "img_embeddings" feature


The embedding space is too high-dimensional -> dimensionality reduction
Only PCA is used here for now as well
"""

output_dim = 20  # output dimension of PCA


# Remove the column(s) a previous run of this cell produced for "img_embeddings".
stale_columns = [
    name for name in data.columns if name.startswith("img_embedding_PC_")
]
data.drop(columns=stale_columns, inplace=True)

# Each entry is a string of the form "[v0, v1, ...]": strip the brackets and
# parse the comma-separated values into one float row per sample.
img_embeddings = X["img_embeddings"]
embeddings = np.array(
    [
        [float(value) for value in img_embedding[1:-1].split(",")]
        for img_embedding in img_embeddings
    ],
    dtype=float,
)
# Width of the embedding space, read from the data instead of being hard-coded.
input_dim = embeddings.shape[1]

# Apply PCA. A fixed seed keeps the projection identical across runs when
# scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=output_dim, random_state=1)
output = pca.fit_transform(embeddings)

data[["img_embedding_PC_{}".format(i) for i in range(output_dim)]] = output

In [12]:
# Print the assembled feature table for a quick visual check.
print(data)

      ratings   n_votes  is_adult  period 0  period 1  period 2  period 3  \
0         6.2   33519.0         0       0.0       1.0       0.0       0.0   
1         7.7     647.0         0       0.0       0.0       0.0       1.0   
2         5.8   43497.0         0       0.0       0.0       0.0       0.0   
3         7.7      16.0         0       1.0       0.0       0.0       0.0   
4         5.7       7.0         0       0.0       0.0       0.0       1.0   
...       ...       ...       ...       ...       ...       ...       ...   
4007      7.1  118129.0         0       0.0       0.0       0.0       0.0   
4008      5.1    1938.0         0       0.0       1.0       0.0       0.0   
4009      6.8   22004.0         0       0.0       0.0       0.0       0.0   
4010      4.5    2653.0         0       0.0       1.0       0.0       0.0   
4011      6.5   20789.0         0       0.0       0.0       0.0       0.0   

      period 4     runtime  studio_PC_0  ...  img_embedding_PC_10  \
0     