## Modelo de predicción del Revenue de punta a punta

Abrimos el CSV e importamos algunas librerías.

In [155]:
# Upload the CSV through Colab's file helper; `uploaded` is indexed by file
# name when the CSV is read further down.
from google.colab import files
uploaded = files.upload()

Saving movies_metadata.csv to movies_metadata (3).csv


In [156]:
import pandas as pd 
import numpy as np
import io

In [157]:
# Read the uploaded CSV straight from memory.
# The expected key is tried first; if Colab stored the upload under a renamed
# key (e.g. "movies_metadata (3).csv" after a repeated upload) the first
# uploaded payload is used instead of failing with KeyError.
if not uploaded:
    raise ValueError('No file was uploaded; run the upload cell and select movies_metadata.csv')
csv_bytes = uploaded.get('movies_metadata.csv', list(uploaded.values())[0])
df = pd.read_csv(io.BytesIO(csv_bytes))

  interactivity=interactivity, compiler=compiler, result=result)


###Pre-processing

#### Revenue:
1. Quitamos valores menores o iguales a 0.
2. Filtramos el 5% de los valores más bajos para evitar errores de tipeo.
3. Transformamos la variable aplicando el logaritmo natural.

In [158]:
import math


def revenue_log(x):
    """Return the natural logarithm of a single revenue value."""
    return math.log(x)


# Keep strictly positive revenues, then drop everything at or below the 5th
# percentile of what remains (the quantile is computed after the first filter).
df = df[df.revenue > 0]
# .copy() detaches the filtered frame so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[df.revenue > df.revenue.quantile(.05)].copy()

# Vectorised natural log of the whole column instead of a row-by-row apply.
df['revenue'] = np.log(df['revenue'])

#### Runtime:
1. Sacamos valores nulos (y también los iguales a 0).
2. Sacamos valores que estén a más de 2 desviaciones estándar de la media, por encima o por debajo.

In [159]:
# Nulls in every column become 0, so rows with a missing runtime are removed
# together with rows whose runtime is literally 0.
df.fillna(0, inplace=True)
df = df[df.runtime != 0]

# Acceptance band: mean +/- 2 standard deviations of the remaining runtimes.
runtime_mean = np.mean(df.runtime)
runtime_spread = np.std(df.runtime) * 2
upper_lim = runtime_mean + runtime_spread
lower_lim = runtime_mean - runtime_spread

# Both limits are exclusive.
inside_band = (df.runtime > lower_lim) & (df.runtime < upper_lim)
df = df[inside_band]

#### Release Date:
1. Convertimos a tipo de dato datetime.
2. Nos quedamos solo con el año.
3. Filtramos todas aquellas películas estrenadas en 1985 o antes (nos quedamos con `year > 1985`).

In [160]:
# Parse the release date, derive the release year from it and discard the
# original column once the year has been extracted.
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['year'] = pd.DatetimeIndex(release_dates).year
# Strict comparison: films released in 1985 itself are dropped as well.
df = df[df.year > 1985]
df = df.drop(columns=['release_date'])

#### Géneros, Países y Colecciones:
1. Convertimos los diccionarios a listas, quedándonos solo con el valor que nos interesa (name).
2. En el caso de colecciones, dado que hay más de 700 colecciones con frecuencias bajas, la convertimos en una variable binaria: 1 si es parte de una colección, 0 en caso contrario.

In [161]:
# Unpack the stringified dictionaries into plain lists / flags.
import ast


def _names_from_literal(raw):
    """Parse a stringified list of dicts and return the 'name' of every entry."""
    # ast.literal_eval only accepts Python literals, so text read from the CSV
    # cannot execute arbitrary code the way eval() would.
    return [entry['name'] for entry in ast.literal_eval(raw)]


def collection_from_dict(x):
    """Return 1 when the row carries collection data, 0 otherwise.

    ``x`` is the raw ``belongs_to_collection`` cell. Missing values and the
    0 placeholder map to 0; anything else maps to 1. The cell content is not
    parsed because only its presence is used.
    """
    if x == 0 or pd.isna(x):
        return 0
    return 1


# Genres
df['genres'] = df['genres'].apply(_names_from_literal)

# Production countries
df['production_countries'] = df['production_countries'].apply(_names_from_literal)

# Collections: fill missing values by assignment (a chained in-place fillna on
# a column selection may act on a temporary copy) and convert to a 0/1 flag.
df['belongs_to_collection'] = (
    df['belongs_to_collection'].fillna(0).apply(collection_from_dict)
)


###Feature Selection:
1. Elegimos que variables dropear del modelo.

In [162]:
# Columns that are not used as model inputs.
unused_columns = [
    'adult', 'budget', 'homepage', 'id', 'original_title', 'overview',
    'popularity', 'poster_path', 'production_companies', 'spoken_languages',
    'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count',
]
df = df.drop(columns=unused_columns)


###Train and test split:
1. Cortamos por año (2014) dado que queremos que el modelo aprenda a predecir películas del futuro.

In [163]:
# Temporal hold-out: releases up to and including 2014 train the model,
# later releases are kept for testing.
split_year = 2014
train_df = df.loc[df['year'] <= split_year]
test_df = df.loc[df['year'] > split_year]
n_train, n_test = len(train_df), len(test_df)
# Sizes of both splits and the test/train ratio.
n_train, n_test, n_test / n_train

(5096, 647, 0.12696232339089483)

In [164]:
# Targets: the (log-transformed) revenue column of each split as NumPy arrays.
y_train, y_test = (split['revenue'].values for split in (train_df, test_df))

###Feature Engineering

Vamos a usar todo como listas de diccionarios para poder usar el ecosistema de sklearn de forma sencilla

In [165]:
# One plain dict per film (column name -> value) for each split.
train_docs, test_docs = (
    split.to_dict('records') for split in (train_df, test_df)
)

In [166]:
# Show the first record of each split to check the dict layout.
train_docs[0], test_docs[0]

({'belongs_to_collection': 1,
  'genres': ['Animation', 'Comedy', 'Family'],
  'imdb_id': 'tt0114709',
  'original_language': 'en',
  'production_countries': ['United States of America'],
  'revenue': 19.7385732187406,
  'runtime': 81.0,
  'year': 1995},
 {'belongs_to_collection': 1,
  'genres': ['Action', 'Adventure', 'Thriller'],
  'imdb_id': 'tt2381249',
  'original_language': 'en',
  'production_countries': ['China', 'United States of America'],
  'revenue': 20.341024173461395,
  'runtime': 131.0,
  'year': 2015})

####Feature Géneros:
1. Hacemos variables dummies.

In [167]:
from sklearn.base import BaseEstimator, TransformerMixin

class GenreDummies(BaseEstimator, TransformerMixin):
    """Turn each record's genre list into a ``{genre: 1}`` indicator dict."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one dict per record mapping every name in ``record['genres']`` to 1."""
        return [{genre: 1 for genre in record['genres']} for record in X]

####Features Belongs to Collection y Language:
1. Hacemos una clase que las devuelva como estan.

In [168]:
from sklearn.base import BaseEstimator, TransformerMixin

class BelongsToCollection(BaseEstimator, TransformerMixin):
    """Pass each record's ``belongs_to_collection`` value through unchanged."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one ``{'belongs to collection': value}`` dict per record."""
        return [
            {'belongs to collection': record['belongs_to_collection']}
            for record in X
        ]

In [169]:
class Language(BaseEstimator, TransformerMixin):
    """Pass each record's ``original_language`` value through unchanged."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one ``{'language': value}`` dict per record."""
        return [{'language': record['original_language']} for record in X]

####Feature Runtime:
1. Hacemos una clase que la devuelva tal cual está pero luego la normalizamos en el pipeline con StandardScaler().

In [170]:
class Runtime(BaseEstimator, TransformerMixin):
    """Pass each record's ``runtime`` value through unchanged."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one ``{'runtime': value}`` dict per record."""
        return [{'runtime': record['runtime']} for record in X]

#### Feature años:
1. Armamos una variable que indique hace cuántos años se estrenó la película.

In [171]:
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin

class YearsAgo(BaseEstimator, TransformerMixin):
    """Expose how many years separate each record's ``year`` from the current year."""

    def __init__(self):
        # Captured once at construction time, so the feature values depend on
        # the calendar year in which the notebook is executed.
        self.now = datetime.now().year

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one ``{'years_ago': now - year}`` dict per record."""
        return [{'years_ago': self.now - int(record['year'])} for record in X]

####Feature países:
1. Armamos variables dummies.

In [172]:
class CountryDummies(BaseEstimator, TransformerMixin):
    """Turn each record's production-country list into a ``{country: 1}`` dict."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` / ``fit_transform(X)`` also work
        when no target is supplied.
        """
        return self

    def transform(self, X):
        """Return one dict per record mapping every name in ``record['production_countries']`` to 1."""
        return [
            {country: 1 for country in record['production_countries']}
            for record in X
        ]

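### Performance metrics:
1. Medimos el mean absolute percentage error (MAPE) y el r2 para el train y test set. Nota: la función `test_pipe` definida abajo solo devuelve el MAPE; los valores de r2 que aparecen en algunas salidas provienen de una versión anterior de la función.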

In [209]:
def test_pipe(pipe):
    """Score a fitted pipeline on the notebook's train and test splits.

    Reads the module-level ``train_docs``/``y_train`` and ``test_docs``/``y_test``
    and returns a dict with ``'train_mape'`` and ``'test_mape'``: the mean
    absolute percentage error, in percent, rounded to two decimals.
    """
    def _mape(targets, docs):
        errors = percentage_error(np.asarray(targets), np.asarray(pipe.predict(docs)))
        return round(np.mean(np.abs(errors)) * 100, 2)

    scores = {}
    scores['train_mape'] = _mape(y_train, train_docs)
    scores['test_mape'] = _mape(y_test, test_docs)
    return scores

def percentage_error(actual, predicted):
    """Return the element-wise relative error between ``actual`` and ``predicted``.

    For entries where ``actual`` is non-zero the result is
    ``(actual - predicted) / actual``. Where ``actual`` is zero that ratio is
    undefined, so ``predicted / mean(actual)`` is used instead.

    Parameters
    ----------
    actual, predicted : array-like
        Values with the same number of elements; lists are accepted.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``actual``.
    """
    actual = np.asarray(actual, dtype=float)
    # Align the predictions with the targets, e.g. when a model returns a column.
    predicted = np.asarray(predicted, dtype=float).reshape(actual.shape)

    res = np.empty(actual.shape)
    nonzero = actual != 0
    res[nonzero] = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    if not nonzero.all():
        # Fallback denominator for zero targets: the mean of all targets.
        res[~nonzero] = predicted[~nonzero] / np.mean(actual)
    return res

###Armamos el Pipeline

In [174]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler

#####Regresión Lineal

In [223]:
def build_pipeline(regressor):
    """Return the notebook's feature pipeline ending in ``regressor``.

    Each feature transformer is followed by its own ``DictVectorizer``; the
    resulting blocks are concatenated with ``make_union``, standardised with
    ``StandardScaler`` and handed to ``regressor``. A fresh, unfitted set of
    transformers is created on every call, so the helper can be reused for
    any estimator without repeating the feature block.
    """
    feature_extractors = (
        YearsAgo, GenreDummies, CountryDummies,
        Runtime, Language, BelongsToCollection,
    )
    branches = [
        make_pipeline(extractor(), DictVectorizer(sparse=False))
        for extractor in feature_extractors
    ]
    return make_pipeline(make_union(*branches), StandardScaler(), regressor)


# Baseline: ordinary least squares on the standardised features.
pipe = build_pipeline(LinearRegression())

pipe.fit(train_docs, y_train);
lin_reg = test_pipe(pipe)
lin_reg

{'test_mape': 10.66, 'train_mape': 8.99}

#####Ridge Regularization

In [184]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths for Ridge.
alphas = [0.01, 0.1, 1, 10, 100, 500, 1000]
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for alpha in alphas:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    pipe = make_pipeline(make_union(*branches), StandardScaler(), Ridge(alpha=alpha))
    pipe.fit(train_docs, y_train)
    print('alpha:', alpha)
    print(test_pipe(pipe))

alpha: 0.01
{'train_r2': 0.4, 'train_mape': 8.95, 'test_r2': 0.4, 'test_mape': 10.67}
alpha: 0.1
{'train_r2': 0.4, 'train_mape': 8.95, 'test_r2': 0.4, 'test_mape': 10.67}
alpha: 1
{'train_r2': 0.4, 'train_mape': 8.95, 'test_r2': 0.4, 'test_mape': 10.67}
alpha: 10
{'train_r2': 0.4, 'train_mape': 8.95, 'test_r2': 0.4, 'test_mape': 10.67}
alpha: 100
{'train_r2': 0.4, 'train_mape': 8.96, 'test_r2': 0.4, 'test_mape': 10.67}
alpha: 500
{'train_r2': 0.4, 'train_mape': 9.03, 'test_r2': 0.4, 'test_mape': 10.71}
alpha: 1000
{'train_r2': 0.39, 'train_mape': 9.13, 'test_r2': 0.4, 'test_mape': 10.82}


No hay mucho efecto de regularizarlo con Ridge en el conjunto de test

#####Red Neuronal

In [179]:
from sklearn.neural_network import MLPRegressor

In [196]:
# Neural network with the 'lbfgs' solver and otherwise default settings.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
regressor = MLPRegressor(random_state=21, solver='lbfgs')
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)

pipe.fit(train_docs, y_train)
test_pipe(pipe)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


{'test_mape': 13.76, 'test_r2': -0.23, 'train_mape': 5.58, 'train_r2': 0.71}

###### Tuneo de hiperparámetros:
Con solver *lbfgs*:
1. Busco óptimo nivel de alpha para regularizar.
2. Busco óptima cantidad de iteraciones (max_iter).

Hiperparámetro alpha:

In [None]:
# Candidate L2 penalties for the 'lbfgs' network.
alphas = (168, 169, 170, 171, 172)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for alpha in alphas:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    # NOTE(review): scikit-learn documents early_stopping as effective only for
    # the 'sgd' and 'adam' solvers - confirm whether it is intended here.
    regressor = MLPRegressor(alpha=alpha, random_state=21, max_iter=200,
                             early_stopping=True, solver='lbfgs')
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('alpha:', alpha)
    print(test_pipe(pipe))

Hiperparámetro max_iter:

In [None]:
# Candidate iteration budgets for the 'lbfgs' network.
iters = (100, 200, 300, 400, 500)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
# The loop variable is deliberately not called `iter`: that name would replace
# the built-in iter() for every cell executed afterwards.
for n_iter in iters:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    # NOTE(review): scikit-learn documents early_stopping as effective only for
    # the 'sgd' and 'adam' solvers; the argument is kept unchanged here.
    regressor = MLPRegressor(alpha=170, random_state=21, max_iter=n_iter,
                             early_stopping=True, solver='lbfgs')
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('iters:', n_iter)
    print(test_pipe(pipe))

Mejor resultado con alpha = 170 y max_iter = 300

In [224]:
# 'lbfgs' network with the selected alpha and iteration budget.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
# NOTE(review): scikit-learn documents early_stopping as effective only for
# the 'sgd' and 'adam' solvers - confirm whether it is intended here.
regressor = MLPRegressor(alpha=170, random_state=21, max_iter=300,
                         early_stopping=True, solver='lbfgs')
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
pipe.fit(train_docs, y_train)
red_neu_lbfgs = test_pipe(pipe)
print(red_neu_lbfgs)

{'train_mape': 7.92, 'test_mape': 10.27}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


###### Tuneo de hiperparámetros:
Con solver *sgd*:
1. Busco óptimo nivel de alpha para regularizar.
2. Busco óptimo nivel de learning rate.

In [197]:
# Neural network with the 'sgd' solver and otherwise default settings.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
regressor = MLPRegressor(random_state=21, solver='sgd')
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)

pipe.fit(train_docs, y_train)
test_pipe(pipe)



{'test_mape': 10.61, 'test_r2': 0.37, 'train_mape': 7.33, 'train_r2': 0.56}

Hiperparámetro alpha:

In [None]:
# Candidate L2 penalties for the 'sgd' network.
alphas = (14, 15, 16, 17, 18, 19, 20, 21, 22, 25)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for alpha in alphas:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    regressor = MLPRegressor(alpha=alpha, random_state=21,
                             early_stopping=True, solver='sgd')
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('alpha:', alpha)
    print(test_pipe(pipe))

Hiperparámetro learning rate:

In [203]:
# Coarse sweep of the initial learning rate for the 'sgd' network.
lrates = (0.00001, 0.0001, 0.001)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for lr in lrates:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    regressor = MLPRegressor(alpha=19, random_state=21,
                             early_stopping=True, solver='sgd',
                             learning_rate_init=lr,
                             max_iter=2000)
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('Learning rate:', lr)
    print(test_pipe(pipe))



Learning rate: 1e-05
{'train_r2': 0.35, 'train_mape': 8.7, 'test_r2': 0.4, 'test_mape': 10.62}
Learning rate: 0.0001
{'train_r2': 0.44, 'train_mape': 8.58, 'test_r2': 0.41, 'test_mape': 10.51}
Learning rate: 0.001
{'train_r2': 0.44, 'train_mape': 8.57, 'test_r2': 0.41, 'test_mape': 10.49}


In [204]:
# Finer sweep of the initial learning rate for the 'sgd' network.
lrates = (0.0005, 0.0007, 0.001, 0.002, 0.003, 0.004, 0.005)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for lr in lrates:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    regressor = MLPRegressor(alpha=19, random_state=21,
                             early_stopping=True, solver='sgd',
                             learning_rate_init=lr,
                             max_iter=2000)
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('Learning rate:', lr)
    print(test_pipe(pipe))

Learning rate: 0.0005
{'train_r2': 0.44, 'train_mape': 8.57, 'test_r2': 0.41, 'test_mape': 10.51}
Learning rate: 0.0007
{'train_r2': 0.44, 'train_mape': 8.58, 'test_r2': 0.41, 'test_mape': 10.5}
Learning rate: 0.001
{'train_r2': 0.44, 'train_mape': 8.57, 'test_r2': 0.41, 'test_mape': 10.49}
Learning rate: 0.002
{'train_r2': 0.44, 'train_mape': 8.62, 'test_r2': 0.4, 'test_mape': 10.64}
Learning rate: 0.003
{'train_r2': 0.43, 'train_mape': 8.72, 'test_r2': 0.39, 'test_mape': 10.78}
Learning rate: 0.004
{'train_r2': 0.42, 'train_mape': 8.79, 'test_r2': 0.4, 'test_mape': 10.62}
Learning rate: 0.005
{'train_r2': 0.42, 'train_mape': 8.74, 'test_r2': 0.4, 'test_mape': 10.63}


Mejor resultado con alpha = 19 y learning rate = 0.001

In [225]:
# 'sgd' network with the selected alpha and learning rate.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
regressor = MLPRegressor(alpha=19, random_state=21, max_iter=500,
                         early_stopping=True, solver='sgd',
                         learning_rate_init=0.001)
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
pipe.fit(train_docs, y_train)
red_neu_sgd = test_pipe(pipe)
print(red_neu_sgd)

{'train_mape': 8.57, 'test_mape': 10.49}


###### Tuneo de hiperparámetros:
Con solver *Adam*:
1. Busco óptimo nivel de alpha para regularizar.
2. Busco óptimo nivel de learning rate.

Hiperparámetro alpha:

In [None]:
# Candidate L2 penalties for the 'adam' network.
alphas = (16, 17, 18, 19, 20, 21, 22, 23, 24)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for alpha in alphas:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    regressor = MLPRegressor(alpha=alpha, random_state=21,
                             early_stopping=True, solver='adam')
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('alpha:', alpha)
    print(test_pipe(pipe))

Hiperparámetro learning rate:

In [213]:
# Sweep of the initial learning rate for the 'adam' network.
lrates = (0.0005, 0.0008, 0.001, 0.0015, 0.002, 0.003, 0.004)
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
for lr in lrates:
    # Fresh, unfitted feature branches for every candidate.
    branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
                for extractor in extractors]
    regressor = MLPRegressor(alpha=23, random_state=21,
                             early_stopping=True, solver='adam',
                             learning_rate_init=lr,
                             max_iter=500)
    pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
    pipe.fit(train_docs, y_train)
    print('Learning rate:', lr)
    print(test_pipe(pipe))

Learning rate: 0.0005
{'train_mape': 8.54, 'test_mape': 10.54}
Learning rate: 0.0008
{'train_mape': 8.57, 'test_mape': 10.56}
Learning rate: 0.001
{'train_mape': 8.62, 'test_mape': 10.44}
Learning rate: 0.0015
{'train_mape': 8.75, 'test_mape': 10.57}
Learning rate: 0.002
{'train_mape': 8.97, 'test_mape': 10.67}
Learning rate: 0.003
{'train_mape': 9.11, 'test_mape': 10.27}
Learning rate: 0.004
{'train_mape': 9.11, 'test_mape': 10.9}


Mejor resultado con alpha = 23 y learning rate = 0.001

In [226]:
# 'adam' network with the selected alpha and learning rate.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
regressor = MLPRegressor(alpha=23, random_state=21,
                         early_stopping=True, solver='adam',
                         learning_rate_init=0.001)
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
pipe.fit(train_docs, y_train)
red_neu_adam = test_pipe(pipe)
print(red_neu_adam)

{'train_mape': 8.62, 'test_mape': 10.44}


####Winner Model

In [227]:
# Display the four score dictionaries side by side.
lin_reg, red_neu_adam, red_neu_lbfgs, red_neu_sgd

({'test_mape': 10.66, 'train_mape': 8.99},
 {'test_mape': 10.44, 'train_mape': 8.62},
 {'test_mape': 10.27, 'train_mape': 7.92},
 {'test_mape': 10.49, 'train_mape': 8.57})

In [229]:
# Gap between test and train MAPE of the linear regression; the results table
# below stores this difference in its 'overfitting' column.
lin_reg['test_mape']-lin_reg['train_mape']

1.67

In [234]:
# Pair every display name with its score dictionary, then derive the columns.
scored_models = [
    ('linear regression', lin_reg),
    ('red neuronal lbfgs', red_neu_lbfgs),
    ('red neuronal sgd', red_neu_sgd),
    ('red neuronal adam', red_neu_adam),
]
models = {
    'models': [label for label, scores in scored_models],
    'test mape': [scores['test_mape'] for label, scores in scored_models],
    # Overfitting is measured as test MAPE minus train MAPE.
    'overfitting': [scores['test_mape'] - scores['train_mape']
                    for label, scores in scored_models],
}

results = pd.DataFrame(models)
results

Unnamed: 0,models,test mape,overfitting
0,linear regression,10.66,1.67
1,red neuronal lbfgs,10.27,2.35
2,red neuronal sgd,10.49,1.92
3,red neuronal adam,10.44,1.82


In [235]:
# Rank the models by test MAPE, lowest first.
results.sort_values(by = 'test mape', ascending=True)

Unnamed: 0,models,test mape,overfitting
1,red neuronal lbfgs,10.27,2.35
3,red neuronal adam,10.44,1.82
2,red neuronal sgd,10.49,1.92
0,linear regression,10.66,1.67


In [237]:
# Rank the models by the test-minus-train MAPE gap, smallest first.
results.sort_values(by = 'overfitting', ascending=True)

Unnamed: 0,models,test mape,overfitting
0,linear regression,10.66,1.67
3,red neuronal adam,10.44,1.82
2,red neuronal sgd,10.49,1.92
1,red neuronal lbfgs,10.27,2.35


Elijo la Red Neuronal adam dado que muestra el mejor balance entre el test mape y el nivel de overfitting.

In [238]:
# Refit the chosen model: the 'adam' network with alpha=23 and learning rate 0.001.
extractors = (YearsAgo, GenreDummies, CountryDummies,
              Runtime, Language, BelongsToCollection)
branches = [make_pipeline(extractor(), DictVectorizer(sparse=False))
            for extractor in extractors]
regressor = MLPRegressor(alpha=23, random_state=21,
                         early_stopping=True, solver='adam',
                         learning_rate_init=0.001)
pipe = make_pipeline(make_union(*branches), StandardScaler(), regressor)
pipe.fit(train_docs, y_train)
red_neu_adam = test_pipe(pipe)
print(red_neu_adam)

{'train_mape': 8.62, 'test_mape': 10.44}
