## Modelo de predicción del revenue de punta a punta

Abrimos el CSV e importamos algunas librerías.

In [1]:
# Upload the dataset through the Colab `files` helper.
from google.colab import files
# The result is indexed by file name further down and wrapped in io.BytesIO,
# so it is used as a mapping of file name -> raw file content.
uploaded = files.upload()

Saving movies_metadata.csv to movies_metadata.csv


In [2]:
import pandas as pd 
import numpy as np
import io

In [3]:
# Parse the uploaded bytes as CSV.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the truncated warning in this cell's saved output looks like the
# mixed-dtype DtypeWarning that chunked inference emits - confirm it is gone.
df = pd.read_csv(io.BytesIO(uploaded['movies_metadata.csv']), low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


###Pre-processing

####Revenue:
1. Quitamos valores menores o iguales a 0.
2. Filtramos el 5% de los valores más bajos para evitar errores de tipeo.
3. Normalizamos aplicando el logaritmo.

In [4]:
# Step 1: discard rows whose revenue is zero or negative.
has_revenue = df[df['revenue'] > 0]
# Step 2: discard rows at or below the 5th percentile of the remaining revenues.
revenue_floor = has_revenue['revenue'].quantile(0.05)
df = has_revenue[has_revenue['revenue'] > revenue_floor]

import math
def revenue_log(x):
    """Return the natural logarithm of ``x``.

    Delegates to ``math.log``, so a zero or negative ``x`` raises ``ValueError``.
    """
    log_value = math.log(x)
    return log_value

# Log-transform the target column.
# assign() builds a new frame instead of writing into the frame produced by the
# boolean filters above, and revenue_log is passed directly (no lambda wrapper).
df = df.assign(revenue=df['revenue'].apply(revenue_log))

####Runtime:
1. Sacamos los valores nulos (los reemplazamos por 0 y descartamos las filas con runtime igual a 0).
2. Sacamos los valores que estén a 2 o más desviaciones estándar de la media, por encima o por debajo.

In [5]:
# Replace NaN with 0 so that missing runtimes are dropped together with
# runtimes that are literally 0.
# NOTE: this fills NaN in *every* column of the frame, not only in `runtime`.
# Rebinding `df` (instead of inplace=True) avoids mutating a filtered frame in place.
df = df.fillna(0)
df = df[df.runtime != 0]

# Compute mean and spread once; both limits are derived from the same values.
runtime_mean = np.mean(df.runtime)
runtime_spread = np.std(df.runtime) * 2
upper_lim = runtime_mean + runtime_spread
lower_lim = runtime_mean - runtime_spread

# Keep runtimes strictly inside (lower_lim, upper_lim).
df = df[(df.runtime > lower_lim) & (df.runtime < upper_lim)]

####Release Date:
1. Convertimos a tipo de dato datetime.
2. Nos quedamos solo con el año.
3. Descartamos todas aquellas películas de 1985 o anteriores (nos quedamos con las de año > 1985).

In [6]:
# Parse the release date and keep only its year.
release_dates = pd.to_datetime(df['release_date'])
# assign() returns a new frame, so nothing is written into the filtered frame.
df = df.assign(year=release_dates.dt.year)
# Strict comparison: films released in 1985 itself are dropped as well.
# The full date is no longer needed once the year has been extracted.
df = df[df.year > 1985].drop(columns=['release_date'])

#### Géneros, Países y Colecciones:
1. Convertimos los diccionarios a listas, quedándonos solo con el valor que nos interesa (name).
2. En el caso de colecciones, dado que hay más de 700 colecciones con frecuencias bajas, la convertimos en una variable binaria: 1 si es parte de una colección, 0 en caso contrario.

In [7]:
# Unpack the stringified lists of dicts into plain lists of names.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the CSV text.
import ast

def _names_from_literal(raw):
    """Parse a stringified list of dicts and return each dict's 'name' value."""
    return [item['name'] for item in ast.literal_eval(raw)]

# Genres
df['genres'] = df.genres.apply(_names_from_literal)

# Production countries
df['production_countries'] = df.production_countries.apply(_names_from_literal)

# Collections: use 0 as the "no collection" marker.
# Assigning the result back replaces the chained in-place fillna on a column
# selection, which is not guaranteed to update the frame itself.
df['belongs_to_collection'] = df['belongs_to_collection'].fillna(0)
import ast
def collection_from_dict(x):
    """Return 1 if ``x`` describes a collection, 0 if it is a missing-value marker.

    Parameters
    ----------
    x : str, int, float or None
        Either a string holding a Python literal (the collection description)
        or a missing-value marker: ``0``, ``None`` or NaN.

    Returns
    -------
    int
        ``0`` for a missing-value marker, ``1`` otherwise.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is not a missing-value marker and is not a valid Python literal.
    """
    # NaN is the only value that differs from itself.
    is_nan = isinstance(x, float) and x != x
    if x is None or is_nan or x == 0:
        return 0
    # Parse only to validate the text; the parsed value itself is not needed
    # because the feature is a plain membership flag.
    ast.literal_eval(x)
    return 1
# Replace each raw collection value with its 0/1 membership flag.
collection_flags = df['belongs_to_collection'].apply(collection_from_dict)
df['belongs_to_collection'] = collection_flags


###Feature Selection:
1. Elegimos qué variables descartar del modelo.

In [8]:
# Columns that are removed before modelling.
unused_columns = [
    'adult', 'budget', 'homepage', 'id', 'original_title',
    'overview', 'popularity', 'poster_path', 'production_companies',
    'spoken_languages', 'status', 'tagline', 'title', 'video',
    'vote_average', 'vote_count',
]
df = df.drop(columns=unused_columns)


###Train and test split:
1. Cortamos por año (2014) dado que queremos que el modelo aprenda a predecir películas del futuro.

In [9]:
# Temporal split: films up to and including SPLIT_YEAR train the model,
# later films are held out for testing.
SPLIT_YEAR = 2014
train_df = df.loc[df['year'] <= SPLIT_YEAR]
test_df = df.loc[df['year'] > SPLIT_YEAR]

# Displayed as: (train size, test size, test/train ratio).
n_train, n_test = len(train_df), len(test_df)
n_train, n_test, n_test / n_train

(5096, 647, 0.12696232339089483)

In [10]:
# Regression targets: the `revenue` column of each split as a NumPy array.
y_train, y_test = (split['revenue'].values for split in (train_df, test_df))

###Feature Engineering

Vamos a usar todo como listas de diccionarios para poder usar el ecosistema de sklearn de forma sencilla

In [11]:
# One plain dict per row (column name -> value) for each split.
train_docs = train_df.to_dict('records')
test_docs = test_df.to_dict('records')

In [12]:
train_docs[0], test_docs[0]

({'belongs_to_collection': 1,
  'genres': ['Animation', 'Comedy', 'Family'],
  'imdb_id': 'tt0114709',
  'original_language': 'en',
  'production_countries': ['United States of America'],
  'revenue': 19.7385732187406,
  'runtime': 81.0,
  'year': 1995},
 {'belongs_to_collection': 1,
  'genres': ['Action', 'Adventure', 'Thriller'],
  'imdb_id': 'tt2381249',
  'original_language': 'en',
  'production_countries': ['China', 'United States of America'],
  'revenue': 20.341024173461395,
  'runtime': 131.0,
  'year': 2015})

####Performance metrics:
1. Medimos el mean absolute percentage error (MAPE) y el R² para el train y test set. Ambas métricas se calculan sobre el logaritmo del revenue, que es el target del modelo.

In [13]:
from sklearn.metrics import r2_score

def test_pipe(pipe):
    """Score a fitted pipeline on the notebook's train and test splits.

    Reads the module-level ``train_docs``/``y_train`` and ``test_docs``/``y_test``.
    The errors are measured on whatever scale those targets are stored in.

    Parameters
    ----------
    pipe : fitted estimator
        Object exposing ``predict(docs)``.

    Returns
    -------
    dict
        ``train_r2``, ``train_mape``, ``test_r2`` and ``test_mape``, each rounded
        to two decimals. The MAPE values are expressed as percentages.
    """
    splits = (
        ('train', train_docs, y_train),
        ('test', test_docs, y_test),
    )
    metrics = {}
    for split_name, docs, targets in splits:
        y_true = np.asarray(targets)
        # Predict once per split and reuse the result for both metrics.
        y_pred = np.asarray(pipe.predict(docs))
        metrics[split_name + '_r2'] = round(r2_score(y_true, y_pred), 2)
        mape = np.mean(np.abs(percentage_error(y_true, y_pred))) * 100
        metrics[split_name + '_mape'] = round(mape, 2)
    return metrics

def percentage_error(actual, predicted):
    """Element-wise relative error of ``predicted`` with respect to ``actual``.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Same-shaped sequences of true and predicted values.

    Returns
    -------
    numpy.ndarray
        ``(actual - predicted) / actual`` wherever ``actual`` is non-zero.
        Where ``actual`` is zero, the relative error is undefined, so
        ``predicted / mean(actual)`` is returned instead.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    res = np.empty(actual.shape)
    nonzero = actual != 0
    res[nonzero] = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    if not nonzero.all():
        # The fallback denominator is computed once, and only when needed.
        res[~nonzero] = predicted[~nonzero] / np.mean(actual)
    return res

####Feature Géneros:
1. Hacemos variables dummies
2. La agregamos al pipeline
3. Fiteamos y medimos el performance

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class GenreDummies(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each record's genre list into flags.

    Every input record must have a ``'genres'`` entry holding an iterable of
    genre names. The output is one dict per record, mapping each of its genres
    to ``1``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` also works."""
        return self

    def transform(self, X):
        """Return a list with one ``{genre: 1}`` dict per record in ``X``."""
        return [{genre: 1 for genre in record['genres']} for record in X]

In [15]:
GenreDummies().transform(train_docs[:10])

[{'Animation': 1, 'Comedy': 1, 'Family': 1},
 {'Adventure': 1, 'Family': 1, 'Fantasy': 1},
 {'Comedy': 1, 'Drama': 1, 'Romance': 1},
 {'Comedy': 1},
 {'Action': 1, 'Adventure': 1, 'Thriller': 1},
 {'Action': 1, 'Adventure': 1, 'Thriller': 1},
 {'Comedy': 1, 'Drama': 1, 'Romance': 1},
 {'Adventure': 1, 'Animation': 1, 'Family': 1},
 {'Action': 1, 'Adventure': 1},
 {'Drama': 1, 'Romance': 1}]

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

Armamos el pipeline, lo fiteamos y lo medimos

In [17]:
# Baseline model: genre flags -> dense feature matrix -> linear regression.
genre_steps = [GenreDummies(), DictVectorizer(sparse=False), LinearRegression()]
pipe = make_pipeline(*genre_steps)

pipe.fit(train_docs, y_train);

In [18]:
pipe.predict(train_docs[:10])

array([17.44891585, 18.52694871, 15.96300799, 16.39435353, 17.82423906,
       17.82423906, 15.96300799, 18.15229955, 17.56251852, 15.86076097])

In [19]:
test_pipe(pipe)

{'test_mape': 12.84, 'test_r2': 0.2, 'train_mape': 10.96, 'train_r2': 0.17}

####Feature años:
1. Armamos una variable que indique hace cuántos años se estrenó la película.
2. La agregamos al pipeline, fiteamos y medimos el performance.

In [20]:
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin

class YearsAgo(BaseEstimator, TransformerMixin):
    """Transformer that maps each record's ``'year'`` to its distance from a reference year.

    Parameters
    ----------
    now : int, optional
        Reference year. When omitted, the current calendar year is taken at
        construction time, which is what the zero-argument constructor always
        did. Pass an explicit year to get the same feature values no matter
        when the notebook is run.
    """

    def __init__(self, now=None):
        # Resolve the default eagerly so ``self.now`` is always a concrete year.
        self.now = datetime.now().year if now is None else now

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` also works."""
        return self

    def transform(self, X):
        """Return one ``{'years_ago': now - year}`` dict per record in ``X``."""
        return [{'years_ago': self.now - int(record['year'])} for record in X]

In [21]:
YearsAgo().transform(train_docs[150:160])

[{'years_ago': 26},
 {'years_ago': 27},
 {'years_ago': 27},
 {'years_ago': 27},
 {'years_ago': 27},
 {'years_ago': 29},
 {'years_ago': 27},
 {'years_ago': 27},
 {'years_ago': 27},
 {'years_ago': 27}]

In [22]:
from sklearn.pipeline import make_union

# Each extractor is paired with its own DictVectorizer; the union of those
# sub-pipelines feeds the linear regression.
feature_blocks = [
    make_pipeline(extractor, DictVectorizer(sparse=False))
    for extractor in (YearsAgo(), GenreDummies())
]
pipe = make_pipeline(make_union(*feature_blocks), LinearRegression())

pipe.fit(train_docs, y_train);

In [23]:
test_pipe(pipe)

{'test_mape': 12.83, 'test_r2': 0.2, 'train_mape': 10.95, 'train_r2': 0.17}

####Feature países:
1. Armamos variables dummies.
2. La agregamos al pipeline, fiteamos y medimos el performance.

In [24]:
class CountryDummies(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each record's production countries into flags.

    Every input record must have a ``'production_countries'`` entry holding an
    iterable of country names. The output is one dict per record, mapping each
    of its countries to ``1``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` also works."""
        return self

    def transform(self, X):
        """Return a list with one ``{country: 1}`` dict per record in ``X``."""
        return [
            {country: 1 for country in record['production_countries']}
            for record in X
        ]

In [25]:
CountryDummies().transform(train_docs[:10])

[{'United States of America': 1},
 {'United States of America': 1},
 {'United States of America': 1},
 {'United States of America': 1},
 {'United States of America': 1},
 {'United Kingdom': 1, 'United States of America': 1},
 {'United States of America': 1},
 {'United States of America': 1},
 {'France': 1, 'Germany': 1, 'Italy': 1, 'United States of America': 1},
 {'United Kingdom': 1, 'United States of America': 1}]

In [26]:
from sklearn.pipeline import make_union

# Same layout as before, with the production-country flags added as a third block.
extractors = (YearsAgo(), GenreDummies(), CountryDummies())
vectorized_blocks = [
    make_pipeline(extractor, DictVectorizer(sparse=False))
    for extractor in extractors
]
features = make_union(*vectorized_blocks)
pipe = make_pipeline(features, LinearRegression())

pipe.fit(train_docs, y_train);

In [27]:
test_pipe(pipe)

{'test_mape': 11.87, 'test_r2': 0.29, 'train_mape': 10.15, 'train_r2': 0.26}

####Feature runtime:
1. Calculamos el runtime máximo.
2. Normalizamos la variable dividiendola por el runtime máximo.
3. La añadimos al pipeline, lo fiteamos y medimos el performance.

In [28]:
# Largest runtime in `df`; used as the divisor that rescales runtimes below.
# NOTE(review): `df` is the frame that both train_df and test_df were sliced from,
# so this maximum is taken over train and test rows together - confirm that is intended.
max_runtime = df.runtime.max()

class Runtime(BaseEstimator, TransformerMixin):
    """Stateless transformer that rescales each record's ``'runtime'``.

    The divisor is the module-level ``max_runtime``, which must be defined
    before ``transform`` is called.
    """

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` also works."""
        return self

    def transform(self, X):
        """Return one ``{'runtime': runtime / max_runtime}`` dict per record in ``X``."""
        return [{'runtime': record['runtime'] / max_runtime} for record in X]



In [29]:
Runtime().transform(train_docs[:5])

[{'runtime': 0.54},
 {'runtime': 0.6933333333333334},
 {'runtime': 0.8466666666666667},
 {'runtime': 0.7066666666666667},
 {'runtime': 0.7066666666666667}]

##### Regresión Lineal

In [30]:
# Linear regression on all four feature blocks: years ago, genres, countries, runtime.
all_extractors = [YearsAgo(), GenreDummies(), CountryDummies(), Runtime()]
feature_union = make_union(*(
    make_pipeline(extractor, DictVectorizer(sparse=False))
    for extractor in all_extractors
))
pipe = make_pipeline(feature_union, LinearRegression())

pipe.fit(train_docs, y_train);

In [31]:
test_pipe(pipe)

{'test_mape': 11.05, 'test_r2': 0.37, 'train_mape': 9.33, 'train_r2': 0.36}

#####Red Neuronal

In [32]:
from sklearn.neural_network import MLPRegressor

# Same four feature blocks as the linear model, with an MLP regressor on top.
def _vectorized(extractor):
    """Pair a dict-producing extractor with its own dense DictVectorizer."""
    return make_pipeline(extractor, DictVectorizer(sparse=False))

mlp_features = make_union(
    _vectorized(YearsAgo()),
    _vectorized(GenreDummies()),
    _vectorized(CountryDummies()),
    _vectorized(Runtime()),
)
# random_state is fixed so the fitted network is reproducible.
mlp_model = MLPRegressor(random_state=21, max_iter=500)
pipe = make_pipeline(mlp_features, mlp_model)

pipe.fit(train_docs, y_train);

In [33]:
test_pipe(pipe)

{'test_mape': 11.12, 'test_r2': 0.33, 'train_mape': 8.73, 'train_r2': 0.42}