# Proyecto ML

## Wikipedia Movie Plots

Importa las librarías que se usaran

In [0]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.warnings.filterwarnings('ignore')

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.compat.v1 as tfv1
import tensorflow_hub as hub

tf.logging.set_verbosity(tf.logging.ERROR)

from tqdm import tqdm_notebook as tqdm

Carga los datos (Wikipedia Movie Plots https://www.kaggle.com/jrobischon/wikipedia-movie-plots, Movie Lens: https://grouplens.org/datasets/movielens/)

In [0]:
wiki_plots = pd.read_csv("wiki_movie_plots_deduped.csv")
wiki_plots.rename(columns={'Origin/Ethnicity':'Origin'}, inplace=True) # para mayor comodidad
wiki_plots = wiki_plots.drop_duplicates(subset='Plot', keep='first')
wiki_plots.shape

(33869, 8)

### Descripción de las columnas

La base de datos contiene descripciones para 34,886 películas de todo el mundo. las columnas son las siguientes:

- Release Year - Año en que se estreno la película
- Title - Titulo de la película
- Origin/Ethnicity - Origen de la pelicula (e.g. America, Bollywood, Tamil, etc)
- Director - Director(s)
- Cast - Actores y actrices principales
- Genre - Genero(s) de la película
- Wiki Page - URL de la pagina de Wikipedia de la cual la descripción de la trama fue tomada
- Plot - descripción larga de la trama de la película (ADVERTENCIA: puede contener espoilers)

In [0]:
wiki_plots.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33869 entries, 0 to 34885
Data columns (total 10 columns):
Release Year     33869 non-null int64
Title            33869 non-null object
Origin           33869 non-null object
Director         33869 non-null object
Cast             32554 non-null object
Genre            33869 non-null object
Wiki Page        33869 non-null object
Plot             33869 non-null object
FormattedPlot    33869 non-null object
EmbeddingPlot    33869 non-null object
dtypes: int64(1), object(9)
memory usage: 2.8+ MB


Limpieza de los textos

In [0]:
def clean_text(text):
    """
    This function is taken from:
    https://www.kaggle.com/aminejallouli/genre-classification-based-on-wiki-movies-plots
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", "", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\[\d+\]", "", text)
    text = re.sub('[%s\d]' % re.escape("""!"#$%&'()*+,-.:;<=>?@[\]^`{|}~"""), "", text)
    text = re.sub(r"[-\n]", " ", text)
    text = text.strip(' ')
    return text

In [0]:
wiki_plots['FormattedPlot'] = wiki_plots['Plot'].apply(clean_text)

## Codificación de las tramas en vectores densos

Aqui realizaremos una pequeña prueba para ver el poder de la los vectores codificados, buscando películas similares usando la distancia coseno

Función para codificar las tramas en vectores densos de 512

In [0]:
def get_embeddings(tokenized_sents):
    embed_list = []
    with tfv1.Graph().as_default():
        print("Downloading the model")
        embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
        print("Processing plots")
        messages = tfv1.placeholder(dtype=tf.string, shape=[None])
        embed_list = []
        embedding = embed(messages)
        embedding_mean = tfv1.math.reduce_mean(embedding, axis=0)
        normalized_embedding = tfv1.nn.l2_normalize(embedding_mean, 0)
        with tfv1.Session() as session:
            session.run([tfv1.global_variables_initializer(), tfv1.tables_initializer()])
            for sent in tqdm(tokenized_sents):
                output = session.run(embedding_mean, feed_dict={messages: sent})
                # this is slow
                # encoded_plot = session.run(tfv1.math.reduce_mean(output, axis=0))
                embed_list.append(output.reshape(-1,1))
    return embed_list

In [0]:
print("Tokenizig sentences in the plots")
tokenized_sents = []
for movie_plot in tqdm(wiki_plots['FormattedPlot']):
    sent = sent_tokenize(movie_plot)
    tokenized_sents.append(sent)

embed_list = get_embeddings(tokenized_sents)

Tokenizig sentences in the plots


HBox(children=(IntProgress(value=0, max=33869), HTML(value='')))


Downloading the model
Processing plots


HBox(children=(IntProgress(value=0, max=33869), HTML(value='')))




In [0]:
wiki_plots['EmbeddingPlot'] = embed_list

Busquedad de peliculas similares usando la distancia coseno

In [0]:
def get_similar_movies(embedding_plot, embeddings, titles, min_similarity=0.5):
    similar_movies = []
    embeddings = np.array(embeddings.tolist()).squeeze()
    with tfv1.Session() as session:
        vec1 = tfv1.placeholder(dtype=tf.float32, shape=(None, 512), name="vec1")
        vec2 = tfv1.placeholder(dtype=tf.float32, shape=(512, 1), name="vec2")

        cos_similarity = tf.matmul(vec1, vec2)

        similarities = session.run(cos_similarity, feed_dict={vec1: embeddings, vec2: embedding_plot})
        similarities = [x for x in zip(similarities, titles) if x[0] > min_similarity]
        similarities.sort(key=lambda x: x[0], reverse=True)
        similarities.pop(0)
        similar_movies = [title for (_, title) in similarities]
    return similar_movies

Busqueda de peliculas similares a "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe"

In [0]:
embed_plot = wiki_plots.query('Title == "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe"').iloc[0]['EmbeddingPlot']
print(get_similar_movies(embed_plot, wiki_plots['EmbeddingPlot'].values, wiki_plots['Title'], min_similarity=0.8))

['Troll', 'Jack the Giant Killer', 'The Witches', 'Hansel & Gretel: Witch Hunters', "Happily N'Ever After", 'Inkheart', 'Snow White & the Huntsman', 'The Devil Rides Out', 'The Raven', 'Hocus Pocus', 'The Wizard of Oz', 'A Simple Wish', 'Mirror Mirror', 'The Chronicles of Narnia: Prince Caspian', 'Just Visiting', 'Howling II', 'The Spiderwick Chronicles', 'Tom Thumb', 'The Chronicles of Narnia: The Voyage of the Dawn Treader', 'The Wicker Man', 'The Swan Princess', 'The Maze', 'Cry of the Banshee', 'Pan', 'The Brothers Grimm', 'Peter Pan', 'Snow White and the Three Stooges', 'Leprechaun: Origins', 'The City of the Dead', 'The Vampire Lovers', "Mary and the Witch's Flower", 'The Haunted Palace', 'Black Cauldron, The', 'Happily Ever After', 'Dracula: Prince of Darkness', 'Lesbian Vampire Killers', 'Sleeping Beauty', 'Snow White: A Tale of Terror', ' Ella Enchanted', 'The Swan Princess Christmas']


## Sistema de recomendación

despues de la prueba con las tramas codificadas es hora de un sistema mas sofisticado, para ello utilizaremos otro dataset, el cual contiene datos de cerca de 150 mil usuarios que han calificado mas de 25 mil películas

In [0]:
movie_lens = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
movie_lens = movie_lens.drop_duplicates(subset='title', keep='first')

In [0]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


removemos el año del titulo de cada película

In [0]:
clear_title = lambda x: re.sub(r' \(\d{4}\)', '', x)
movie_lens['title'] = movie_lens['title'].apply(clear_title)
movie_lens.rename(columns={'title':'Title'}, inplace=True)
movie_lens.head(10)

Unnamed: 0,movieId,Title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
5,6,Heat,Action|Crime|Thriller
6,7,Sabrina,Comedy|Romance
7,8,Tom and Huck,Adventure|Children
8,9,Sudden Death,Action
9,10,GoldenEye,Action|Adventure|Thriller


Filtramos solo las películas que se encuentran en ambos datasets y concatenamos los embeddings a moviel_lens

In [0]:
movie_lens_reduced = movie_lens.merge(wiki_plots[['Title', 'EmbeddingPlot']], on='Title')
movie_lens_reduced = movie_lens_reduced.drop_duplicates(subset='Title', keep='first')

En esta parte hacemos reset a los indices de las películas, ya que despues de filtrar puede que hayan sido removidos algunos indices

In [0]:
remap_movies = {movie_id: index for movie_id, index in 
                zip(
                    movie_lens_reduced['movieId'], 
                    range(len(movie_lens_reduced['movieId']))
                    )
                }

movie_lens_reduced['movieId'] = movie_lens_reduced['movieId'].apply(lambda id: remap_movies[id])
movie_lens_reduced

Unnamed: 0,movieId,Title,genres,EmbeddingPlot
0,0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[[0.007755052], [0.0117758075], [-0.04774074],..."
1,1,Jumanji,Adventure|Children|Fantasy,"[[-0.056118418], [0.041826643], [-0.027685331]..."
2,2,Grumpier Old Men,Comedy|Romance,"[[-0.031014366], [0.027537223], [-0.030267373]..."
3,3,Waiting to Exhale,Comedy|Drama|Romance,"[[-0.002458289], [0.022208316], [-0.0334218], ..."
4,4,Father of the Bride Part II,Comedy,"[[-0.043547787], [0.022245584], [-0.008738409]..."
...,...,...,...,...
19429,13839,Kaithi,Action|Thriller,"[[-0.05050796], [-0.045087587], [-0.03906933],..."
19430,13840,Hockey Night,Drama,"[[-0.062278718], [0.014847355], [-0.074892215]..."
19431,13841,Hurricane Smith,Action|Drama,"[[-0.03429976], [0.046135724], [-0.03773823], ..."
19432,13842,The Specials,Comedy,"[[-0.03073092], [0.056634538], [-0.05494778], ..."


Como modificamos los indices a al data frame de las películas tenemos que re mapear los indices viejos a los nuevos

In [0]:
def check_id(id):
    return id in remap_movies

ratings['valid'] = ratings['movieId'].apply(check_id)
ratings = ratings.loc[ratings['valid'] == True]
ratings['movieId'] = ratings['movieId'].apply(lambda id: remap_movies[id])

Filtramos a los usuarios que hayan hecho por lo menos hayan interactuado con 9 películas

In [0]:
users = ratings['userId'].value_counts()
users = users[users > 8]
ratings = ratings.loc[np.isin(ratings['userId'].values, users.index)]
len(ratings)

15303798

Ahora nos aseguramos de que los indices de los usuarios no tengan brincos, es decir que si hay n indices distintos los indices vayan del 0 al n sin faltar ningun numero

In [0]:
remap_users = {user_id: index for user_id, index in
               zip(
                   users.index, 
                   range(len(users.index))
                   )
               }

ratings['userId'] = ratings['userId'].apply(lambda id: remap_users[id])

In [0]:
embed_list = np.array(movie_lens_reduced['EmbeddingPlot'].tolist())
embed_list = embed_list.squeeze()
embed_list.shape

(13844, 512)

Partimos el dataset en training y test con proporción de 80/20 y luego los dividimos en batches

In [0]:
batch_size = 16384
r_train, r_test = train_test_split(ratings, train_size=0.8)
print(r_train.shape)
r_train_batches = np.array_split(r_train, len(r_train) // batch_size, axis=0)
r_test_batches = np.array_split(r_test, len(r_test) // batch_size, axis=0)

(12243038, 5)


Definimos nuestro modelo

In [0]:
def prepare_model(num_users, num_items, embedding_size, name):
    model = {}
    model['name'] = name
    with tf.name_scope(name):
        model['U'] = tf.Variable(
            tf.random_normal([num_users, embedding_size]), 
            dtype=tf.float32, 
            name="weights"
        )
        model['V'] = tfv1.placeholder(shape=(num_items, embedding_size), dtype=tf.float32, name="embeddings")
        model['u_ids'] = tfv1.placeholder(shape=(None), dtype=tf.int32, name="u_ids")
        model['i_ids'] = tfv1.placeholder(shape=(None), dtype=tf.int32, name="i_ids")
        model['ratings'] = tfv1.placeholder(shape=(None), dtype=tf.float32, name="ratings")
        model['saver'] = tfv1.train.Saver([model['U']])
        model['saved'] = False
    return model

In [0]:
model = prepare_model(len(users), len(embed_list), 512, "model")

In [0]:
def feed(model):
    gathered_u = tf.gather(model['U'], model['u_ids'])
    gathered_i = tf.gather(model['V'], model['i_ids'])
    return tf.reduce_sum(tf.multiply(gathered_u, gathered_i), axis=1)

def loss(model, reg_weight):
    pred = feed(model)
    regularizer = tf.nn.l2_loss(model['U'])
    regularizer = tf.multiply(reg_weight, regularizer)
    return tf.add(tf.losses.mean_squared_error(model['ratings'], pred), regularizer)2

def optimizer(loss, learning_rate):
    return tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

def train(model, train_data, embed_list, learning_rate, epochs, reg_weight):
    train_loss = loss(model, reg_weight)
    opt_step = optimizer(train_loss, learning_rate)
    error = 0
    if not os.path.isdir(model['name']):
        os.mkdir(model['name'])
    with tf.Session() as session:
        session.run(tfv1.global_variables_initializer())
        if model['saved']:
            print('Restoring weights')
            model['saver'].restore(session, model['name']+'/checkpoint')
        print('Starting training')
        for epoch in range(epochs):
            error = 0
            print(f'Training epoch {epoch+1}')
            for batch in tqdm(train_data):
                batch_error, _ = session.run([train_loss, opt_step], feed_dict={
                    model['V']: embed_list,
                    model['u_ids']: batch['userId'].values,
                    model['i_ids']: batch['movieId'].values,
                    model['ratings']: batch['rating'].values
                })
                error += batch_error
            print(f'Epoch {epoch+1} loss: {error/len(train_data)}')
            model['saver'].save(session, model['name']+'/checkpoint', global_step=epoch)
            if not model['saved']:
                model['saved'] = True
    print(f'Final loss: {error/len(train_data)}')

Ajustamos el modelo a los datos

In [0]:
train(model, r_train_batches, embed_list, 0.001, 8, 0.000005)
#len(r_train_batches)

Starting training
Training epoch 1


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 1 loss: 146.8550079121009
Training epoch 2


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 2 loss: 61.202769278203306
Training epoch 3


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 3 loss: 25.61131557786321
Training epoch 4


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 4 loss: 11.895765902048135
Training epoch 5


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 5 loss: 7.152279732536918
Training epoch 6


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 6 loss: 5.697089185038085
Training epoch 7


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 7 loss: 5.30249046417604
Training epoch 8


HBox(children=(IntProgress(value=0, max=747), HTML(value='')))


Epoch 8 loss: 5.207425282821758
Final loss: 5.207425282821758


Evaluamos el modelo en el test set

In [0]:
test_loss = loss(model, 0.000005)
error = 0
with tfv1.Session() as session:
    session.run([tfv1.global_variables_initializer(), tfv1.local_variables_initializer()])
    model['saver'].restore(session, tf.train.latest_checkpoint(model['name']))
    for batch in r_test_batches:
        error += session.run(test_loss, feed_dict={
            model['V']: embed_list,
            model['u_ids']: batch['userId'].values,
            model['i_ids']: batch['movieId'].values,
            model['ratings']: batch['rating'].values
        })
    print(f'Test loss: {error/len(r_test_batches)}')

Test loss: 5.250031596870833


Seleccionamos 25 ratings aleatorios y tratamos de estimar el valor del rating dado el id del usuario y del la película

In [0]:
def predict(model, user_ids, movie_ids, embed_list):
    out = feed(model)
    with tfv1.Session() as session:
        session.run([tfv1.global_variables_initializer(), tfv1.local_variables_initializer()])
        model['saver'].restore(session, tf.train.latest_checkpoint(model['name']))
        pred = session.run(out, feed_dict={
            model['V']: embed_list,
            model['u_ids']: user_ids,
            model['i_ids']: movie_ids,
        })
    return pred

In [0]:
indices = random.choices(list(r_test.index), k=25)
batch = r_test.loc[indices]
pred = predict(model, batch['userId'].values, batch['movieId'].values, embed_list)
result = pd.DataFrame(zip(batch['movieId'], batch['rating'], pred), columns=['movieId', 'Real', 'Predicción'])
result = result.merge(movie_lens_reduced[['movieId', 'Title']], on='movieId')
result
#print(r_test.loc[i, ['userId', 'movieId', 'rating']])

Unnamed: 0,movieId,Real,Predicción,Title
0,9274,4.5,3.699524,Run All Night
1,1032,1.5,2.195683,Saving Private Ryan
2,400,4.0,1.683317,Dr. Strangelove or: How I Learned to Stop Worr...
3,543,4.0,1.487666,Die Hard
4,2208,2.5,1.292135,Shrek
5,683,4.0,3.703663,Shine
6,946,2.0,2.336115,Six Days Seven Nights
7,579,4.0,3.052016,Top Gun
8,1534,5.0,3.368618,Time Bandits
9,1307,4.0,2.062055,Office Space


Seleccionamos un usuario aleatorio y tratamos de recomendarle las peliculas con un mayor rating predicho en base a las otras películas que el evaluo

In [0]:
def recomend(model, user_id, movies, embed_list, ratings, k=10):
    rated = ratings.loc[ratings['userId'] == user_id]['movieId'].values
    not_rated = movies.loc[np.in1d(movies['movieId'], rated, invert=True)]
    user_id = np.array([user_id]*len(not_rated))
    pred_ratings = predict(model, user_id, not_rated['movieId'].values, embed_list)
    recomendations = [x for x in zip(pred_ratings, not_rated['Title'].values)]
    recomendations.sort(key=lambda x: x[0], reverse=True)
    return [title for _, title in recomendations][:k]

In [0]:
index = random.choice(users.index)
rated = ratings.loc[ratings['userId'] == index]
movies_rated = movie_lens.loc[np.in1d(movie_lens['movieId'], rated['movieId'].values)]
liked = sorted([x for x in zip(rated['rating'].values, movies_rated['Title'].values) if x[0] > 3], key=lambda x: x[0], reverse=True)
liked = [title for _, title in liked]
print(f'Películas que le gustaron al usuario {index}:')
print(liked)
print('películas recomendadas')
print(recomend(model, index, movie_lens_reduced, embed_list, ratings))

Películas que le gustaron al usuario 103627:
['Stargate', 'Century', 'Honey, I Blew Up the Kid', 'Milk Money', 'Backbeat', 'Homage', 'Peanuts - Die Bank zahlt alles', 'Convent, The (O Convento)', 'Crows and Sparrows (Wuya yu maque)', 'All Over Me', "You Can't Take It with You", 'Devil and Max Devlin, The', 'Permanent Midnight', 'Kindred, The']
películas recomendadas
['Coneheads', 'Heavy Metal', 'Some Girls Do', 'Outland', 'Cypher', 'Teenagers from Outer Space', 'Snowpiercer', 'Freejack', 'Escape from Planet Earth', 'Hostage']
