In [47]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns


In [86]:
# Load the ETL'd movie table and the per-movie crew table.
# Both files are plain comma-separated CSVs, which is read_csv's default.
movies = pd.read_csv("data/movies_etl.csv")
directors = pd.read_csv("data/directors.csv")

In [17]:
# Overview of the movies table: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45333 entries, 0 to 45332
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id_movie           45333 non-null  int64  
 1   title              45333 non-null  object 
 2   tagline            20381 non-null  object 
 3   overview           44392 non-null  object 
 4   original_language  45322 non-null  object 
 5   runtime            45087 non-null  float64
 6   status             45253 non-null  object 
 7   release_date       45333 non-null  object 
 8   release_year       45333 non-null  int64  
 9   budget             45333 non-null  int64  
 10  revenue            45333 non-null  int64  
 11  return             45333 non-null  float64
 12  vote_count         45333 non-null  int64  
 13  vote_average       45333 non-null  int64  
 14  popularity         45333 non-null  float64
 15  id_collection      4483 non-null   float64
dtypes: float64(4), int64(6), object(6)

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
movies.describe()

Unnamed: 0,id_movie,runtime,release_year,budget,revenue,return,vote_count,vote_average,popularity,id_collection
count,45333.0,45087.0,45333.0,45333.0,45333.0,45333.0,45333.0,45333.0,45333.0,4483.0
mean,108058.249642,94.175882,1991.882536,4230989.0,11233040.0,157.8364,110.149207,5.250369,2.925689,184212.998885
std,112196.879634,38.351846,24.053205,17444520.0,64418130.0,13136.33,491.967458,1.858919,6.011321,141638.033338
min,2.0,0.0,1874.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
25%,26398.0,85.0,1978.0,0.0,0.0,0.0,3.0,5.0,0.388631,86027.5
50%,59853.0,95.0,2001.0,0.0,0.0,0.0,10.0,6.0,1.129891,141615.0
75%,156700.0,107.0,2010.0,0.0,0.0,0.0,34.0,6.0,3.687905,294179.5
max,469172.0,1256.0,2020.0,380000000.0,2787965000.0,1916667.0,14075.0,10.0,547.488298,480160.0


Podemos ver que más del 75% de las películas tienen un return = 0, por lo que utilizaremos este dato como filtro: entrenaremos nuestro modelo únicamente con las películas cuyo return es distinto de 0.

In [87]:
# Keep only the movies whose `return` value is different from zero.
has_return = movies['return'].ne(0)
movies = movies.loc[has_return]
movies.shape

(5369, 16)

In [88]:
# Attach the crew rows to each remaining movie. The inner join keeps only
# the movies whose id_movie appears in both tables.
sist_recomendacion_df = movies[['title', 'overview', 'id_movie']].merge(
    directors,
    how='inner',
    on='id_movie',
)

# sist_recomendacion_df

In [89]:
# Collapse the one-row-per-crew-member table into one row per
# (title, overview) pair, joining all crew names into a single
# space-separated string.
sist_recomendacion_df = (
    sist_recomendacion_df
    .groupby(['title', 'overview'], as_index=False)['name']
    .agg(' '.join)
)
sist_recomendacion_df

Unnamed: 0,title,overview,name
0,(500) Days Of Summer,"Tom (Joseph Gordon-Levitt), greeting-card writ...",Mychael Danna Hope Hanafin Steven J. Wolfe Mas...
1,10 Cloverfield Lane,"After a car accident, Michelle awakens to find...",Monika Mikkelsen J.J. Abrams Matthew W. Mungle...
2,10 Things I Hate About You,"Bianca, a tenth grader, has never gone on a da...",Charles Graffeo William Shakespeare Mark Irwin...
3,10 To Midnight,"Based on the real-life Richard Speck murders, ...",Adam Greenberg William Roberts J. Lee Thompson...
4,"10,000 Bc",A prehistoric epic that follows a young mammot...,Sarah Bradshaw Mali Finn Alexander Berner Tom ...
...,...,...,...
5349,Zulu,"In 1879, during the Zulu wars, man of the peop...",Cy Endfield John Barry Stanley Baker Stephen D...
5350,Zyzzyx Road,A married man (Leo Grillo) meets a beautiful w...,Valerie McCaffrey David Klein Ryan Beveridge J...
5351,[Rec],A television reporter and cameraman follow eme...,Julio Fernández David Gallart Carlos Fernández...
5352,[Rec]²,"The action continues from [REC], with the medi...",Jaume Balagueró Paco Plaza Manu Díez


In [90]:
# Sanity check: per-column count of missing values, then the frame's shape.
print(sist_recomendacion_df.isna().sum())
print(sist_recomendacion_df.shape)

title       0
overview    0
name        0
dtype: int64
(5354, 3)


In [91]:
# Normalise the columns to plain strings. Titles are title-cased and stripped
# because the lookup function applies the same normalisation to user input.
sist_recomendacion_df['title'] = sist_recomendacion_df['title'].astype('str').str.title().str.strip()
sist_recomendacion_df['overview'] = sist_recomendacion_df['overview'].astype('str')
sist_recomendacion_df['name'] = sist_recomendacion_df['name'].astype('str')

# Build the document used for vectorisation: overview followed by crew names.
# The explicit ' ' separator matters: without it the last word of the overview
# and the first crew name are glued into a single bogus token whenever the
# overview does not end with punctuation.
sist_recomendacion_df['text'] = (
    sist_recomendacion_df['overview'] + ' ' + sist_recomendacion_df['name']
)

# Only the title and the combined text are needed by the recommender.
df = sist_recomendacion_df[['title', 'text']]

# Persist the corpus so it can be reused outside the notebook.
df.to_csv("data/ml_df.csv",index=False)

---

# Sistema de Recomendación

In [92]:
# NOTE(review): `vectorizer` and `stopwords` are not used by the TF-IDF
# pipeline below; they are kept only so these names remain defined.
vectorizer = CountVectorizer()
stopwords = STOPWORDS

# TF-IDF features for every movie text, dropping scikit-learn's built-in
# English stop words (common, uninformative words).
tf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tf.fit_transform(df['text'])

# Pairwise similarity between all documents. TfidfVectorizer L2-normalises
# each row by default, so this dot product is the cosine similarity
# (0 = nothing in common, 1 = identical direction).
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Number of recommendations kept per movie.
n = 5

# Rows of the similarity matrix are positional, so titles are looked up by
# position too instead of relying on df's index labels being 0..N-1.
titles = df['title'].astype(str).tolist()

# Maps each title to the titles of its n most similar movies.
# If two rows share a title, the later row overwrites the earlier one.
results = {}
for pos, title in enumerate(titles):
    # Rank every movie from most to least similar to the current one.
    ranked = cosine_similarities[pos].argsort()[::-1]
    # Remove the movie itself by position. Dropping "the first hit" is not
    # safe: with tied scores (duplicate texts, or an all-zero vector) the
    # movie is not guaranteed to be ranked first.
    ranked = ranked[ranked != pos][:n]
    results[title] = [titles[i] for i in ranked]


In [93]:
# Spot-check the recommendations for a few well-known titles.
for sample_title in ('Toy Story', 'Toy Story 2', 'Cars'):
    pprint(results[sample_title])

['Toy Story 2', 'Monsters, Inc.', 'Toy Story 3', 'Cars', 'Finding Nemo']
['Monsters, Inc.', 'Toy Story', 'Finding Nemo', 'Up', 'Toy Story 3']
['Cars 2', 'Monsters, Inc.', 'Toy Story 2', 'Cars 3', 'Up']


-----------

In [33]:
# ML
# @app.get('/recomendacion/{titulo}')
def recomendacion(titulo:str):
    """Return the stored recommendations for a movie title.

    The title is title-cased and stripped of surrounding whitespace, then
    used as a key into the module-level ``results`` mapping.

    Args:
        titulo: Movie title to look up.

    Returns:
        A dict with the single key ``'lista recomendada'`` whose value is
        the entry stored in ``results`` for the normalised title.

    Raises:
        KeyError: If the normalised title is not a key of ``results``.
    """
    clave = titulo.title().strip()
    return {'lista recomendada': results[clave]}

In [34]:
# Example lookup for a title that is present in `results`.
recomendacion('Toy Story')

{'lista recomendada': ['Toy Story 2',
  'Monsters, Inc.',
  'Toy Story 3',
  'Cars',
  'Finding Nemo']}