In [278]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns


In [279]:
movies = pd.read_csv("data/movies_3.csv", sep=",")
directors = pd.read_csv("data/directors.csv", sep=",")
ml_genres = pd.read_csv("data/ml_genres.csv", sep=",")

In [280]:
ml_genres

Unnamed: 0,title,id_movie,genres
0,!Women Art Revolution,55245,Documentary
1,#1 Cheerleader Camp,41371,Comedy Drama
2,#Chicagogirl,267752,Documentary
3,#Horror,301325,Drama Mystery Horror Thriller
4,"$1,000 On The Black",143747,Western
...,...,...,...
42944,Юленька,44839,Thriller Drama Horror
42945,هیچ کجا هیچ کس,388182,Mystery Crime Drama
42946,’Round Midnight,14670,Drama
42947,…And The Fifth Horseman Is Fear,109380,Drama War


In [281]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45333 entries, 0 to 45332
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id_movie           45333 non-null  int64  
 1   title              45333 non-null  object 
 2   tagline            20381 non-null  object 
 3   overview           44392 non-null  object 
 4   runtime            45087 non-null  float64
 5   release_date       45333 non-null  object 
 6   release_year       45333 non-null  int64  
 7   status             45253 non-null  object 
 8   original_language  45322 non-null  object 
 9   popularity         45333 non-null  float64
 10  vote_average       45333 non-null  int64  
 11  vote_count         45333 non-null  int64  
 12  budget             45333 non-null  int64  
 13  revenue            45333 non-null  int64  
 14  return             45333 non-null  float64
 15  name_collection    4376 non-null   object 
 16  id_collection      448

In [282]:
movies.describe()

Unnamed: 0,id_movie,runtime,release_year,popularity,vote_average,vote_count,budget,revenue,return,id_collection
count,45333.0,45087.0,45333.0,45333.0,45333.0,45333.0,45333.0,45333.0,45333.0,4483.0
mean,108058.249642,94.175882,1991.882536,2.925689,5.250369,110.149207,4230989.0,11233040.0,157.8364,184212.998885
std,112196.879634,38.351846,24.053205,6.011321,1.858919,491.967458,17444520.0,64418130.0,13136.33,141638.033338
min,2.0,0.0,1874.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
25%,26398.0,85.0,1978.0,0.388631,5.0,3.0,0.0,0.0,0.0,86027.5
50%,59853.0,95.0,2001.0,1.129891,6.0,10.0,0.0,0.0,0.0,141615.0
75%,156700.0,107.0,2010.0,3.687905,6.0,34.0,0.0,0.0,0.0,294179.5
max,469172.0,1256.0,2020.0,547.488298,10.0,14075.0,380000000.0,2787965000.0,1916667.0,480160.0


Podemos ver que mas del 75% de las peliculas tienen un return = 0 por lo que utilizaremos este dato para filtrar las peliculas con las que entrenaremos nuestro modelo.

In [283]:
movies = movies[movies['popularity'] > 3.687905]
movies.shape

(11333, 17)

In [284]:
ml_genres.columns

Index(['title', 'id_movie', 'genres'], dtype='object')

In [285]:
directors.columns

Index(['id_director', 'name', 'id_movie'], dtype='object')

In [286]:
sist_recomendacion_df =(
     pd.merge(
         ml_genres,
         directors,
         on = 'id_movie',
         how='inner')
     )

In [287]:
sist_recomendacion_df = sist_recomendacion_df[sist_recomendacion_df.id_movie.isin(movies.id_movie.tolist())].reset_index(drop =True)

In [288]:
print(
    sist_recomendacion_df
    .duplicated()
    .sum()
)

0


In [289]:
print(
    sist_recomendacion_df
    .isnull()
    .sum()
)
print (sist_recomendacion_df.shape)

title          0
id_movie       0
genres         0
id_director    0
name           0
dtype: int64
(12392, 5)


In [290]:
sist_recomendacion_df['title'] = sist_recomendacion_df['title'].astype('str').str.title().str.strip()
sist_recomendacion_df['genres']= sist_recomendacion_df ['genres'].astype('str')
sist_recomendacion_df['name'] = sist_recomendacion_df['name'].astype('str')

sist_recomendacion_df['text'] = sist_recomendacion_df['name']+  sist_recomendacion_df['genres']

df = sist_recomendacion_df[['title', 'text']]

df.to_csv("data/df.csv",index=False)

In [291]:
df

Unnamed: 0,title,text
0,'71,Yann DemangeThriller Action Drama War
1,(500) Days Of Summer,Marc WebbComedy Drama Romance
2,(Untitled),Jonathan ParkerDrama Comedy Romance
3,+1,Dennis IliadisThriller Science Fiction
4,...And Justice For All,Norman JewisonCrime Drama Mystery Thriller
...,...,...
12387,[Rec]²,Paco PlazaThriller Horror
12388,[Rec]³ Genesis,Paco PlazaHorror
12389,[Rec]⁴ Apocalypse,Jaume BalagueróThriller Horror
12390,¡Three Amigos!,John LandisComedy Western


---

# Sistema de Recomendación

In [292]:
# Instanciamos el CV
vectorizer = CountVectorizer()
stopwords = STOPWORDS
# eliminamos las "stop words", palabras comunes no informativas
tf = TfidfVectorizer(stop_words='english')

# calculamos los features para cada ítem (texto)
tfidf_matrix = tf.fit_transform(df['text'])

# calculamos las similitudes entre todos los documentos
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
n = 5

results = {} 
for idx, row in df.iterrows():
    # guardamos los indices similares basados en la similitud coseno. Los ordenamos en modo ascendente, siendo 0 nada de similitud y 1 total
    similar_indices = cosine_similarities[idx].argsort()[:-n-2:-1] 
    # guardamos los N más cercanos
    similar_items = [(f"{df.loc[i, 'title']}") for i in similar_indices]
    results[f"{row['title']}"] = similar_items[1:]


In [293]:
pprint(results['Toy Story'])
pprint(results['Cars 2'])
pprint(results["Titanic"])

['Toy Story', 'Mater And The Ghostlight', 'Cars', 'Cars 2', 'Tin Toy']
["Lemony Snicket'S A Series Of Unfortunate Events",
 'The Incredibles',
 'Casper',
 'Ratatouille',
 'Mission: Impossible - Ghost Protocol']
['Johnny Belinda',
 'How To Marry A Millionaire',
 'Torn Apart',
 'The Rules Of The Game',
 'Beauty And The Beast']


-----------

In [296]:
# ML
# @app.get('/recomendacion/{titulo}')
def recomendacion(titulo:str):
    '''Ingresas un nombre de pelicula y te recomienda las similares en una lista'''
    if df['title'].str.contains(titulo).any():
        titulo = titulo.title().strip()
        lista = (results[titulo])
        data = {'titulo':titulo , 'lista recomendada': lista}
    else:
        mensaje = "La pelicula {} no se encuentra en la base de datos.".format(titulo)
        data = {'actor':[mensaje] }    
    return data

In [297]:
recomendacion('Toy Story 2')

{'titulo': 'Toy Story 2',
 'lista recomendada': ['Toy Story',
  'Mater And The Ghostlight',
  'Cars',
  'Cars 2',
  'Tin Toy']}