In [14]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from pprint import pprint

In [15]:
# Load the movies table and the directors table from the local data folder.
# `sep` is omitted because pandas already defaults to a comma delimiter.
movies = pd.read_csv("data/movies_etl.csv")
directors = pd.read_csv("data/directors.csv")

In [17]:
# Join each movie (title, overview) with its rows in `directors` through the
# shared `id_movie` key. The inner join keeps only ids present in both tables.
sist_recomendacion_df = movies[['title', 'overview', 'id_movie']].merge(
    directors,
    how='inner',
    on='id_movie',
)

In [18]:
# Collapse to one row per (title, overview) pair, joining every `name` of that
# pair into a single space-separated string.
# NOTE: pandas' groupby drops rows whose grouping key is NaN by default, so
# movies without a title or overview do not survive this step.
sist_recomendacion_df = (
    sist_recomendacion_df
    .groupby(['title', 'overview'], as_index=False)['name']
    .agg(' '.join)
)
sist_recomendacion_df

Unnamed: 0,title,overview,name
0,!Women Art Revolution,"Through intimate interviews, provocative art, ...",Lynn Hershman Leeson Ariel Dougherty
1,#1 Cheerleader Camp,A pair of horny college guys get summer jobs a...,Mark Quod David Michael Latt Chris Ridenhour D...
2,#Chicagogirl,From her childhood bedroom in the Chicago subu...,Joe Piscatella Mark Rinehart Jeff Castelluccio
3,#Horror,"Inspired by actual events, a group of 12 year ...",Tara Subkoff Jason Ludman Oren Segal Brendan W...
4,"$1,000 On The Black",Johnny Liston has just been released from pris...,Mario Siciliano Rolf Olsen Ernesto Gastaldi An...
...,...,...,...
43713,Юленька,"A university professor, who wants to slow down...",Aleksandr Strizhenov Andrey Kureychik Grigori ...
43714,‘Rameau’S Nephew’ By Diderot (Thanx To Dennis ...,Described (rather cheekily) by director Michae...,Michael Snow
43715,’Round Midnight,Inside the Blue Note nightclub one night in 19...,Amanda Mackey Alexandre Trauner Herbie Hancock...
43716,…And The Fifth Horseman Is Fear,A Jewish doctor helps a political fugitive dur...,Miroslav Pesan Zbynek Brynych Milan Nejedlý Ha...


In [19]:
# Sanity check: number of missing values per column, then the (rows, columns) shape.
print(sist_recomendacion_df.isna().sum())
print(sist_recomendacion_df.shape)

title       0
overview    0
name        0
dtype: int64
(43718, 3)


In [26]:
# Force string dtypes. Titles are also title-cased and stripped; the lookup
# function applies the same normalisation to user input before searching.
sist_recomendacion_df['title'] = sist_recomendacion_df['title'].astype('str').str.title().str.strip()
sist_recomendacion_df['overview'] = sist_recomendacion_df['overview'].astype('str')
sist_recomendacion_df['name'] = sist_recomendacion_df['name'].astype('str')

# Build the document that gets vectorised: overview followed by the names.
# The explicit ' ' separator matters: without it, an overview that does not end
# in punctuation has its last word glued to the first name (e.g. "prisonMario"),
# which produces a bogus token and loses both real ones.
sist_recomendacion_df['text'] = (
    sist_recomendacion_df['overview'] + ' ' + sist_recomendacion_df['name']
)

# Only the title and the combined text are needed downstream.
df = sist_recomendacion_df[['title', 'text']]

# Persist the modelling frame (without the index column).
df.to_csv("data/ml_df.csv", index=False)

---

# Sistema de Recomendación

In [21]:
# Instantiate the CountVectorizer and grab the wordcloud stop-word set.
# Neither is used by the cells visible here; they are kept in case another
# cell relies on these names.
vectorizer = CountVectorizer()
stopwords = STOPWORDS
# TF-IDF vectoriser that drops common, non-informative English "stop words".
tf = TfidfVectorizer(stop_words='english')

# Compute the TF-IDF features for every item (one row per text).
tfidf_matrix = tf.fit_transform(df['text'])

# Pairwise similarity between all documents. TfidfVectorizer L2-normalises each
# row by default, so this dot product is the cosine similarity (0 = nothing in
# common, 1 = identical direction).
# NOTE(review): linear_kernel returns a dense n_docs x n_docs array by default;
# for the ~43.7k rows shown earlier that is on the order of 15 GB as float64.
# Consider computing similarities row-by-row/in chunks if memory is a problem.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
n = 5

# Titles in positional order, so they line up with the rows of the similarity
# matrix regardless of what the DataFrame index looks like.
titles = df['title'].astype(str).tolist()

# NOTE: `results` is keyed by title, so rows that share a title overwrite each
# other and only the last one is kept.
results = {}
for idx, title in enumerate(titles):
    # Positions of the n + 1 highest similarities, best first (argsort is
    # ascending; the reversed slice walks it from the end).
    similar_indices = cosine_similarities[idx].argsort()[:-n-2:-1]
    # Drop the item itself by position instead of assuming it is ranked first:
    # with duplicated texts (ties at 1.0) or an all-zero row it may not be.
    similar_items = [titles[i] for i in similar_indices if i != idx]
    # Keep the N closest items.
    results[title] = similar_items[:n]


In [24]:
# Spot-check the stored recommendations for a few titles.
for demo_title in ('Toy Story', 'Toy Story 2', 'Cars'):
    pprint(results[demo_title])

['Toy Story 2', 'Monsters, Inc.', 'Toy Story 3', 'Cars', 'Finding Nemo']
['Monsters, Inc.', 'Toy Story', 'Finding Nemo', 'Up', 'Toy Story 3']
['Cars 2', 'Monsters, Inc.', 'Toy Story 2', 'Cars 3', "Geri'S Game"]


-----------

In [33]:
# ML
# @app.get('/recomendacion/{titulo}')
def recomendacion(titulo:str):
    '''Return the list of movies recommended for a given title.

    The title is title-cased and stripped, then looked up in the module-level
    `results` mapping (normalised title -> list of similar titles).

    Args:
        titulo: Movie title; letter case and surrounding whitespace are ignored.

    Returns:
        A dict with the single key 'lista recomendada' holding a new list with
        the recommended titles.

    Raises:
        KeyError: If the normalised title has no entry in `results`.
    '''

    titulo = titulo.title().strip()
    try:
        lista = results[titulo]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # tells the caller what went wrong.
        raise KeyError(f"No recommendations available for title {titulo!r}") from None

    # Return a copy so callers cannot mutate the list cached in `results`.
    return {'lista recomendada': list(lista)}

In [34]:
# Try the lookup function with one of the titles inspected earlier.
recomendacion(titulo='Toy Story')

{'lista recomendada': ['Toy Story 2',
  'Monsters, Inc.',
  'Toy Story 3',
  'Cars',
  'Finding Nemo']}