In [289]:
from csv import reader
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import operator 

# Selecionando as colunas que categorizam, dão valor ou representam o filme

In [371]:
# Keep only the columns that identify, describe, or score a movie.
movie_columns = ["movieId", "rating", "genres", "title"]

# NOTE(review): drop_duplicates() compares these four columns only, so rows that
# share the same movieId/rating/genres/title collapse into one. If the parquet
# holds one row per user rating, the per-movie count and mean computed later are
# taken over distinct rating values rather than over every vote -- confirm.
movies_df = (
    pd.read_parquet('../results/movie.parquet')
    .loc[:, movie_columns]
    .drop_duplicates()
)

# Existem no total 59047 movieId distintos

In [372]:
# Number of distinct movie identifiers in the frame.
movies_df.movieId.nunique()

59047

# Agrupo por movieId, gênero e título para reduzir a quantidade de linhas de 25M para cerca de 59 mil. Para cada filme, calculo a média das avaliações feitas pelos usuários e normalizo a quantidade de votos que o filme recebeu.

In [373]:
# Collapse to one row per (movieId, genres, title): size_rating is the number of
# rating rows in the group and mean_rating is their average. The string
# aggregation names replace the NumPy callables previously passed to agg().
movies_df = (
    movies_df
    .groupby(['movieId', "genres", 'title'])
    .agg(size_rating=("rating", "size"), mean_rating=("rating", "mean"))
    .reset_index()
)

# Min-max scale the vote count. The bounds are computed once and applied to the
# whole column, instead of re-scanning the column for min/max on every row
# inside an apply().
size_min = movies_df["size_rating"].min()
size_max = movies_df["size_rating"].max()
movies_df["size_rating"] = (movies_df["size_rating"] - size_min) / (size_max - size_min)

# Crio um dicionário com os títulos e os gêneros, para poder recuperá-los no final

In [374]:
# Lookup tables keyed by movieId, so a movie's title and genres string can still
# be recovered once those columns are removed from the feature frame.
catalog = {
    "title": dict(zip(movies_df["movieId"], movies_df["title"])),
    "genre": dict(zip(movies_df["movieId"], movies_df["genres"])),
}

# Faço um explode na coluna genres e deixo tudo em uma única linha. A categoria "(no genres listed)" foi removida, pois esses filmes já aparecem com 0 em todas as outras categorias

In [375]:
# Split the pipe-separated genres string and emit one row per (movie, genre).
movies_df = movies_df.assign(genres=movies_df['genres'].str.split('|')).explode('genres')

# Genre columns to keep as features. The "(no genres listed)" placeholder is
# left out: such movies are represented by 0 in every remaining genre column.
# Filtering (rather than list.remove) also works when the placeholder is absent.
genre_list = [
    genre for genre in movies_df['genres'].unique()
    if genre != "(no genres listed)"
]

# One-hot encode the genre. `columns=` replaces the positional axis argument of
# DataFrame.drop, which pandas 2.0 no longer accepts; .mul(1) turns boolean
# indicators into integers.
movies_df = pd.concat(
    [movies_df.drop(columns='genres'), pd.get_dummies(movies_df['genres']).mul(1)],
    axis=1,
)
# errors='ignore' keeps the cell runnable on data without the placeholder genre.
movies_df = movies_df.drop(columns=['(no genres listed)'], errors='ignore')

# Considero apenas as colunas numéricas para poder medir a distância euclidiana

In [376]:
# Fold the exploded rows back together: rows sharing the same movieId, title,
# size_rating and mean_rating are merged by summing their genre indicators.
movies_df = (
    movies_df
    .groupby(['movieId', 'title', 'size_rating', 'mean_rating'])[genre_list]
    .sum()
    .reset_index()
)


In [377]:
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components.

    Args:
        instance1: Indexable sequence of numbers.
        instance2: Indexable sequence of numbers.
        length: Number of leading positions to compare; elements at index
            ``length`` and beyond are ignored.

    Returns:
        The square root of the sum of squared component differences.

    Raises:
        IndexError: If either sequence has fewer than ``length`` elements.
    """
    squared_sum = sum((instance1[i] - instance2[i]) ** 2 for i in range(length))
    # The notebook imports `sqrt` directly (`from math import sqrt`); the `math`
    # module itself is never imported, so `math.sqrt` raised NameError.
    return sqrt(squared_sum)


def getKNeighbors(trainingSet, testInstance, k):
    """Return up to ``k`` rows of ``trainingSet`` closest to ``testInstance``.

    Distances are Euclidean and are measured over the first
    ``len(testInstance) - 1`` positions only; the last element of
    ``testInstance`` is left out of the comparison.

    Args:
        trainingSet: Sequence of rows, each an indexable sequence of numbers
            with at least ``len(testInstance) - 1`` elements.
        testInstance: Query row.
        k: Maximum number of neighbours to return.

    Returns:
        A list of rows from ``trainingSet`` ordered from nearest to farthest.
        Rows at equal distance keep their ``trainingSet`` order. When ``k``
        exceeds the number of rows, every row is returned; when ``k`` is zero
        or negative, the list is empty.
    """
    length = len(testInstance) - 1
    # sorted() is stable and evaluates the key once per row.
    ranked = sorted(
        trainingSet,
        key=lambda row: euclideanDistance(testInstance, row, length),
    )
    # Slicing (instead of indexing range(k)) avoids IndexError when k is larger
    # than the training set; max() keeps a negative k from slicing from the end.
    return ranked[:max(k, 0)]

In [521]:
# Reorder the frame: genre indicators first, then the two rating features, with
# movieId as the last column of every row.
movies_df = movies_df.loc[:, [*genre_list, 'size_rating', 'mean_rating', 'movieId']]

In [523]:
# Query vector for movieId 231: the first matching row as a plain list, with the
# trailing movieId element sliced off.
is_query_movie = movies_df["movieId"] == 231
testInstance = movies_df[is_query_movie].values.tolist()[0][:-1]

In [524]:
%%time
# NOTE(review): testInstance already has movieId removed; the slice below drops
# one more trailing element (mean_rating, given the column order set earlier) and
# getKNeighbors ignores the last element of what it receives (size_rating), so
# the distance appears to be computed over the genre columns only -- confirm
# that this is intended.
k = 10
trainSet = movies_df.values.tolist()
neighbors = getKNeighbors(
    trainSet,
    testInstance[:-1],
    k,
)

CPU times: user 321 ms, sys: 18.7 ms, total: 340 ms
Wall time: 341 ms


In [526]:
# Title of the query movie. The movieId -> title mapping built by this notebook
# is catalog["title"]; `movie_title_catalog` is not defined in any cell.
catalog["title"][231]

'Dumb & Dumber (Dumb and Dumber) (1994)'

In [527]:
# Each neighbour row ends with its movieId, which is resolved to a title through
# catalog["title"] (`movie_title_catalog` is not defined in any cell).
for movie in neighbors:
    print(catalog["title"][movie[-1]])

Dumb & Dumber (Dumb and Dumber) (1994)
Around the World in 80 Days (1956)
Adventures in Babysitting (1987)
Gods Must Be Crazy, The (1980)
Nothing But Trouble (1991)
Crocodile Dundee (1986)
Crimson Pirate, The (1952)
Pee-wee's Big Adventure (1985)
Me, Myself & Irene (2000)
Breaking In (1989)


In [528]:
# Persist the feature frame.
movies_df.to_parquet(path='../models/movie.parquet')

In [529]:
# Persist the title/genre lookup. `category` is never assigned in this notebook;
# `catalog` is the lookup dict it builds, so that is the object written here.
# NOTE(review): the output file name still says "category" -- confirm that any
# consumer of this pickle expects the catalog dict.
with open('../models/category.pickle', 'wb') as handle:
    pickle.dump(catalog, handle, protocol=pickle.HIGHEST_PROTOCOL)