In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
# Load the four CSV tables from the lecture-1 directory. The generator is
# consumed left to right during unpacking, so the files are read in the
# order they are listed and bound to the matching names.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../lecture-1/links.csv',
        '../lecture-1/movies.csv',
        '../lecture-1/ratings.csv',
        '../lecture-1/tags.csv',
    )
)

In [3]:
# Preview the first 10 rows of the movies table.
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# (rows, columns) of the movies table.
movies.shape

(9742, 3)

In [5]:
# Preview the first rows of the tags table (head() shows 5 rows by default).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
# (rows, columns) of the tags table.
tags.shape

(3683, 4)

In [7]:
# Attach movie metadata (title, genres) to every tag record.
# merge() defaults to an inner join, so movies without any tag are dropped.
df = movies.merge(tags, on='movieId')

In [8]:
# Preview the merged frame: each row is a tag record joined with its movie's title and genres.
df.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [9]:
# Combine each row's genres and tag into one text field, separated by a space.
# Column-wise string concatenation replaces the row-by-row apply(axis=1):
# same strings for string data, without a Python-level call per row.
# Note: a missing genres/tag value now yields NaN in "bar" instead of raising.
df["bar"] = df['genres'] + ' ' + df['tag']

In [10]:
def change_string(s: str) -> str:
    """Turn pipe and hyphen separators into spaces.

    Every ``'|'`` and every ``'-'`` in ``s`` is replaced by a single space,
    so ``'Crime|Film-Noir'`` becomes ``'Crime Film Noir'``. All other
    characters, including existing spaces, are returned unchanged.

    Args:
        s: Text to normalize (e.g. a pipe-delimited genre list or a tag).

    Returns:
        The text with ``'|'`` and ``'-'`` replaced by spaces.
    """
    # replace('|', ' ') is equivalent to the former split('|') + ' '.join(...).
    # The earlier replace(' ', ' ') swapped a space for a space (a no-op) and
    # has been dropped.
    return s.replace('-', ' ').replace('|', ' ')

In [11]:
# Normalize every tag string; the resulting list is the corpus the vectorizer is fit on.
# NOTE(review): despite its name this holds tags (df.tag), not genres, and the
# combined "bar" column is not used here — confirm that is intended.
movie_genres = df['tag'].map(change_string).tolist()

In [12]:
# Spot-check the first 50 normalized strings.
movie_genres[:50]

['pixar',
 'pixar',
 'fun',
 'fantasy',
 'magic board game',
 'Robin Williams',
 'game',
 'moldy',
 'old',
 'pregnancy',
 'remake',
 'remake',
 'politics',
 'president',
 'politics',
 'president',
 'Mafia',
 'Jane Austen',
 'Hollywood',
 'serial killer',
 'alcoholism',
 'Shakespeare',
 'In Netflix queue',
 'Jane Austen',
 'kidnapping',
 'high school',
 'teacher',
 'time travel',
 'time travel',
 'Brad Pitt',
 'Bruce Willis',
 'mindfuck',
 'Post apocalyptic',
 'post apocalyptic',
 'remake',
 'time travel',
 'twist ending',
 'Animal movie',
 'pigs',
 'villain nonexistent or not needed for good story',
 'death penalty',
 'Nun',
 'twins',
 'chick flick',
 'funny',
 'Paul Rudd',
 'quotable',
 'seen more than once',
 'Emma',
 'Jane Austen']

In [13]:
# Title of the movie to look up; it is compared for exact equality against df['title'].
tit = 'Godfather, The (1972)'

In [25]:
# Build the query text from every genres+tag string recorded for the chosen title.
# The values are joined directly: str(Series) would return the Series' display
# repr, leaking index labels, the "Name: bar, dtype: object" footer and any
# display truncation into the query text.
vector = ' '.join(df.loc[df['title'] == tit, 'bar'])

In [18]:
# Bag-of-words step: learn the vocabulary from the corpus and build the
# document-term count matrix (one row per entry of movie_genres).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [19]:
# Show the sparse count matrix summary (shape, dtype, number of stored elements).
X_train_counts

<3683x1744 sparse matrix of type '<class 'numpy.int64'>'
	with 5598 stored elements in Compressed Sparse Row format>

In [20]:
# Re-weight the raw counts with TF-IDF. The fitted transformer is kept so the
# query vector can later be weighted with the same statistics.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [21]:
# Show the sparse TF-IDF matrix summary (same shape as the count matrix, float values).
X_train_tfidf

<3683x1744 sparse matrix of type '<class 'numpy.float64'>'
	with 5598 stored elements in Compressed Sparse Row format>

In [38]:
# Nearest-neighbour index over the TF-IDF rows using cosine distance.
# Each query returns 7 neighbours; n_jobs=-1 asks for all available cores.
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='cosine') 
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [39]:
# Normalize the query text with the same helper used for the training strings.
test = change_string(vector)

# Project the query through the already-fitted vectorizer and TF-IDF
# transformer (transform only, no refitting) so it shares the training vocabulary.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# With return_distance=True, kneighbors returns a (distances, indices) pair,
# each with one row per query.
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [40]:
# Show the raw (distances, indices) result for the single query.
res

(array([[0.49073663, 0.49073663, 0.49073663, 0.53931935, 0.53931935,
         0.53931935, 0.53931935]]),
 array([[274, 191, 528, 846, 454, 893,  16]]))

In [41]:
# res[1] holds the neighbour indices and [0] selects the only query row.
# The model was fit on one document per df row, so these are positional row
# numbers into df, hence iloc.
df.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,bar
274,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,599,drama,1498456561,Comedy|Crime|Drama|Thriller drama
191,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,599,drama,1498456149,Action|Crime|Drama|Thriller drama
528,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,477,drama,1241396391,Crime|Horror|Thriller drama
846,1213,Goodfellas (1990),Crime|Drama,474,Mafia,1138137920,Crime|Drama Mafia
454,431,Carlito's Way (1993),Crime|Drama,18,mafia,1462138755,Crime|Drama mafia
893,1245,Miller's Crossing (1990),Crime|Drama|Film-Noir|Thriller,474,Mafia,1137375832,Crime|Drama|Film-Noir|Thriller Mafia
16,16,Casino (1995),Crime|Drama,474,Mafia,1137181640,Crime|Drama Mafia
