In [6]:
import re
import gensim
import numpy as np
import pandas as pd
from scipy import spatial
from lightfm import LightFM
from sklearn.metrics import pairwise_distances
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dense interaction (rating) matrix: one row per user, one column per film.
# Later cells treat a zero entry as "not rated" (see the `> 0` / astype(bool) checks).
I = np.load("interactions_films.npy")

In [3]:
# Film metadata (id, year, title) from a tab-separated file.
# Assumes row k of `films` describes column k of I — TODO confirm against the data source.
films = pd.read_csv("MovieTitles.csv", delimiter='\t')

In [4]:
# Sanity check: (number of users, number of films).
I.shape

(480189, 17770)

In [5]:
# Sanity check: the row count should equal the number of columns of I.
films.shape

(17770, 3)

In [6]:
# Peek at the first metadata rows.
films.head()

Unnamed: 0,id,year,title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


# Топ-10 самых популярных фильмов (по числу оценок)

In [29]:
# Popularity of each film = number of users with a non-zero entry in its column.
films_sum = I.astype(bool).sum(axis=0)

In [30]:
# Indices of the ten most-rated films, most popular first.
top10_order = np.argsort(films_sum)[::-1][:10]
# Re-wrap the selected metadata rows so the table is numbered 0..9.
pd.DataFrame(films.values[top10_order], columns=['id', 'year', 'title'])

Unnamed: 0,id,year,title
0,5317,2000,Miss Congeniality
1,15124,1996,Independence Day
2,14313,2000,The Patriot
3,15205,2004,The Day After Tomorrow
4,1905,2003,Pirates of the Caribbean: The Curse of the Bla...
5,6287,1990,Pretty Woman
6,11283,1994,Forrest Gump
7,16377,1999,The Green Mile
8,16242,1997,Con Air
9,12470,1996,Twister


# Content-based

In [12]:
# Tokenise every title: lower-case it, replace each run of non-ASCII-letter
# characters with a space, then let gensim's simple_preprocess split it into tokens.
non_letters = re.compile("[^a-zA-Z]+")
list_title = [gensim.utils.simple_preprocess(non_letters.sub(" ", title.lower()))
              for title in films['title']]

In [13]:
# Build the vocabulary from the tokenised titles and train a Word2Vec model on them.
# The vector size of 200 is hard-coded again in the embedding cells below,
# so the two must be changed together.
# min_count=1 is passed so that rare title words are not dropped (presumably
# because titles are short and most words occur only a few times — confirm).
# NOTE(review): no seed is set and several workers are used, so the learned
# vectors (and the neighbours printed below) will presumably vary between runs.
model = gensim.models.Word2Vec(
        list_title,
        size=200,
        window=10,
        min_count=1,
        workers=10,
        iter=100)

In [14]:
# Sanity check of the embedding: the ten words closest to "blood" and their
# similarity scores. The %-formatting prints exactly what the Python 2
# statement `print i, j` printed, and is also valid on Python 3.
for word, score in model.wv.most_similar('blood', topn=10):
    print("%s %s" % (word, score))

octane 0.569503247738
barb 0.560724854469
sisters 0.558799505234
xi 0.556424677372
vampires 0.541829705238
guts 0.527166962624
taste 0.522284269333
babette 0.516442656517
grave 0.516346871853
sand 0.514134764671


In [15]:
def avg_feature_vector(words, model, num_features, index2word_set):
    """Return the mean embedding of the in-vocabulary words of a token list.

    Parameters
    ----------
    words : iterable of str
        Tokens to average.
    model : object
        Anything exposing ``model.wv[word]`` -> vector of length ``num_features``.
    num_features : int
        Dimensionality of the embedding vectors.
    index2word_set : set of str
        Vocabulary; tokens outside it are ignored.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features``. It is all zeros (float32) when no
        token is in the vocabulary.
    """
    known = [token for token in words if token in index2word_set]

    # Accumulate left to right, starting from a float32 zero vector.
    total = np.zeros((num_features, ), dtype='float32')
    for token in known:
        total = total + model.wv[token]

    if not known:
        return total
    return np.divide(total, len(known))

In [16]:
# Vocabulary of the trained model as a set, used by avg_feature_vector for
# membership tests.
index2word_set = set(model.wv.index2word)

In [17]:
# One 200-dimensional embedding per film: the mean vector of its title tokens.
# 200 must match the vector size the Word2Vec model was trained with.
film_embs = np.zeros((len(list_title), 200))
for row, tokens in enumerate(list_title):
    film_embs[row] = avg_feature_vector(tokens, model, 200, index2word_set)

In [18]:
# User (row of I) to inspect in the content-based example; -1 selects the last row.
idx = -1

In [19]:
# Titles (re-joined tokens) of every film user `idx` has a non-zero entry for.
# Titles are paired with the boolean row directly rather than via
# np.array(list_title): the token lists have different lengths, and building
# a NumPy array from ragged lists is fragile (2-D if all lengths happen to
# match, an error on recent NumPy releases).
watched_titles = [' '.join(tokens)
                  for tokens, seen in zip(list_title, I[idx] > 0) if seen]
pd.DataFrame(watched_titles, columns=['title'])

Unnamed: 0,title
0,shakespeare in love


In [20]:
# Content-based recommendation for user `idx`, step by step.

# 1. Pool the tokens of every title the user has a non-zero entry for.
#    A flat comprehension replaces sum(list_of_lists, []), which is quadratic,
#    and avoids building a NumPy array from ragged token lists.
watched_tokens = [token
                  for tokens, seen in zip(list_title, I[idx] > 0) if seen
                  for token in tokens]

# 2. User profile = mean embedding of those tokens (200 = trained vector size).
user_profile = avg_feature_vector(watched_tokens, model, 200, index2word_set)

# 3. Cosine similarity of every film embedding to the profile, best ten first.
profile_similarity = cosine_similarity(film_embs, [user_profile]).ravel()
closest_films = np.argsort(profile_similarity)[::-1][:10]

# 4. Show the titles. The first row is dropped, as in the original cell:
#    presumably it is the watched film itself, which only holds when the user
#    has a single watched film — confirm before reusing for other users.
pd.DataFrame([' '.join(list_title[k]) for k in closest_films],
             columns=['film name'])[1:]

Unnamed: 0,film name
1,juliet in love
2,swann in love
3,love in sampan
4,love in thoughts
5,falling in love
6,love in cold climate
7,thomas in love
8,young doctors in love
9,women in love


# Коллаборативная фильтрация

In [21]:
# Sparse (CSR) copy of the interaction matrix, passed to the similarity
# computations below.
csr_I = csr_matrix(I)

In [22]:
# Cosine similarity between user 0 and every user, reshaped to a column vector.
# NOTE(review): `m` is not used by any later cell visible here; looks like a
# scratch experiment — confirm and remove if so.
m = cosine_similarity(I[0].reshape(1, -1), csr_I).reshape(-1, 1)

In [23]:
def recs(idx, rates, csr_rates, top_n=5):
    """Recommend films to one user with user-based collaborative filtering.

    Every user's ratings are weighted by that user's cosine similarity to the
    target user and summed per film. Films the target user has already rated
    get a score of zero so that they are not ranked ahead of unseen films.

    Parameters
    ----------
    idx : int
        Row index of the target user.
    rates : array of shape (n_users, n_films)
        Rating matrix; a zero entry is treated as "not rated". It is only
        read: the original ``rates *= metrics`` overwrote the caller's matrix
        and, for integer matrices, truncated the weighted ratings.
    csr_rates : sparse matrix or array of shape (n_users, n_films)
        The same ratings. Used for the similarity computation and the
        weighted sum, so no dense (n_users, n_films) temporary is created.
    top_n : int, optional
        Number of films to recommend. Defaults to 5.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(watched, recommended)``: ``watched`` has columns ``film`` and
        ``rate`` (the target user's own ratings); ``recommended`` has a single
        column ``film`` with the ``top_n`` best-scoring titles, best first.

    Notes
    -----
    Titles are read from the notebook-level ``films`` frame, whose row order
    is assumed to match the columns of ``rates``.
    """
    # Take the user's row from the argument, not from the global matrix `I`.
    user_row = np.asarray(rates[idx]).ravel()

    # Cosine similarity between the target user and every user, as a column.
    similarity = cosine_similarity(user_row.reshape(1, -1), csr_rates).reshape(-1, 1)

    # Per-film sum over users of (similarity * rating), as one sparse product.
    weighted_sum = np.asarray(csr_rates.T.dot(similarity)).ravel()

    # Zero the score of films the user has already rated.
    watched_mask = user_row.astype(bool)
    total_rate = np.where(watched_mask, 0, weighted_sum)

    titles = films['title'].values
    watched = pd.DataFrame({'film': titles[watched_mask],
                            'rate': user_row[watched_mask]},
                           columns=['film', 'rate'])
    best = np.argsort(total_rate)[::-1][:top_n]
    recommended = pd.DataFrame(titles[best], columns=['film'])
    return watched, recommended

In [24]:
# Collaborative-filtering example for user 10: the films they rated and the
# recommended films.
idx = 10
watched, recomends = recs(idx, I, csr_I)

In [25]:
# The parenthesised form prints the same text on Python 2 and also runs on
# Python 3, where the bare print statement is a syntax error.
print(u"Пользователь {} видел следующие фильмы:".format(idx))
watched

Пользователь 10 видел следующие фильмы:


Unnamed: 0,film,rate
0,Dinosaur Planet,4
1,Bridget Jones's Diary,3
2,Princess Mononoke,5
3,Blue Planet: IMAX,3
4,When Dinosaurs Roamed America,4
5,Aimee and Jaguar,3
6,Walking with Dinosaurs,5
7,Scooby-Doo Meets Batman,3
8,Eternal Sunshine of the Spotless Mind,4
9,Being John Malkovich,3


In [34]:
# The parenthesised form prints the same text on Python 2 and also runs on
# Python 3, where the bare print statement is a syntax error.
# NOTE(review): `idx` must still be the user index that was passed to recs().
# The matrix-factorisation section reassigns `idx` to a film index (10312);
# if that cell ran first, this header names the wrong user.
print(u"Рекомендуем посмотреть пользователю {} фильмы".format(idx))
recomends

Рекомендуем посмотреть пользователю 10312 фильмы


Unnamed: 0,film
0,Pirates of the Caribbean: The Curse of the Bla...
1,Forrest Gump
2,Lord of the Rings: The Two Towers
3,Lord of the Rings: The Fellowship of the Ring
4,The Green Mile


# Матричная факторизация

In [7]:
# Instantiate the LightFM matrix-factorisation model (WARP loss, 100 latent
# components); training happens in the next cell.
# NOTE: this rebinds `model`, which held the Word2Vec model in the
# content-based section; re-running those cells afterwards will fail.
# NOTE(review): learning_rate presumably only applies to the adagrad schedule;
# confirm it has any effect together with learning_schedule="adadelta".
model = LightFM(loss='warp', no_components=100, learning_rate=0.03, learning_schedule="adadelta")

In [8]:
# Train for 5 epochs on the interaction matrix converted to COO format.
# NOTE(review): num_threads=40 is specific to the machine this was run on.
model.fit(coo_matrix(I), epochs=5, num_threads=40, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


<lightfm.lightfm.LightFM at 0x7fdbc106d250>

In [9]:
# Extract the learned biases and latent embeddings for users and for items.
# Only item_feature_embeddings is used in the cells visible below.
user_feature_bias, user_feature_embeddings = model.get_user_representations()
item_feature_bias, item_feature_embeddings = model.get_item_representations()

In [27]:
# Sanity check: one row per film, one column per latent component (100).
item_feature_embeddings.shape

(17770, 100)

In [33]:
# NOTE: here `idx` is a *film* index (a row of item_feature_embeddings), while
# the earlier sections use `idx` for a user index. Re-running an earlier cell
# after this one will pick up 10312 as a user.
idx = 10312 #, 11520, 12545, 11444, 408, 14617, 8742, 797, 15852, 14927

# Cosine similarity (1 - cosine distance) between the chosen film's latent
# vector and every film's latent vector; the result has shape (1, n_films).
metrics = (1 - pairwise_distances(item_feature_embeddings[idx].reshape(1, -1),
                                  item_feature_embeddings,
                                  metric='cosine'))

# Titles of the ten most similar films, best first. The single-argument
# print(...) form prints the same text on Python 2 and also runs on Python 3.
for i in np.argsort(metrics)[0][::-1][:10]:
    print(films['title'][i])

Lord of the Rings: The Fellowship of the Ring: Bonus Material
Lord of the Rings: The Two Towers: Bonus Material
Lord of the Rings: The Return of the King: Bonus Material
Harry Potter and the Chamber of Secrets: Bonus Material
X-Men: Bonus Material
The Matrix: Reloaded: Bonus Material
Pirates of the Caribbean: The Curse of the Black Pearl: Bonus Material
Spider-Man: Bonus Material
Harry Potter and the Sorcerer's Stone: Bonus Material
Star Wars: Episode II: Attack of the Clones: Bonus Material
