### Citation

#### Dataset retrieved from: F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. https://doi.org/10.1145/2827872




In [1]:
# Import libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM, cross_validation
from lightfm.data import Dataset



In [2]:
# Read the three MovieLens tables used below from the working directory.
# The tags table is intentionally left out for now.
ratings_df, links_df, movies_df = map(
    pd.read_csv,
    ('ratings.csv', 'links.csv', 'movies.csv'),
)
# tags_df = pd.read_csv('tags.csv')

In [3]:
# Preview the first five rating rows (one row per user/movie rating).
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Preview the first five rows of the movieId -> imdbId/tmdbId lookup table.
links_df.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Preview the first five movies; genres are stored as one '|'-separated string.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title and genres to every rating. The left join keeps each rating row
# even when its movieId has no match in movies_df.
# validate='many_to_one' makes pandas raise a MergeError if a movieId appears
# more than once in movies_df, which would otherwise silently duplicate
# rating rows in the merged frame.
df = pd.merge(ratings_df, movies_df, how='left', on=['movieId'],
              validate='many_to_one')

In [7]:
# Preview the merged ratings + movie metadata frame.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [8]:
# Keep only the columns needed for modelling; timestamp and title are dropped.
df = df.loc[:, ['userId', 'movieId', 'rating', 'genres']]

In [9]:
# Preview the frame after trimming it to userId, movieId, rating and genres.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,genres
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Comedy|Romance
2,1,6,4.0,Action|Crime|Thriller
3,1,47,5.0,Mystery|Thriller
4,1,50,5.0,Crime|Mystery|Thriller


In [10]:
# (rows, columns) of the merged, column-trimmed frame.
df.shape

(100836, 4)

In [11]:
# (rows, columns) of the raw ratings table; a row count equal to df's means
# the left merge neither dropped nor duplicated any rating.
ratings_df.shape

(100836, 4)

In [13]:
# Keep only ratings strictly above 4.0, so a rating of exactly 4.0 is
# excluded. The remaining rows serve as the interactions for the model.
is_high_rating = df['rating'].gt(4.0)
df = df.loc[is_high_rating]

In [14]:
# LightFM Dataset helper; it is fitted on the user and movie ids below and
# then used to build the interaction matrices.
dataset = Dataset()

In [15]:
# Register every user id and movie id present in the filtered ratings so the
# dataset can map them to internal indices.
user_ids, movie_ids = df['userId'], df['movieId']
dataset.fit(users=user_ids, items=movie_ids)

In [16]:
# Report the dimensions of the interaction matrix. `num_topics` is the item
# count, i.e. the number of distinct movieIds registered with dataset.fit.
interaction_shape = dataset.interactions_shape()
num_users, num_topics = interaction_shape
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 598, num_topics: 4056.


In [17]:
# NOTE: disabled draft, kept for reference only; nothing in this cell runs.
# It would fit the dataset on ALL ratings/movies (not the rating > 4.0 subset)
# and register genres as item features. As written, each movie's whole
# '|'-separated genres string would be treated as a single feature, because
# it is wrapped in a one-element list rather than split on '|'.
# dataset.fit(
#     ratings_df['userId'],
#     movies_df['movieId'],
#     item_features=movies_df['genres']
# )

# interactions, _ = dataset.build_interactions((row['userId'], row['movieId']) for _, row in ratings_df.iterrows())

# item_features = dataset.build_item_features((row['movieId'], [row['genres']]) for _, row in movies_df.iterrows())

In [18]:
# Batch size constant; not referenced by any cell visible in this part of the
# notebook (TODO confirm whether a later cell still needs it).
chunksize = 10_000

In [19]:
# Build the interaction and weight matrices from (userId, movieId, rating)
# triples.
# Columns are selected by name rather than by position, so this no longer
# depends on the column order left by earlier cells. itertuples() keeps each
# column's own dtype, so the ids stay integers exactly as they were passed to
# dataset.fit. The previous `.values` call cast the mixed int/float columns to
# one float array, so ids were looked up as floats (1.0 instead of 1).
interaction_triples = df[['userId', 'movieId', 'rating']].itertuples(
    index=False, name=None)
(interactions, weights) = dataset.build_interactions(interaction_triples)

In [20]:
# Randomly hold out 40% of the interactions for testing. The RandomState is
# seeded so the same split is produced on every run.
split_rng = np.random.RandomState(42)
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions,
    test_percentage=0.4,
    random_state=split_rng,
)

In [21]:
# LightFM model trained with the WARP ranking loss.
# random_state seeds the model's random number generator so that training is
# repeatable across kernel restarts. The train/test split is already seeded
# with 42, and an unseeded model would still give different results each run.
model = LightFM(learning_rate=0.05, loss='warp', random_state=42)

In [None]:
# Train on the training split only. fit_partial continues from the model's
# current state instead of re-initialising it, so re-running this cell trains
# further. No epoch count is passed, so the library default applies.
model.fit_partial(train_interactions)