<b>A recommendation system using a Neo4j database</b>

This project uses the MovieLens 25M dataset (https://grouplens.org/datasets/movielens/25m/).

Importing Packages

In [2]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

Getting datasets

In [3]:
# Directory holding the extracted MovieLens 25M CSV files (relative to the notebook).
DATA_DIR = "ml-25m"

# genome-scores.csv: (movieId, tagId, relevance)
genome_scores_data = pd.read_csv(f"{DATA_DIR}/genome-scores.csv")
# movies.csv: (movieId, title, genres) with genres pipe-separated
movies_data = pd.read_csv(f"{DATA_DIR}/movies.csv")
# ratings.csv: (userId, movieId, rating, timestamp)
ratings_data = pd.read_csv(f"{DATA_DIR}/ratings.csv")

In [4]:
genome_scores_data.head(5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [5]:
movies_data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings_data.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [8]:
# One row per distinct userId found in the ratings table.
users_df = pd.DataFrame({'userId': ratings_data['userId'].unique()})

In [9]:
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [10]:
# Movie table without the genre string: movieId and title only.
movies_df = movies_data.drop(columns=['genres'])

In [11]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [12]:
# Fixed list of genre labels. The `genres` column of movies_data holds
# pipe-separated combinations of labels like these (presumably this is the full
# MovieLens genre vocabulary -- verify against the dataset README).
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"]

In [13]:
# Single-column frame with one row per genre label.
genres_df = pd.DataFrame({'genres': genres})

In [14]:
genres_df.head()

Unnamed: 0,genres
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


In [15]:
# User-to-movie ratings without the timestamp: userId, movieId, rating.
users_movies_df = ratings_data.drop(columns=['timestamp'])

In [16]:
users_movies_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [17]:
# Movie-to-genre source table: movieId plus the pipe-separated genre string.
movies_genres_df = movies_data.drop(columns=['title'])

In [18]:
# Split a movie's pipe-separated genre string into one row per genre
def get_movie_genres(movieId):
    """Return a DataFrame with one (movieId, genres) row per genre of the movie.

    Looks up `movieId` in the module-level `movies_genres_df`, splits each
    matching `genres` string on '|', and returns the pieces in their original
    order. The result has columns ['movieId', 'genres'] and is empty when no
    row matches.
    """
    genre_strings = movies_genres_df.loc[movies_genres_df['movieId'] == movieId, 'genres']
    split_genres = []
    for genre_string in genre_strings.tolist():
        split_genres.extend(genre_string.split('|'))
    result = pd.DataFrame(split_genres, columns=['genres'])
    result.insert(loc=0, column='movieId', value=movieId)
    return result

In [19]:
movies_genres=pd.DataFrame(columns=['movieId','genres'])

In [20]:
# Progress checkpoints for the movie loop: a timestamp is printed whenever the
# iteration counter b1 reaches one of the values listed in a1.
a1 = [10,100,1000,3000,5000,10000,15000,20000,25000]
b1 = 0

In [21]:
# Build the (movieId, genre) table with one row per genre of every movie.
# The per-movie frames are collected in a list and concatenated once at the end:
# DataFrame.append re-copied the whole accumulated frame on every iteration
# (quadratic) and was removed in pandas 2.0.
genre_frames = []
for b1, x in enumerate(movies_genres_df['movieId'].tolist(), start=1):
    # a1 lists the iteration counts at which a progress timestamp is printed;
    # counting with enumerate restarts at 1 each time the cell is re-run.
    if b1 in a1: print(b1, str(datetime.datetime.now()))
    genre_frames.append(get_movie_genres(x))
# Rebuild from scratch rather than extending the previous value, so re-running
# the cell does not duplicate rows. With no movies the existing frame is kept.
if genre_frames:
    movies_genres = pd.concat(genre_frames)

10 2020-11-16 13:10:33.345895
100 2020-11-16 13:10:33.574299
1000 2020-11-16 13:10:35.507142
3000 2020-11-16 13:10:40.138744
5000 2020-11-16 13:10:44.989812
10000 2020-11-16 13:10:58.959391
15000 2020-11-16 13:11:16.655088
20000 2020-11-16 13:11:37.026656
25000 2020-11-16 13:12:01.511250


In [22]:
# Attach each rated movie's title and genre string to the rating row;
# the left join keeps every rating.
user_genres_df = ratings_data.merge(movies_data, on='movieId', how='left')

In [23]:
user_genres_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [None]:
# Keep only the columns needed to count genres per user. Selecting them into a
# new frame (instead of dropping the others in place) keeps this cell
# re-runnable: a second in-place drop would raise KeyError because the dropped
# columns are already gone.
user_genres_df = user_genres_df[['userId', 'genres']]

In [None]:
user_genres_df.head()

In [None]:
# Find a user's most frequent genre, counted over the genres of every movie they rated
def get_popular_genre(userId):
    """Return the genre that occurs most often among the movies rated by `userId`.

    Reads the module-level `user_genres_df`, splits each matching `genres`
    string on '|', and counts every label once per rated movie. Ties are
    resolved by `Counter.most_common`. Raises IndexError when the user has no
    rows.
    """
    rated_genre_strings = user_genres_df.loc[user_genres_df['userId'] == userId, 'genres']
    tally = Counter()
    for genre_string in rated_genre_strings.tolist():
        tally.update(genre_string.split('|'))
    top_genre, _occurrences = tally.most_common(1)[0]
    return top_genre

In [None]:
get_popular_genre(10)

In [None]:
users_genres = pd.DataFrame(columns=['userId','genre'])

In [None]:
# Progress checkpoints for the user loop: a timestamp is printed whenever the
# iteration counter b2 reaches one of the values listed in a2.
a2 = [10,100,1000,5000,10000,25000,50000,75000,100000, 125000]
b2 = 0

In [None]:
# Record the most frequent genre of every user.
# Iterates over `users_df` (the frame built from the distinct userIds); the
# previous name `user_df` is not defined anywhere in the notebook.
# Rows are gathered in a list and turned into a DataFrame once, instead of
# calling DataFrame.append per user (quadratic, and removed in pandas 2.0).
user_genre_rows = []
for b2, x in enumerate(users_df['userId'].tolist(), start=1):
    # a2 lists the iteration counts at which a progress timestamp is printed
    if b2 in a2: print(b2, str(datetime.datetime.now()))
    user_genre_rows.append((x, get_popular_genre(x)))
# Built from scratch on every run, so re-running the cell does not duplicate rows.
# The result gets a plain 0..n-1 index (append left every row with index 0).
users_genres = pd.DataFrame(user_genre_rows, columns=['userId','genre'])

In [None]:
users_genres.head()

Similarity calculation

In [None]:
# Reshape the long (movieId, tagId, relevance) table into one row per movie
# with one relevance column per tagId; movieId is returned as a regular column.
scores_pivot = genome_scores_data.pivot_table(
    values="relevance",
    index=["movieId"],
    columns=["tagId"],
).reset_index()

In [None]:
scores_pivot.head()

In [None]:
# Left join so every movie is kept, including those without tag relevance scores.
mov_tag_df = movies_data.merge(scores_pivot, on='movieId', how='left')

In [None]:
# Keep movieId plus the tag-relevance columns and treat missing relevance as 0
# (movies with no genome scores come out of the left merge as NaN).
# The unused columns are dropped first so fillna does not scan them, and
# errors='ignore' lets the cell be re-run after they are already gone.
mov_tag_df = mov_tag_df.drop(columns=['title', 'genres'], errors='ignore').fillna(0)

In [None]:
mov_tag_df.head()

In [None]:
# Fresh movieId/genres frame that will receive one indicator column per genre.
mov_genres_df = movies_data.drop(columns=['title'])

In [None]:
mov_genres_df.head()

In [None]:
# Indicator helper: 1 when a genre label is present in a pipe-separated genre string, else 0
def set_genres(genres,col):
    """Return 1 if `genres` is one of the '|'-separated entries of `col`, otherwise 0.

    Note the argument order: `genres` is the single genre label to look for and
    `col` is the pipe-separated string to search. Only whole entries match,
    not substrings.
    """
    return int(genres in col.split('|'))

In [None]:
# Add one 0/1 indicator column per genre label to mov_genres_df.
# `genres` is the list of labels defined earlier in the notebook; its order
# gives the column order (Action ... Western, "(no genres listed)").
# A single loop replaces nineteen copy-pasted assignments, and applying over the
# `genres` column alone avoids building a row object for every movie.
for genre_name in genres:
    mov_genres_df[genre_name] = mov_genres_df['genres'].apply(
        lambda movie_genres, genre_name=genre_name: set_genres(genre_name, movie_genres)
    )