In [1]:
import numpy as np
import pandas as pd
from collections import Counter

# Cleaning and preprocessing

In [2]:
df = pd.read_csv("vodclickstream_uk_movies_03.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [4]:
df.shape

(671736, 8)

## datetime

In [5]:
len(df[df["datetime"].isna()])

0

In [6]:
min(df["datetime"].unique())

'2017-01-01 00:02:21'

In [7]:
max(df["datetime"].unique())

'2019-06-30 23:59:20'

## duration

In [8]:
len(df[df["duration"].isna()])

0

In [9]:
min(df["duration"].unique())

-1.0

In [10]:
max(df["duration"].unique())

18237253.0

In [11]:
df = df[df["duration"] >= 0]

## title

In [12]:
len(df[df["title"].isna()])

0

In [13]:
len(df[df["title"]==" "])

0

In [14]:
len(df["title"].unique())

7874

## genres

In [15]:
len(df[df["genres"].isna()])

0

In [16]:
len(df[df["genres"]==" "])

0

In [17]:
df = df[df["genres"] != "NOT AVAILABLE"]

## release_date

In [18]:
len(df[df["release_date"].isna()])

0

In [19]:
min(df["release_date"].unique())

'1920-10-01'

In [20]:
max(df["release_date"].unique())

'NOT AVAILABLE'

In [21]:
#Get only the data since Netflix was created
df = df[df["release_date"] >= "2007-01-16"]
df = df[df["release_date"] != "NOT AVAILABLE"]

## movie_id

In [22]:
len(df[df["movie_id"].isna()])

0

In [23]:
len(df["movie_id"].unique())

5442

## user_id

In [24]:
len(df[df["user_id"].isna()])

0

In [25]:
len(df["user_id"].unique())

137665

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
5,58778,2017-01-01 19:21:37,0.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,c5bf4f3f57


# Recommendation system

In [27]:
#MIRAAR
distinct_movie_genre = df[['movie_id', 'genres', "title"]].drop_duplicates()
clicks = df.groupby(['user_id', 'movie_id']).size().reset_index(name='Number_of_clicks')
movie_genre = clicks.sort_values(by='Number_of_clicks', ascending=False).groupby('user_id').head(10).reset_index(drop=True)
movie_genre = pd.merge(movie_genre, distinct_movie_genre, on = "movie_id", how = "left")
movie_genre["genres"] = movie_genre["genres"].str.split(", ")

In [28]:
movie_genre.head()

Unnamed: 0,user_id,movie_id,Number_of_clicks,genres,title
0,7cdfd0e14a,40bccd3001,88,"[Drama, Fantasy, Romance]",Twilight
1,e06f0be797,3f3b34e56f,54,"[Action, Comedy, Crime, Thriller]",Rush Hour 3
2,59416738c3,cbdf9820bc,54,"[Comedy, Romance]",The Ex
3,49d091aa63,b8a2658c23,48,"[Comedy, Romance, Sport]",Chalet Girl
4,3675d9ba4a,948f2b5bf6,42,"[Drama, Romance, Sci-Fi, Thriller]",Passengers


In [29]:
list_of_genres = list(movie_genre["genres"])
unique_genres = set(genre for genres in list_of_genres for genre in genres)
unique_genres_list = list(unique_genres)
genre_dict = {genre: i for i, genre in enumerate(sorted(unique_genres_list))}

In [30]:
users = movie_genre["user_id"].unique()
users_dict = {user: i for i, user in enumerate(sorted(users))}
inverted_users_dict = {value: key for key, value in users_dict.items()}

In [31]:
def matrix_representation_users_genres(users_dict, genre_dict, movie_genre):
    df_shape = movie_genre.shape[0]
    rows = len(genre_dict)
    cols = len(users_dict)
    matrix_representation = np.zeros((rows, cols), dtype = int)
    
    for i in range(df_shape):    
        user = movie_genre.iloc[i][0]
        genres = movie_genre.iloc[i][3]
        for genre in genres:
            matrix_representation[genre_dict[genre], users_dict[user]] = 1
    return matrix_representation

In [32]:
def signature_matrix_minhash(n_hashes, hash_function, matrix_representation):
    np.random.seed(41)
    cols = len(matrix_representation[0])
    signature_matrix = np.full((n_hashes, cols), np.inf)
    a_b = [(round(np.random.uniform(0, 97)), round(np.random.uniform(0, 97))) for _ in range(n_hashes)]
    for r in range(len(matrix_representation)):

        hashes = [hash_function(a_b[i][0], r, a_b[i][1]) for i in range(n_hashes)]

        cols_with_one = list(np.nonzero(matrix_representation[r])[0])

        for col in cols_with_one:
            for h in range(n_hashes):
                if signature_matrix[h, col] > hashes[h]:
                    signature_matrix[h, col] = hashes[h]
    return signature_matrix

In [33]:
#MIRAAAAAR
def hashing_function(bucket):
    hash_result = 0
    prime = 31  # Puedes elegir cualquier número primo

    for elemento in bucket:
        elemento_hash = hash(elemento)
        
        hash_result = hash_result * prime + elemento_hash

    return hash_result

In [34]:
def lsh(signature_matrix, rows, inverted_dict):
    buckets = {}
    signature_matrix = signature_matrix.T
    for index, row in enumerate(signature_matrix):
        for n in range(0, len(row),rows):
            band = row[n:n+rows]
            hashed_value = hashing_function(band)
            if hashed_value in buckets:
                buckets[hashed_value].append(inverted_dict[index])
            else:
                buckets[hashed_value] = [inverted_dict[index]]
    return buckets

In [35]:
def most_common_user(user_id, bucket):
    buckets_user = []
    users = []
    for bucket in buckets.values():
        if user_id in bucket:
            buckets_user.append(bucket)
    for bucket in buckets_user:
        users.append(bucket)
    users = list(np.concatenate(users))
    counts = Counter(users).most_common()
    return (counts[0][0], counts[1][0])

In [38]:
def get_films(common_users, movie_genre):
    user1 = common_users[0]
    user2 = common_users[1]
    final_df = pd.DataFrame(columns=["user_id", "title", "Number_of_clicks"])
    films_to_show = 5

    # 1. Movies in common based on number_clicks
    df_movies1 = movie_genre[(movie_genre["user_id"] == user1) & (movie_genre["user_id"] == user2)]
    df_movies1 = df_movies1.groupby("movie_id")["Number_of_clicks"].sum().reset_index()
    df_movies1 = df_movies1.sort_values(by="Number_of_clicks", ascending=False)
    if not df_movies1.empty:
        final_df = pd.concat([final_df, df_movies1[["user_id", "title", "Number_of_clicks"]]])

    # 2. Most clicked movies by the first user
    df_movies2 = movie_genre[movie_genre["user_id"] == user1].sort_values(by="Number_of_clicks", ascending=False)
    if not df_movies2.empty:
        final_df = pd.concat([final_df, df_movies2[["user_id", "title", "Number_of_clicks"]]])

    # 3. Most clicked movies by the second user
    df_movies3 = movie_genre[movie_genre["user_id"] == user2].sort_values(by="Number_of_clicks", ascending=False)
    if not df_movies3.empty:
        final_df = pd.concat([final_df, df_movies3[["user_id", "title", "Number_of_clicks"]]])

    return final_df.drop_duplicates()

In [41]:
matrix_representation = matrix_representation_users_genres(users_dict, genre_dict, movie_genre)
n_hashes = 20
hash_function = lambda a, x, b : (a * x + b) % 31
signature_matrix = signature_matrix_minhash(n_hashes, hash_function, matrix_representation)
buckets = lsh(signature_matrix, 4, inverted_users_dict)
user_id = "49d091aa63"
mc_users = most_common_user(user_id, buckets)
df_films = get_films(mc_users, movie_genre)
df_films.head(5)[["user_id", "title"]]

Unnamed: 0,user_id,title
253803,04ebe56321,Set It Up
253808,04ebe56321,Bring It On: Fight to the Finish
253215,05a954a3a0,Anchorman: The Legend Continues
253216,05a954a3a0,The Love Guru
