In [66]:
import numpy as np
import pandas as pd

# Cleaning and preprocessing

In [2]:
df = pd.read_csv("vodclickstream_uk_movies_03.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [4]:
df.shape

(671736, 8)

## datetime

In [5]:
len(df[df["datetime"].isna()])

0

In [6]:
min(df["datetime"].unique())

'2017-01-01 00:02:21'

In [7]:
max(df["datetime"].unique())

'2019-06-30 23:59:20'

## duration

In [8]:
len(df[df["duration"].isna()])

0

In [9]:
min(df["duration"].unique())

-1.0

In [10]:
max(df["duration"].unique())

18237253.0

In [11]:
df = df[df["duration"] >= 0]

## title

In [12]:
len(df[df["title"].isna()])

0

In [13]:
len(df[df["title"]==" "])

0

In [14]:
len(df["title"].unique())

7874

## genres

In [15]:
len(df[df["genres"].isna()])

0

In [16]:
len(df[df["genres"]==" "])

0

In [17]:
len(df["genres"].unique())

1184

In [18]:
df = df[df["genres"] != "NOT AVAILABLE"]

## release_date

In [19]:
len(df[df["release_date"].isna()])

0

In [20]:
min(df["release_date"].unique())

'1920-10-01'

In [21]:
max(df["release_date"].unique())

'NOT AVAILABLE'

In [22]:
#Get only the data since Netflix was created
df = df[df["release_date"] >= "2007-01-16"]
df = df[df["release_date"] != "NOT AVAILABLE"]

## movie_id

In [23]:
len(df[df["movie_id"].isna()])

0

In [24]:
len(df["movie_id"].unique())

5442

## user_id

In [25]:
len(df[df["user_id"].isna()])

0

In [26]:
len(df["user_id"].unique())

137665

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
5,58778,2017-01-01 19:21:37,0.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,c5bf4f3f57


# Recommendation system

In [41]:
rows_per_film = df.groupby('movie_id').size().reset_index(name='Number_of_clicks')
rows_per_fil = rows_per_film[rows_per_film["Number_of_clicks"] > 10]
movie_genre = pd.merge(rows_per_film, df, on = "movie_id", how = "inner")
movie_genre = movie_genre[["user_id", "genres"]].drop_duplicates()
movie_genre["genres"] = movie_genre["genres"].apply(lambda x : x.split(", "))

In [44]:
movie_genre

Unnamed: 0,user_id,genres
0,96a9600666,"[Documentary, Comedy]"
1,065831ce4a,"[Documentary, Comedy]"
2,d6cee7e9af,"[Documentary, Comedy]"
4,33f0d36bd3,"[Documentary, Comedy]"
8,18c10ac8e5,"[Action, Crime, Drama, Thriller]"
...,...,...
510852,d171a73b89,"[Action, Adventure, Comedy, Fantasy, Sci-Fi, T..."
510853,7c3588e342,"[Action, Adventure, Comedy, Fantasy, Sci-Fi, T..."
510854,65c971bdec,"[Action, Adventure, Comedy, Fantasy, Sci-Fi, T..."
510855,10a9415b24,"[Action, Adventure, Comedy, Fantasy, Sci-Fi, T..."


In [61]:
list_of_genres = list(movie_genre["genres"])
unique_genres = set(genre for genres in list_of_genres for genre in genres)
unique_genres_list = list(unique_genres)
unique_genres_list

['War',
 'Action',
 'Talk-Show',
 'Horror',
 'Thriller',
 'Documentary',
 'Reality-TV',
 'History',
 'Biography',
 'Music',
 'Family',
 'Animation',
 'Comedy',
 'Sport',
 'News',
 'Fantasy',
 'Short',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Musical',
 'Mystery',
 'Crime',
 'Western']

In [62]:
# Create a dictionary with genre as key and corresponding number as value
genre_dict = {genre: i for i, genre in enumerate(sorted(unique_genres_list))}

# Print the resulting dictionary
print(genre_dict)

{'Action': 0, 'Adventure': 1, 'Animation': 2, 'Biography': 3, 'Comedy': 4, 'Crime': 5, 'Documentary': 6, 'Drama': 7, 'Family': 8, 'Fantasy': 9, 'History': 10, 'Horror': 11, 'Music': 12, 'Musical': 13, 'Mystery': 14, 'News': 15, 'Reality-TV': 16, 'Romance': 17, 'Sci-Fi': 18, 'Short': 19, 'Sport': 20, 'Talk-Show': 21, 'Thriller': 22, 'War': 23, 'Western': 24}


In [63]:
users = movie_genre["user_id"].unique()
users_dict = {user: i for i, user in enumerate(sorted(users))}

In [64]:
users_dict

{'000052a0a0': 0,
 '000090e7c8': 1,
 '000296842d': 2,
 '0002aab109': 3,
 '0002abf14f': 4,
 '0002d1c4b1': 5,
 '0005c8fbac': 6,
 '0005d9a8f4': 7,
 '0006b97ace': 8,
 '0006e547fc': 9,
 '0006ea6b5c': 10,
 '00071a0dfb': 11,
 '0007fc8621': 12,
 '000800c223': 13,
 '00087625aa': 14,
 '0008c31833': 15,
 '0008d919a5': 16,
 '000a3ec774': 17,
 '000b048ccf': 18,
 '000b217ed0': 19,
 '000b3fbc4e': 20,
 '000b4a3b02': 21,
 '000c785b0a': 22,
 '000ce80153': 23,
 '000de1ca63': 24,
 '000f073c97': 25,
 '000fd42216': 26,
 '0010181969': 27,
 '001067d167': 28,
 '0011f2035b': 29,
 '00124b8483': 30,
 '0012a3424e': 31,
 '0012a95d5f': 32,
 '0012bde289': 33,
 '0012daeac7': 34,
 '0013a2953e': 35,
 '0013d3412e': 36,
 '0014db9c5e': 37,
 '0016c962c8': 38,
 '001723de35': 39,
 '00173f2eaf': 40,
 '00176a8729': 41,
 '0018190564': 42,
 '00184fcc7f': 43,
 '001867cb3c': 44,
 '001916d8b3': 45,
 '0019989273': 46,
 '0019a31164': 47,
 '0019b29bcf': 48,
 '001b006cf8': 49,
 '001b9ab1df': 50,
 '001bc7d2af': 51,
 '001c212529': 52,
 '0

In [207]:
rows = len(genre_dict)
cols = len(users_dict)
df_shape = movie_genre.shape[0]
matrix_representation = np.zeros((rows, cols + 1), dtype = int)

In [208]:
for i in range(df_shape):    
    user = movie_genre.iloc[i][0]
    genres = movie_genre.iloc[i][1]
    for genre in genres:
        matrix_representation[genre_dict[genre], users_dict[user]] = 1

In [209]:
n_hashes = 2
rows_list = np.arange(0, rows)
hash1 = lambda x : (58568 * x + 83283) % rows
hash2 = lambda x : (34218 * x + 78586) % rows
hash1(rows_list)
hash2(rows_list)

array([11,  4, 22, 15,  8,  1, 19, 12,  5, 23, 16,  9,  2, 20, 13,  6, 24,
       17, 10,  3, 21, 14,  7,  0, 18], dtype=int32)

In [216]:
signature_matrix = np.full((2, cols), np.inf)

In [211]:
signature_matrix

array([[inf, inf, inf, ..., inf, inf, inf],
       [inf, inf, inf, ..., inf, inf, inf]])

In [212]:
for r in range(len(matrix_representation)):
    h1 = hash1(r)
    h2 = hash2(r)
    
    cols_with_one = list(np.nonzero(matrix_representation[r])[0])
    
    for col in cols_with_one:
        if signature_matrix[0, col] > h1:
            signature_matrix[0, col] = h1
        elif signature_matrix[1, col] > h2:
            signature_matrix[1, col] = h2

In [214]:
signature_matrix

array([[ 1.,  4.,  4., ...,  4.,  5.,  4.],
       [ 1., inf, 13., ..., inf,  0., 13.]])