In [2]:
### IMPORTS ###

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

## Analysis of movies

In [3]:
# load movie data
movies = pd.read_pickle('data/imdb/ml_movies_description.pkl')

movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,A cowboy doll is profoundly threatened and jea...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,113497,When two kids find and play a magical board ga...
2,6,Heat (1995),Action|Crime|Thriller,113277,949.0,113277,A group of high-end professional thieves start...
3,8,Tom and Huck (1995),Adventure|Children,112302,45325.0,112302,Two best friends witness a murder and embark o...
4,9,Sudden Death (1995),Action,114576,9091.0,114576,A former fireman takes on a group of terrorist...


In [4]:
# get a list of unique movie ids
movie_ids_imdb = movies['movieId'].unique()

# print the number of unique movie ids
print('Number of unique movie ids: ', len(movie_ids_imdb))

Number of unique movie ids:  51860


## Analysis of ratings

In [5]:
# load ratings
ratings = pd.read_csv('data/ml-32m/ratings.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [6]:
# get a list of unique user ids
movie_ids_ml = ratings['movieId'].unique()

# print the number of unique user ids
print('Number of unique user ids: ', len(movie_ids_ml))

Number of unique user ids:  84432


In [7]:
### DATA PREPROCESSING ###

# get the intersection of movie ids
movie_ids = np.intersect1d(movie_ids_imdb, movie_ids_ml)
print('Number of common movie ids: ', len(movie_ids))

# filter the ratings data to contain only the common movie ids
ratings = ratings[ratings['movieId'].isin(movie_ids)]

# drop timestamp column
ratings = ratings.drop('timestamp', axis=1)

# print the number of ratings
print('Number of ratings: ', len(ratings))

# get the number of unique users
user_ids = ratings['userId'].unique()

# print the number of unique users
print('Number of unique users: ', len(user_ids))

Number of common movie ids:  49892
Number of ratings:  25723135
Number of unique users:  200947


In [8]:
### CREATE LIKED MOVIES DATAFRAME ###
# create a dataframe to only store ratings of 4 or 5
liked_movies = ratings[ratings['rating'] >= 4]

# print the number of liked movies
print('Number of ratings 4 or 5: ', len(liked_movies))

liked_movies.head()



Number of ratings 4 or 5:  12782088


Unnamed: 0,userId,movieId,rating
3,1,30,5.0
4,1,32,5.0
9,1,111,5.0
11,1,166,5.0
15,1,260,5.0


In [9]:
### Create basket data ###
# create a basket data
baskets = liked_movies.groupby('userId')['movieId'].apply(list).reset_index(name='basket')
items = np.unique(np.concatenate(baskets['basket']))

In [10]:
# create a a DataFrame to hash items to integers
#df_item_hash = pd.DataFrame({'item': items, 'hashcode': range(len(items))}).set_index('item')
df_item_hash = pd.DataFrame(range(len(items)), index=items, columns=['hashcode'])

df_item_hash.head()

Unnamed: 0,hashcode
1,0
2,1
6,2
8,3
9,4


In [11]:
# Count the occurrences of each item and store in an array
### SLOW ###
#item_count_arr = np.zeros((len(items), 1))
#for basket in tqdm(baskets['basket']):
#    for item in basket:
#        idx = df_item_hash.loc[item, 'hashcode']
#        item_count_arr[idx] += 1

# Count the occurrences of each item and store in an array
item_count_arr = np.zeros((len(items), 1))

# Flatten the list of baskets and get the corresponding hashcodes
flattened_baskets = np.concatenate(baskets['basket'].values)
hashcodes = df_item_hash.loc[flattened_baskets, 'hashcode'].values

# Use np.bincount to count occurrences of each hashcode
item_count_arr[:len(np.bincount(hashcodes))] = np.bincount(hashcodes).reshape(-1, 1)

In [12]:
# find frequent items (items that appear in more than 0.5% of the baskets)
freq_items = np.array([df_item_hash[df_item_hash['hashcode'] == x].index[0] for x in tqdm(np.where(item_count_arr > 0.005 * len(baskets))[0])])
freq_items

100%|██████████| 1965/1965 [00:00<00:00, 17884.62it/s]


array([     1,      2,      6, ..., 270698, 274053, 278702])

In [13]:
### THIS IS SLOW ###

### hash the frequent items (starting from 1)
#df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])

# create a matrix to store the pair counts
#pair_mat_hashed = np.zeros((len(freq_items)+1, len(freq_items)+1))

#for b in tqdm(baskets['basket']):
#    cand_list = [item for item in b if item in freq_items]
#    if len(cand_list)<2:
#        continue
#    for idx, item1 in enumerate(cand_list):
#        for item2 in cand_list[idx+1:]:
#            i = df_freq_item_hash.loc[item1,'hashcode'] 
#            j = df_freq_item_hash.loc[item2,'hashcode'] 
#            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
#pair_mat_hashed

In [14]:
# Create a matrix to store the pair counts
pair_mat_hashed = np.zeros((len(freq_items)+1, len(freq_items)+1))

# Create a dictionary for quick lookup of hashcodes
df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])

hashcode_dict = df_freq_item_hash['hashcode'].to_dict()

for b in tqdm(baskets['basket']):
    # Filter items in the basket to only those in hashcode_dict
    cand_list = [hashcode_dict[item] for item in b if item in hashcode_dict]
    if len(cand_list) < 2:
        continue
    
    # Convert cand_list to a numpy array
    cand_list = np.array(cand_list)
    
    # Get unique pairs of indices
    i_indices = np.maximum.outer(cand_list, cand_list)
    j_indices = np.minimum.outer(cand_list, cand_list)
    
    # Increment only unique pairs (i, j) where i > j
    unique_pairs = np.triu_indices_from(i_indices, k=1)
    pair_mat_hashed[i_indices[unique_pairs], j_indices[unique_pairs]] += 1

# Display the pair matrix
pair_mat_hashed


100%|██████████| 200612/200612 [00:23<00:00, 8418.15it/s] 


array([[   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0., 4920.,    0., ...,    0.,    0.,    0.],
       ...,
       [   0.,  976.,  319., ...,    0.,    0.,    0.],
       [   0.,  724.,  241., ...,  707.,    0.,    0.],
       [   0.,  417.,  176., ...,  601.,  358.,    0.]])

In [None]:
# Get frequent pairs (pairs that appear in more than 0.05% of the baskets - support = 0.0005)
# Extract frequent pairs that exceed support s2 (assume s2 = 0.02), and hash back.

# Find indices where pair counts exceed the threshold
pair_indices = np.where(pair_mat_hashed > 0.05 * len(baskets))

# Extract frequent pairs and hash back
freq_pairs = np.array([
    [df_freq_item_hash.index[x-1], df_freq_item_hash.index[y-1]]
    for x, y in zip(pair_indices[0], pair_indices[1])
])

print('Number of frequent pairs: ', len(freq_pairs))

# make this into a list of tuples
freq_pairs = [tuple(x) for x in freq_pairs]

(array([  15,   21,   21, ..., 1832, 1832, 1877]), array([   1,    1,    3, ..., 1616, 1770, 1616]))
Number of frequent pairs:  2223


In [None]:
# calculate the support of each frequent pair
pair_support = pair_mat_hashed[pair_indices] / len(baskets)

# also calculate the confidence of each frequent pair (confidence = support(pair) / support(item1))


IndexError: index 2011 is out of bounds for axis 0 with size 1965

In [59]:
# create a DataFrame to store the frequent pairs and their support
df_freq_pairs = pd.DataFrame(freq_pairs, columns=['item1', 'item2'])
df_freq_pairs['support'] = pair_support.flatten()

# make one column called itemsets that contains the frequent pairs as tuples (frozen sets)
df_freq_pairs['itemsets'] = df_freq_pairs[['item1', 'item2']].apply(lambda x: frozenset(x), axis=1)

# drop the item1 and item2 columns
df_freq_pairs = df_freq_pairs.drop(['item1', 'item2'], axis=1)

df_freq_pairs.head()


Unnamed: 0,support,itemsets
0,0.06869,"(32, 1)"
1,0.079522,"(1, 47)"
2,0.054937,"(6, 47)"
3,0.096455,"(32, 47)"
4,0.086266,"(1, 50)"


In [33]:
### ASSOCIATION RULE MINING WITH APRIORI ###
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [35]:
df_subset = liked_movies
transactions = df_subset.groupby('userId')['movieId'].apply(list).values

# Use TransactionEncoder to convert the data into a binary matrix
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
binary_df = pd.DataFrame(te_ary, columns=te.columns_)

In [41]:
# frequent itemsets
min_support = 0.1
frequent_itemsets = apriori(binary_df, min_support=min_support, use_colnames=True)
print("Frequent Itemsets (with more than one element):")
print(frequent_itemsets)

Frequent Itemsets (with more than one element):
      support                   itemsets
0    0.226896                        (1)
1    0.184765                       (32)
2    0.237977                       (47)
3    0.276030                       (50)
4    0.239422                      (110)
..        ...                        ...
394  0.114151    (5952, 4993, 260, 7153)
395  0.103812    (296, 4993, 5952, 7153)
396  0.110781   (5952, 4993, 1196, 7153)
397  0.140620   (5952, 7153, 4993, 2571)
398  0.105966  (5952, 4993, 7153, 58559)

[399 rows x 2 columns]


# Generating association rules for movies

In [None]:
# Generate association rules 
rules_apriori = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5, num_itemsets=len(transactions))

  jaccard_metric = numerator / denominator


In [50]:
rules_apriori

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(1),(260),0.226896,0.318321,0.120332,0.530340,1.666053,1.0,0.048106,1.451430,0.517109,0.283210,0.311024,0.454180
1,(32),(296),0.184765,0.385710,0.122724,0.664221,1.722074,1.0,0.051459,1.829446,0.514336,0.274092,0.453386,0.491199
2,(32),(593),0.184765,0.351689,0.104984,0.568203,1.615640,1.0,0.040004,1.501425,0.467411,0.243317,0.333966,0.433358
3,(47),(50),0.237977,0.276030,0.135675,0.570118,2.065418,1.0,0.069986,1.684112,0.676930,0.358613,0.406215,0.530820
4,(47),(296),0.237977,0.385710,0.169586,0.712616,1.847545,1.0,0.077796,2.137524,0.602004,0.373455,0.532169,0.576144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,"(5952, 7153, 58559)",(4993),0.111753,0.272137,0.105966,0.948214,3.484321,1.0,0.075554,14.055086,0.802705,0.381275,0.928851,0.668799
503,"(4993, 7153, 58559)",(5952),0.114624,0.248544,0.105966,0.924462,3.719503,1.0,0.077477,9.948025,0.825804,0.411993,0.899478,0.675404
504,"(5952, 58559)","(4993, 7153)",0.119878,0.218078,0.105966,0.883945,4.053350,1.0,0.079823,6.737531,0.855893,0.456768,0.851578,0.684927
505,"(4993, 58559)","(5952, 7153)",0.127161,0.212500,0.105966,0.833320,3.921512,1.0,0.078944,4.724631,0.853532,0.453436,0.788343,0.665992


In [61]:
len(rules_apriori)

507