In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

In [2]:
data_movies = pd.read_csv('ml-32m/movies.csv')
data_ratings = pd.read_csv('ml-32m/ratings.csv', on_bad_lines='skip')

In [3]:
data_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
data_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [5]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [6]:
data_movies.movieId.value_counts(),print('Number of duplicated unique ids are: ',data_movies.movieId.duplicated().sum())

Number of duplicated unique ids are:  0


(movieId
 292757    1
 292427    1
 292467    1
 292469    1
 292471    1
          ..
 5         1
 6         1
 7         1
 8         1
 9         1
 Name: count, Length: 87585, dtype: int64,
 None)

In [7]:
data_ratings.movieId.value_counts()

movieId
318       102929
356       100296
296        98409
2571       93808
593        90330
           ...  
206505         1
229167         1
138948         1
121961         1
137537         1
Name: count, Length: 84432, dtype: int64

In [8]:
# Filter for high ratings (>=4)
high_ratings = data_ratings[data_ratings['rating'] >= 4]

In [9]:
high_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
3,1,30,5.0,944249077
4,1,32,5.0,943228858
7,1,80,5.0,944248943
9,1,111,5.0,944249008
...,...,...,...,...
32000195,200948,72998,5.0,1350423889
32000196,200948,74458,4.5,1350423822
32000197,200948,76093,5.0,1287223498
32000199,200948,79702,4.5,1294412589


In [10]:
merge_df = data_movies.merge(high_ratings,on = 'movieId',how = 'inner')
merge_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.0,1027305751
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,20,5.0,1553184230
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0,945353745
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,28,4.0,961438127
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,33,5.0,1226362069
...,...,...,...,...,...,...
15938226,292607,The Invention of the Other (2022),Documentary,105979,4.5,1696931152
15938227,292609,The Wind Blows the Border (2022),Documentary,105979,4.0,1696931227
15938228,292611,Kenya (2022),Documentary,105979,4.0,1696931259
15938229,292731,The Monroy Affaire (2022),Drama,50685,4.0,1697074779


In [11]:
merge_df.drop(columns=['timestamp'], inplace=True)

Movie-to-Movie Association Rules

In [12]:
# Group movies by user and prepare for Transaction Encoding
user_movies = merge_df.groupby(by="userId")["title"].apply(list).tolist()

In [13]:
# Transaction encoding for movie titles
te = TransactionEncoder()
te_movies = te.fit(user_movies).transform(user_movies)
df_movies = pd.DataFrame(te_movies, columns=te.columns_)

In [14]:
# FP-Growth with a chosen min_support
min_support = 0.08  
movie_itemsets = fpgrowth(df_movies, min_support=min_support, use_colnames=True, max_len=2)

In [15]:
len(movie_itemsets)

912

In [16]:
# Generate association rules and filter based on confidence and support
min_confidence = 0.6  
movie_rules = association_rules(movie_itemsets, metric="confidence", min_threshold=min_confidence)

In [17]:
# To make sure one item on the left and right sides for each rule
movie_rules = movie_rules[(movie_rules['antecedents'].apply(len) == 1) & 
                          (movie_rules['consequents'].apply(len) == 1)]

In [18]:
len(movie_rules)

255

In [21]:
sorted_by_lift = movie_rules.sort_values(by='lift', ascending=False).head(20)
print("Top 20 movie rules sorted by lift:\n", sorted_by_lift[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

Top 20 movie rules sorted by lift:
                                            antecedents  \
156                         (Kill Bill: Vol. 2 (2004))   
155                         (Kill Bill: Vol. 1 (2003))   
27                                      (Alien (1979))   
26                                     (Aliens (1986))   
237                    (Dark Knight Rises, The (2012))   
232                              (Interstellar (2014))   
126    (Lord of the Rings: The Two Towers, The (2002))   
127  (Lord of the Rings: The Return of the King, Th...   
8                     (Godfather: Part II, The (1974))   
200        (Indiana Jones and the Last Crusade (1989))   
229                             (Batman Begins (2005))   
129    (Lord of the Rings: The Two Towers, The (2002))   
130  (Lord of the Rings: The Fellowship of the Ring...   
125  (Lord of the Rings: The Fellowship of the Ring...   
124  (Lord of the Rings: The Return of the King, Th...   
221                                 

In [22]:
# Sort by leverage and display the top 5 rules
sorted_by_leverage = movie_rules.sort_values(by='leverage', ascending=False).head(20)
print("Top 20 movie rules sorted by leverage:\n", sorted_by_leverage[['antecedents', 'consequents', 'support', 'confidence', 'leverage']])

Top 20 movie rules sorted by leverage:
                                            antecedents  \
130  (Lord of the Rings: The Fellowship of the Ring...   
129    (Lord of the Rings: The Two Towers, The (2002))   
126    (Lord of the Rings: The Two Towers, The (2002))   
127  (Lord of the Rings: The Return of the King, Th...   
124  (Lord of the Rings: The Return of the King, Th...   
125  (Lord of the Rings: The Fellowship of the Ring...   
1          (Star Wars: Episode IV - A New Hope (1977))   
0    (Star Wars: Episode V - The Empire Strikes Bac...   
56   (Star Wars: Episode VI - Return of the Jedi (1...   
55         (Star Wars: Episode IV - A New Hope (1977))   
58   (Star Wars: Episode VI - Return of the Jedi (1...   
57   (Star Wars: Episode V - The Empire Strikes Bac...   
8                     (Godfather: Part II, The (1974))   
49   (Star Wars: Episode V - The Empire Strikes Bac...   
50   (Raiders of the Lost Ark (Indiana Jones and th...   
200        (Indiana Jones and th

In [23]:
# Sort by lift first, then leverage for a combined ranking
final_sorted_rules = movie_rules.sort_values(by=['lift', 'leverage'], ascending=[False, False])

In [24]:
final_sorted_rules.to_csv('final_movie_rules.csv', index=False)

In [25]:
# Display top 20 ranked rules
top_ranked_movie_rules = final_sorted_rules.head(20)
print("Top 20 Movie-to-Movie Rules Ranked by Interestingness:\n", 
      top_ranked_movie_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']])

Top 20 Movie-to-Movie Rules Ranked by Interestingness:
                                            antecedents  \
155                         (Kill Bill: Vol. 1 (2003))   
156                         (Kill Bill: Vol. 2 (2004))   
26                                     (Aliens (1986))   
27                                      (Alien (1979))   
237                    (Dark Knight Rises, The (2012))   
232                              (Interstellar (2014))   
126    (Lord of the Rings: The Two Towers, The (2002))   
127  (Lord of the Rings: The Return of the King, Th...   
8                     (Godfather: Part II, The (1974))   
200        (Indiana Jones and the Last Crusade (1989))   
229                             (Batman Begins (2005))   
129    (Lord of the Rings: The Two Towers, The (2002))   
130  (Lord of the Rings: The Fellowship of the Ring...   
124  (Lord of the Rings: The Return of the King, Th...   
125  (Lord of the Rings: The Fellowship of the Ring...   
221             

Genre-to-Genre Association Rules

In [29]:
# genres for genre-level association mining
data_movies['genres'] = data_movies['genres'].str.split('|')
exploded_movies = data_movies.explode('genres')

In [30]:
# Merge with high ratings to get genres for highly rated movies
merged_genres = high_ratings.merge(exploded_movies[['movieId', 'genres']], on='movieId')

In [31]:
# Group genres by user and prepare for Transaction Encoding
user_genres = merged_genres.groupby(by="userId")["genres"].apply(list).tolist()

In [32]:
# Transaction encoding for genres
te_genres = TransactionEncoder()
te_genres_ary = te_genres.fit(user_genres).transform(user_genres)
df_genres = pd.DataFrame(te_genres_ary, columns=te_genres.columns_)

In [33]:
# FP-Growth for genre associations
genre_itemsets = fpgrowth(df_genres, min_support=min_support, use_colnames=True, max_len=2)

In [34]:
len(genre_itemsets)

190

In [35]:
# Generate genre-to-genre association rules and filter based on confidence
genre_rules = association_rules(genre_itemsets, metric="confidence", min_threshold=min_confidence)
# Filter for single item antecedents and consequents
genre_rules = genre_rules[(genre_rules['antecedents'].apply(len) == 1) & 
                          (genre_rules['consequents'].apply(len) == 1)]

In [36]:
len(genre_rules)

291

In [40]:
# Rank by interestingness metrics such as lift, leverage
ranked_genre_rules = genre_rules.sort_values(by=['lift', 'leverage'], ascending=[False, False])

In [41]:
ranked_genre_rules.to_csv('final_genre_rules.csv', index=False)

In [42]:
# Display top 20 ranked rules for genre-to-genre
top_ranked_genre_rules = ranked_genre_rules.head(20)
print("Top 20 Genre-to-Genre Rules Ranked by Interestingness:\n", 
      top_ranked_genre_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']])

Top 20 Genre-to-Genre Rules Ranked by Interestingness:
        antecedents  consequents   support  confidence      lift  leverage  \
188    (Film-Noir)    (Western)  0.239859    0.725473  1.401484  0.068713   
288  (Documentary)    (Western)  0.199710    0.696076  1.344695  0.051193   
286  (Documentary)    (Musical)  0.225173    0.784824  1.302703  0.052322   
190    (Film-Noir)    (Musical)  0.246465    0.745453  1.237353  0.047278   
186    (Film-Noir)     (Horror)  0.297356    0.899375  1.233822  0.056352   
208     (Children)  (Animation)  0.698594    0.901085  1.226487  0.129005   
209    (Animation)   (Children)  0.698594    0.950872  1.226487  0.129005   
264    (Animation)    (Musical)  0.536976    0.730889  1.213179  0.094357   
265      (Musical)  (Animation)  0.536976    0.891308  1.213179  0.094357   
261      (Musical)   (Children)  0.559036    0.927925  1.196889  0.091962   
260     (Children)    (Musical)  0.559036    0.721075  1.196889  0.091962   
290  (Documentary)  