In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [6]:
# Step 1: Prepare the data
# How many columns are there?
# How are values separated?
# How are tabs represented in Python?
# Which text encoding does the file use?

In [8]:
# Labels applied, in order, to the columns of the ratings file when it is read.
column_names = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

In [11]:
# Read the tab-separated ratings file, naming the columns on load.
user_ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=column_names,
    encoding='utf-8',
)

In [12]:
# Display the loaded ratings frame as a sanity check on the parse.
user_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [13]:
# Labels for the two item-file columns that are kept when the file is read.
item_column_names = [
    'item_id',
    'title',
]

In [18]:
# Read the pipe-delimited item file (latin-1 encoded), keeping only its
# first two fields and labelling them with item_column_names.
movie_titles = pd.read_csv(
    'u.item',
    sep='|',
    names=item_column_names,
    encoding='latin-1',
    usecols=[0, 1],
)

In [19]:
# Display the item_id -> title lookup frame.
movie_titles

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [21]:
# Attach each rating's movie title by joining on the shared item_id key
# (default inner join: only item_ids present in both frames are kept).
df_merged_user_film = user_ratings.merge(movie_titles, on='item_id')

In [22]:
# Preview the first rows of the merged ratings/titles frame.
df_merged_user_film.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [23]:
# Step 2: Analysis and Exploration

In [25]:
# Number of ratings each title received: group rows by title, then count
# the non-null entries of the 'rating' column within each group.
ratings_count = (
    df_merged_user_film
    .groupby('title')['rating']
    .count()
)

In [26]:
# Display the per-title rating counts.
ratings_count

title
'Til There Was You (1997)                  9
1-900 (1994)                               5
101 Dalmatians (1996)                    109
12 Angry Men (1957)                      125
187 (1997)                                41
                                        ... 
Young Guns II (1990)                      44
Young Poisoner's Handbook, The (1995)     41
Zeus and Roxanne (1997)                    6
unknown                                    9
Á köldum klaka (Cold Fever) (1994)         1
Name: rating, Length: 1664, dtype: int64

In [29]:
# Ten titles with the most ratings: order counts from largest to smallest
# and keep the first ten entries.
top_10_most_reviewed = (
    ratings_count
    .sort_values(ascending=False)
    .head(10)
)

In [30]:
# Display the ten most-rated titles with their rating counts.
top_10_most_reviewed

title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: rating, dtype: int64

In [None]:
# Top 20 films by mean rating

In [33]:
# Mean rating per title.
average_ratings = (
    df_merged_user_film
    .groupby('title')['rating']
    .mean()
)

# Twenty titles with the highest mean rating.
# NOTE(review): no minimum number of ratings is required here, so titles
# with very few ratings can top the list -- confirm this is intended.
top_20_highest_average_rating = (
    average_ratings
    .sort_values(ascending=False)
    .head(20)
)
top_20_highest_average_rating

title
They Made Me a Criminal (1939)                            5.000000
Marlene Dietrich: Shadow and Light (1996)                 5.000000
Saint of Fort Washington, The (1993)                      5.000000
Someone Else's America (1995)                             5.000000
Star Kid (1997)                                           5.000000
Great Day in Harlem, A (1994)                             5.000000
Aiqing wansui (1994)                                      5.000000
Santa with Muscles (1996)                                 5.000000
Prefontaine (1997)                                        5.000000
Entertaining Angels: The Dorothy Day Story (1996)         5.000000
Pather Panchali (1955)                                    4.625000
Some Mother's Son (1996)                                  4.500000
Maya Lin: A Strong Clear Vision (1994)                    4.500000
Anna (1996)                                               4.500000
Everest (1998)                                          

In [34]:
# Top 7 films with the highest total rating

In [36]:
# Sum of all rating values per title; this favours titles that are both
# frequently and highly rated.
total_ratings = (
    df_merged_user_film
    .groupby('title')['rating']
    .sum()
)

# Seven titles with the largest rating totals.
top_7_highest_total_rating = (
    total_ratings
    .sort_values(ascending=False)
    .head(7)
)
top_7_highest_total_rating

title
Star Wars (1977)                  2541
Fargo (1996)                      2111
Return of the Jedi (1983)         2032
Contact (1997)                    1936
Raiders of the Lost Ark (1981)    1786
Godfather, The (1972)             1769
English Patient, The (1996)       1759
Name: rating, dtype: int64

In [37]:
# Step 3: Prepare the data for Association Rules using the pivot() method

In [38]:
# Binary 'liked' flag: 1 when the rating is at least 3.5, otherwise 0.
# NOTE(review): the ratings shown in this notebook are whole numbers, so the
# 3.5 cut-off presumably behaves as "rating >= 4" -- confirm against the data.
df_merged_user_film['liked'] = df_merged_user_film['rating'].ge(3.5).astype(int)

In [39]:
# Preview the first 10 rows to check the new 'liked' column against 'rating'.
df_merged_user_film.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp,title,liked
0,196,242,3,881250949,Kolya (1996),0
1,63,242,3,875747190,Kolya (1996),0
2,226,242,5,883888671,Kolya (1996),1
3,154,242,3,879138235,Kolya (1996),0
4,306,242,5,876503793,Kolya (1996),1
5,296,242,4,884196057,Kolya (1996),1
6,34,242,5,888601628,Kolya (1996),1
7,271,242,4,885844495,Kolya (1996),1
8,201,242,4,884110598,Kolya (1996),1
9,209,242,4,883589606,Kolya (1996),1


In [42]:
# Reshape to a user x movie matrix: one row per user_id, one column per
# item_id, cell value = that user's 'liked' flag. User/movie pairs with no
# rating come out as NaN.
user_movie_matrix = df_merged_user_film.pivot(
    index='user_id',
    columns='item_id',
    values='liked',
)
user_movie_matrix.head(2)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,,,,,,,,,,
2,1.0,,,,,,,,,0.0,...,,,,,,,,,,


In [47]:
# Treat "never rated" the same as "not liked": replace every NaN with 0.
user_movie_matrix = user_movie_matrix.fillna(value=0)
user_movie_matrix.head(10)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Step 4: Apply the apriori algorithm and generate Association Rules

In [49]:
# Mine frequent itemsets with Apriori: sets of movies that at least 10% of
# users liked together (the matrix holds the 'liked' flag, not mere views).
# mlxtend expects a one-hot frame of booleans; passing 0.0/1.0 floats raises
# a DeprecationWarning and uses a slower code path, so convert first.
# Comparing against 1 also maps any remaining NaN to False.
liked_matrix_bool = user_movie_matrix.eq(1)

# use_colnames=True reports itemsets by column label (item_id) instead of
# by positional column index.
frequent_itemsets = apriori(liked_matrix_bool, min_support=0.1, use_colnames=True)

# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 1.0.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)



In [50]:
# Step 5: Sort the rules based on one of the Association Rules metrics