In [1]:
"""
In this program we will look at affinity analysis which determines when objects occur frequently together.
The aim is to discover when objects occur simultaneously. 
In a case where we wish to work out when two movies are recommended by the same reviewers.
Then we simply rank the results and choose the recommendations.
We will do it using the algorithm for affinity analysis called Apriori algorithm.
"""

import os
import pandas as pd
data_folder = os.path.join(os.path.expanduser("~"), "OneDrive", "Desktop", "Pace", "CS619", "Chapter04", "ml-100k")
ratings_filename = os.path.join(data_folder, "u.data")

In [2]:
# Making changes into datafile since it is separated by tabs, not commas, and there is no heading line.
# Setting delimiter parameter to tab character, not reading first row as header and setting column names.
all_ratings = pd.read_csv(ratings_filename, delimiter="\t", header=None, names=["UserID", "MovieID", "Rating", "Datetime"])

In [3]:
# Parsing dates for reviews
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit='s')

In [4]:
# Reviewing first records
all_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
# Getting description of data
all_ratings.describe()

Unnamed: 0,UserID,MovieID,Rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [6]:
"""
The goal here is to produce rule of the form: "if a person recommends the set of movies, they will also recommend this movie"
It will also be discussed extensions where a person who recommends a set of movies is likely to recommend another particular movie.
"""

# First of all, determining if a person recommends a movie
all_ratings["Favorable"] = all_ratings["Rating"] > 3

# Checking new feature
all_ratings[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False


In [7]:
# Sampling the dataset to form training data by obtaining all reviews from the first 200 users
ratings = all_ratings[all_ratings["UserID"].isin(range(200))]

In [8]:
# Next, creating a dataset of only the favorable reviews in our sample
favorable_ratings_mask = ratings["Favorable"]
favorable_ratings = ratings[favorable_ratings_mask]

In [9]:
# Grouping the dataset by UserID and iterating over the movies in each group to get which one user has given favorable rating
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])

In [10]:
# Finally, creating a dataframe that tell how frequently each movie has been given a favorable review
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()

# Checking top five movies
num_favorable_by_movie.sort_values(by="Favorable", ascending=False).head()

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [11]:
# Creating a dictionary to store discovered frequent itemsets where the key is the length of the itemsets
frequent_itemsets = {}

# Setting a minumum support value to be considered frequent
min_support = 50

In [12]:
# First step of Apriori: 
# - Creating an itemset with each movie and test if the itemset is frequent
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                           for movie_id, row in num_favorable_by_movie.iterrows()
                           if row["Favorable"] > min_support)

In [13]:
# Second and Third steps of Apriori:
# - Creating a function that takes the newly discovered frequent itemsets, creates the supersets, and then tests if they are frequent
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items()
                if frequency >= min_support])

In [14]:
# Creating a loop that iterates over the steps of the algorithm, storing the new itemsets as k is increased from 1 to max value
import sys     # We will get an error if we don't import this library because we need to use sys.stdout.flush()

for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    if len(cur_frequent_itemsets) == 0:
        print(f'Did not find any frequent itemsets of length {k}')
        sys.stdout.flush()
        break
    else:
        print(f'I found {len(cur_frequent_itemsets)} frequent itemsets of length {k}')
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets

I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


In [15]:
# Generating a number of association rules by setting each movie to be the conclusion and the remaining as the premise
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

# Checking first five rules created
print(candidate_rules[:5])

[(frozenset(), 1), (frozenset(), 7), (frozenset(), 9), (frozenset(), 50), (frozenset(), 56)]


In [16]:
# Computing confidence:
# - Creating dictionaries to store how many times we see the premise leading to the conclusion and how many times it doesn't
# - Then iterating over all reviews and rules, working out whether the premise of the rule applies and, if it does, whether the conclusion is accurate

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [17]:
# Computing confidence for each rule
rule_confidence = {candidate_rule: 
                   (correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]))
                  for candidate_rule in candidate_rules}

In [18]:
# Printing top five rules

from operator import itemgetter

sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print(f'Rule #{index+1}')
    premise, conclusion = sorted_confidence[index][0]
    print(f'Rule: If a person recommends {premise} they will also recommend {conclusion}')
    print(f'- Confidence: {rule_confidence[(premise, conclusion)]:.3f}')
    print('')

Rule #1
Rule: If a person recommends frozenset({98, 181}) they will also recommend 50
- Confidence: 1.000

Rule #2
Rule: If a person recommends frozenset({172, 79}) they will also recommend 174
- Confidence: 1.000

Rule #3
Rule: If a person recommends frozenset({258, 172}) they will also recommend 174
- Confidence: 1.000

Rule #4
Rule: If a person recommends frozenset({1, 181, 7}) they will also recommend 50
- Confidence: 1.000

Rule #5
Rule: If a person recommends frozenset({1, 172, 7}) they will also recommend 174
- Confidence: 1.000



In [19]:
# Loading dataset that stores the movie names and their corresponding MovieID to show the result
movie_name_filename = os.path.join(data_folder, "u.item")
movie_name_data = pd.read_csv(movie_name_filename, delimiter="|", header=None, encoding="mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure", 
                           "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", 
                          "Horror", "Musical", "Mistery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

In [20]:
# Creating a function that returns a movie's title from its MovieID

def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [21]:
# Printing top five rules including movie titles

for index in range(5):
    print(f'Rule #{index+1}')
    premise, conclusion = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print(f'Rule: If a person recommends {premise_names} they will also recommend {conclusion_name}')
    print(f'- Confidence: {rule_confidence[(premise, conclusion)]:.3f}')
    print('')

Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
- Confidence: 1.000

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
- Confidence: 1.000

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
- Confidence: 1.000



In [22]:
# Extracting test dataset which is all of the records we didn't use in training set (first 200 users)
test_dataset = all_ratings[~all_ratings["UserID"].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) 
                               for k, v in test_favorable.groupby("UserID")["MovieID"])

In [23]:
# Counting correct instances where the premise leads to the conclusion using test dataset
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [24]:
# Computing the confidence of each rule from the correct counts
test_confidence = {candidate_rule:
                  (correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]))
                  for candidate_rule in rule_confidence}
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)

In [25]:
# Printing the best association rules with the titles instead of movie IDs

for index in range(10):
    print(f'Rule #{index+1}')
    premise, conclusion = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print(f'Rule: If a person recommends {premise_names} they will also recommend {conclusion_name}')
    print(f'- Train Confidence: {rule_confidence.get((premise, conclusion), -1):.3f}')
    print(f'- Test Confidence: {test_confidence.get((premise, conclusion), -1):.3f}')
    print('')

Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
- Train Confidence: 1.000
- Test Confidence: 0.936

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
- Train Confidence: 1.000
- Test Confidence: 0.876

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
- Train Confidence: 1.000
- Test Confidence: 0.841

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
- Train Confidence: 1.000
- Test Confidence: 0.932

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
- Train Confidence: 1.000
- Test Confidence: 0.903

R