# Association
## Apriori Algorithm
Suppose we have transactions based on users' ratings for movies. A transaction of movies is considered as the set of movies that a user has highly rated. An association rule is a statement of the form A -> B, where A and B are itemsets. 
- Support of A -> B = |AB|
- confidence of A -> B = |AB|/|A|

In [1]:
import pandas as pd
import numpy as np
import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print('There are', ratings_df['userId'].nunique(), 'users that have given', len(ratings_df), 'ratings consisting of', len(movies_df), 'movies')

There are 610 users that have given 100836 ratings consisting of 9742 movies


In [5]:
# merge dataframe to show what users have rated movies
movie_ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
movie_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [6]:
# Pivot the dataframe to show userId's on the left hand side of the matrix (key of each row) and movies to be 
# the key of columns
movie_ratings_df = pd.merge(ratings_df, movies_df, on='movieId' )
movie_ratings_df.head()
movie_user_ratings_df = pd.pivot_table(movie_ratings_df, index='userId', columns='title', values='rating')
# once pivoted, axis: {0=userId row, 1=movieId column}

print('dataset dimensions: ', movie_user_ratings_df.shape)
print('Subset example:')
movie_user_ratings_df.iloc[:10, :40]

dataset dimensions:  (610, 9719)
Subset example:


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,"10th Victim, The (La decima vittima) (1965)","11'09""01 - September 11 (2002)",11:14 (2003),"11th Hour, The (2007)",12 Angry Men (1957),12 Angry Men (1997),12 Chairs (1971),12 Chairs (1976),12 Rounds (2009),12 Years a Slave (2013)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,5.0,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Method to sort the matrix by users by applying a column with counts of non-Nan ratings that users made,
# sorting the matrix in descending order by this column, then removing this helper column
def sortByUsers(matrix_df, num_col):
    matrix_df = movie_user_ratings_df.append(matrix_df.count(axis=0), ignore_index=True)
    matrix_sorted_col_df = matrix_df.sort_values(len(matrix_df)-1, axis=1, ascending=False)
    matrix_sorted_col_df = matrix_sorted_col_df.drop(matrix_sorted_col_df.tail(1).index)
    return matrix_sorted_col_df.iloc[:, :num_col]
# Method to sort the matrix by movies by applying a row of total ratings non-Nan that each movie had gotten,
#  sorting the matrix in descending order by this row, then removing this helper row.
def sortByMovies(matrix_df, num_rows):
    matrix_df['total_ratings'] = pd.Series(matrix_df.count(axis=1))
    matrix_sorted_row_df = matrix_df.sort_values('total_ratings', ascending=False)
    matrix_sorted_by_movies = matrix_sorted_row_df.drop(['total_ratings'], axis=1)
    return matrix_sorted_by_movies.iloc[:num_rows, :]

# both methods only output the amount of rows or columns specified

In [8]:
# sort by users then by movies
userSortedMatrix = sortByUsers(movie_user_ratings_df, 100)
movieUserSortedMatrix = sortByMovies(userSortedMatrix, 100)

In [9]:
# Subset of of the first rows and columns that will have the least Nan values
movieUserSortedMatrix.head()

title,Forrest Gump (1994),"Shawshank Redemption, The (1994)",Pulp Fiction (1994),"Silence of the Lambs, The (1991)","Matrix, The (1999)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),Braveheart (1995),Terminator 2: Judgment Day (1991),Schindler's List (1993),...,Ocean's Eleven (2001),Willy Wonka & the Chocolate Factory (1971),"Fifth Element, The (1997)",Batman Begins (2005),Home Alone (1990),Ghost (1990),Waterworld (1995),Catch Me If You Can (2002),"Breakfast Club, The (1985)","Bourne Identity, The (2002)"
413,5.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,...,4.0,4.0,5.0,4.5,3.0,3.0,2.0,4.0,5.0,4.0
67,3.5,3.0,2.0,3.5,4.5,5.0,3.5,2.5,3.5,4.0,...,3.5,0.5,3.0,2.5,1.0,2.0,4.0,3.5,3.0,4.5
479,5.0,5.0,4.0,4.5,5.0,4.5,5.0,5.0,4.5,5.0,...,3.5,3.5,3.5,4.0,4.0,3.5,3.0,5.0,3.5,2.0
273,4.5,4.5,5.0,4.0,4.0,3.0,3.5,4.5,4.5,4.0,...,4.0,3.5,4.0,3.5,3.5,,4.0,4.0,4.0,3.5
598,3.5,4.0,5.0,3.0,5.0,5.0,4.0,3.5,4.5,,...,4.0,3.0,4.0,3.0,3.0,2.5,2.5,3.0,3.5,3.0


In [10]:
# All movie titles in the sorted matrix
movieIds = list(movieUserSortedMatrix.columns)
print(movieIds)

['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)', 'Matrix, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)', 'Jurassic Park (1993)', 'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)', "Schindler's List (1993)", 'Fight Club (1999)', 'Toy Story (1995)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Usual Suspects, The (1995)', 'American Beauty (1999)', 'Seven (a.k.a. Se7en) (1995)', 'Independence Day (a.k.a. ID4) (1996)', 'Apollo 13 (1995)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Godfather, The (1972)', 'Fugitive, The (1993)', 'Batman (1989)', 'Saving Private Ryan (1998)', 'Lord of the Rings: The Two Towers, The (2002)', 'Lord of the Rings: The Return of the King, The (2003)', 'Aladdin (1992)', 'Fargo (1996)', 'Sixth Sense, The (1999)', '

In [11]:
# Specifying which movies we'll include in the transactions (this could change upon testing)
for movieId in movieIds:
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 0.5, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 1, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 1.5, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 2, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 2.5, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 3, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 3.5, movieId] = '0'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 4, movieId] = '1'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 4.5, movieId] = '1'
    movieUserSortedMatrix.loc[movieUserSortedMatrix[movieId] == 5, movieId] = '1'

In [12]:
movieUserSortedMatrix.head()

title,Forrest Gump (1994),"Shawshank Redemption, The (1994)",Pulp Fiction (1994),"Silence of the Lambs, The (1991)","Matrix, The (1999)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),Braveheart (1995),Terminator 2: Judgment Day (1991),Schindler's List (1993),...,Ocean's Eleven (2001),Willy Wonka & the Chocolate Factory (1971),"Fifth Element, The (1997)",Batman Begins (2005),Home Alone (1990),Ghost (1990),Waterworld (1995),Catch Me If You Can (2002),"Breakfast Club, The (1985)","Bourne Identity, The (2002)"
413,1,1,1,1,1,1,1,1,1,1.0,...,1,1,1,1,0,0.0,0,1,1,1
67,0,0,0,0,1,1,0,0,0,1.0,...,0,0,0,0,0,0.0,1,0,0,1
479,1,1,1,1,1,1,1,1,1,1.0,...,0,0,0,1,1,0.0,0,1,0,0
273,1,1,1,1,1,0,0,1,1,1.0,...,1,0,1,0,0,,1,1,1,0
598,0,1,1,0,1,1,1,0,1,,...,1,0,1,0,0,0.0,0,0,0,0


In [13]:
# Turn dataframe into a 2D array with 0's and 1's and existing Nan values
unclean_apriori_data = movieUserSortedMatrix.to_numpy()
print(unclean_apriori_data)

[['1' '1' '1' ... '1' '1' '1']
 ['0' '0' '0' ... '0' '0' '1']
 ['1' '1' '1' ... '1' '0' '0']
 ...
 ['1' '1' nan ... '0' '0' nan]
 ['1' '0' '1' ... nan nan '0']
 ['0' '1' '1' ... nan '0' nan]]


In [14]:
data = []
# Iterate through 2D array and create a list of movie titles for each row that had movies with a 1 value.
# Each list will correspond as a transaction in the dataset
for row in unclean_apriori_data:
    movies_in_row = []
    for i in range(len(row)):
        if row[i] == '1':
            movies_in_row.append(movieIds[i])
    if len(movies_in_row) >= 2:
        data.append(movies_in_row)

In [15]:
# All transactions
print(data[:1])

[['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)', 'Matrix, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)', 'Jurassic Park (1993)', 'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)', "Schindler's List (1993)", 'Fight Club (1999)', 'Toy Story (1995)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Usual Suspects, The (1995)', 'American Beauty (1999)', 'Seven (a.k.a. Se7en) (1995)', 'Independence Day (a.k.a. ID4) (1996)', 'Apollo 13 (1995)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Godfather, The (1972)', 'Fugitive, The (1993)', 'Batman (1989)', 'Saving Private Ryan (1998)', 'Lord of the Rings: The Two Towers, The (2002)', 'Lord of the Rings: The Return of the King, The (2003)', 'Aladdin (1992)', 'Fargo (1996)', 'True Lies (1994)', 'Twelve

## Transform your data into apriori algorithm

In [16]:
# Use the transaction encoder to turn values into True or False
oht = TransactionEncoder()
oht_array = oht.fit(data).transform(data)
favourite_movies_df = pd.DataFrame(oht_array, columns = oht.columns_)
favourite_movies_df.head()

Unnamed: 0,Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),Apollo 13 (1995),Austin Powers: The Spy Who Shagged Me (1999),Babe (1995),...,Titanic (1997),Toy Story (1995),True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
0,False,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,False,True,True
1,False,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,True,False,True,False,True
2,False,True,True,True,True,True,True,False,True,True,...,True,False,False,True,True,False,False,False,False,True
3,True,True,True,True,False,True,True,False,True,True,...,False,True,True,True,True,False,True,True,False,False
4,False,False,True,True,True,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [17]:
# Apply the apriori algoithm from mlxtend (shown in tutorial) to generate frequent itemsets
from mlxtend.frequent_patterns import apriori

# The min_support can change with testing
frequent_itemsets = apriori(favourite_movies_df, min_support=0.4, use_colnames=True)
pd.set_option('max_colwidth', 1500)
pd.set_option("max_rows", None)
frequent_itemsets.head(10)  

Unnamed: 0,support,itemsets
0,0.4,(Aladdin (1992))
1,0.53,(Alien (1979))
2,0.4,(Aliens (1986))
3,0.41,"(Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001))"
4,0.62,(American Beauty (1999))
5,0.47,(American History X (1998))
6,0.54,(Back to the Future (1985))
7,0.4,(Batman Begins (2005))
8,0.41,(Blade Runner (1982))
9,0.4,"(Bourne Identity, The (2002))"


In [18]:
# Extract association rules by specifying min_threshold (that can change with testing)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
rule_results = rules[['antecedents', 'consequents', 'support', 'confidence']]
rule_results.head(100)

Unnamed: 0,antecedents,consequents,support,confidence
0,(Alien (1979)),(Fight Club (1999)),0.42,0.792453
1,(Alien (1979)),"(Godfather, The (1972))",0.41,0.773585
2,(Alien (1979)),"(Matrix, The (1999))",0.46,0.867925
3,(Alien (1979)),(Pulp Fiction (1994)),0.47,0.886792
4,(Reservoir Dogs (1992)),(Alien (1979)),0.4,0.714286
5,(Alien (1979)),(Reservoir Dogs (1992)),0.4,0.754717
6,(Alien (1979)),"(Silence of the Lambs, The (1991))",0.43,0.811321
7,(Alien (1979)),(Star Wars: Episode IV - A New Hope (1977)),0.42,0.792453
8,(Alien (1979)),(Star Wars: Episode V - The Empire Strikes Back (1980)),0.42,0.792453
9,(Fargo (1996)),(American Beauty (1999)),0.41,0.745455
