In [1]:
import pandas as pd

In [2]:
# Load the MovieLens dataset
ratings_file = 'ratings.csv'
movies_file = 'movies.csv'
ratings = pd.read_csv(ratings_file)
movies = pd.read_csv(movies_file)


rating threshold to determine whether a user has "purchased" a movie or not. Typically, a rating threshold above a certain value (e.g., 3.5 or 4) is used to indicate that the user liked the movie. You can filter the ratings dataframe based on this threshold.


In [3]:
# Set a rating threshold (e.g., 4.0)
threshold = 4.0
# Filter ratings based on the threshold
filtered_ratings = ratings[ratings['rating'] >= threshold]

# Merge the filtered ratings with the movies dataframe to get the movie names associated with the movie IDs.
merged_data = pd.merge(filtered_ratings, movies, on='movieId', how='inner')

# Group the data by the user ID, and within each user's group, collect the movie names or IDs as items in the transaction.
# Group the data by user ID and collect movie titles as items in transactions
transactions = merged_data.groupby('userId')['title'].apply(list).reset_index()

transactions = transactions.set_index('userId')
# Convert the transactions into a list of lists
transactions_list = transactions['title'].tolist()

In [4]:
transactions.head()

Unnamed: 0_level_0,title
userId,Unnamed: 1_level_1
1,"[Toy Story (1995), Grumpier Old Men (1995), He..."
2,"[Tommy Boy (1995), Gladiator (2000), Good Will..."
3,"[Conan the Barbarian (1982), Road Warrior, The..."
4,"[Star Wars: Episode IV - A New Hope (1977), Fu..."
5,"[Toy Story (1995), Usual Suspects, The (1995),..."


In [12]:
len(transactions)

609

In [5]:
# Convert sets to a single string with items separated by a comma
transactions["title"] = transactions["title"].agg(lambda x: ",".join(x))

movie_transactions = transactions["title"].str.split(',')

In [6]:
movie_transactions.head()

userId
1    [Toy Story (1995), Grumpier Old Men (1995), He...
2    [Tommy Boy (1995), Gladiator (2000), Good Will...
3    [Conan the Barbarian (1982), Road Warrior,  Th...
4    [Star Wars: Episode IV - A New Hope (1977), Fu...
5    [Toy Story (1995), Usual Suspects,  The (1995)...
Name: title, dtype: object

Apriori

In [7]:
def prune(data,supp):
  """
  pruning to get Candidates k-itemset to Frequent k-itemset
  by comparing the support count (data.supp_count) with the minSup (supp)
  """
  df = data[data.supp_count >= supp]
  return df

def count_itemset(transaction_df, itemsets):
  """
  for each itemset in (the current set of) freq-n itemsets,
  the count of each itemset is computed according to
  the number of occurrences that it appears in the transactions data
  """
  count_item = {}
  for item_set in itemsets:
      set_A = set(item_set) # set A represents the itemset whose count is to be computed in order to be determined if it is a frequent itemset or not
      for row in transaction_df:
          set_B = set(row) # set B represents the transaction row record
          if set_B.intersection(set_A) == set_A: # checks for occurrence of the itemset in the transaction
              if item_set in count_item.keys():
                  count_item[item_set] += 1

              else:
                  count_item[item_set] = 1

  data = pd.DataFrame()
  data['item_sets'] = count_item.keys()
  data['supp_count'] = count_item.values()
  # print("Candidate itemset table (Counting):\n", data)
  return data

def count_item(trans_items):

    count_ind_item = {}
    for row in trans_items:
        for i in range(len(row)):
            if row[i] in count_ind_item.keys():
                count_ind_item[row[i]] += 1
            else:
                count_ind_item[row[i]] = 1

    data = pd.DataFrame()
    data['item_sets'] = count_ind_item.keys()
    data['supp_count'] = count_ind_item.values()
    data = data.sort_values('item_sets')

    return data


def join(list_of_items):
    itemsets = []
    i = 1
    for entry in list_of_items:
        proceding_items = list_of_items[i:]
        for item in proceding_items:
            if(type(item) is str):
                if entry != item:
                    tuples = (entry, item)
                    itemsets.append(tuples)
            else:
                if entry[0:-1] == item[0:-1]:
                    tuples = entry+item[1:]
                    itemsets.append(tuples)
        i = i+1
    if(len(itemsets) == 0):
        return None
    return itemsets

In [8]:
def apriori(trans_data,supp=2):
    freq = pd.DataFrame()

    df = count_item(trans_data) # to generate counts of
    # print("df")
    # print(len(df))
    # print(df)
    while(len(df) != 0):

        df = prune(df, supp)
        # print("Minsup =", supp,"\n")
        # print("Freq itemset table (Pruned):\n", df)

        if len(df) > 1 or (len(df) == 1 and int(df.supp_count >= supp)):
            freq = df

        itemsets = join(df.item_sets)

        if(itemsets is None):
            return freq

        df = count_itemset(trans_data, itemsets)
    return df

# Freq item sets Movies

In [13]:
movies_freq_item_sets = apriori(movie_transactions, 60)

In [14]:
movies_freq_item_sets.head()

Unnamed: 0,item_sets,supp_count
0,"( The (1994), The (1999), The (2001), The (...",73
1,"( The (1994), The (1999), The (2001), The (...",62
6,"( The (1994), The (1999), The (2001), The (...",63
7,"( The (1994), The (1999), The (2001), The (...",61
19,"( The (1994), The (2001), The (2002), The (...",64


In [None]:
# from itertools import combinations

In [None]:
input_data=dict(movies_freq_item_sets['item_sets'])

transformed_data = []
record_id = 1

for _, items_tuple in input_data.items():
    items_set = set(items_tuple)
    transformed_data.append({"record_id": record_id, "items": items_set})
    record_id += 1

print(transformed_data)

In [None]:
# def get_all_freq_itemsets(items):
#   # Get all possible combinations
#   all_combinations = []

#   # Generate combinations of different lengths from 1 to the length of the set
#   for r in range(1, len(items) + 1):
#       item_combinations = combinations(items, r)
#       all_combinations.extend(item_combinations)

#   # Convert the combinations to a list of sets
#   combinations_list = [set(combination) for combination in all_combinations]

#   # Print the result
#   for combination in combinations_list:
#       print(combination, "\n")
#   return combinations_list

In [None]:
# # Define the set
# items = {"banana", "carrot", "edamame"}

# # Get all possible combinations
# all_combinations = []

# # Generate combinations of different lengths from 1 to the length of the set
# for r in range(1, len(items) + 1):
#     item_combinations = combinations(items, r)
#     all_combinations.extend(item_combinations)

# # Convert the combinations to a list of sets
# combinations_list = [set(combination) for combination in all_combinations]

# # Print the result
# for combination in combinations_list:
#     print(combination)

{'carrot'}
{'banana'}
{'edamame'}
{'carrot', 'banana'}
{'carrot', 'edamame'}
{'banana', 'edamame'}
{'carrot', 'banana', 'edamame'}


# Generate all frequent itemsets from the apriori result

In [None]:
movies_all_freq_itemsets = get_all_freq_itemsets(set(movies_freq_item_sets['item_sets']))

{' The (1994)'} 

{' The (1991)'} 

{'Pulp Fiction (1994)'} 

{'Silence of the Lambs'} 

{'Shawshank Redemption'} 

{' The (1991)', ' The (1994)'} 

{' The (1994)', 'Pulp Fiction (1994)'} 

{' The (1994)', 'Silence of the Lambs'} 

{' The (1994)', 'Shawshank Redemption'} 

{' The (1991)', 'Pulp Fiction (1994)'} 

{' The (1991)', 'Silence of the Lambs'} 

{' The (1991)', 'Shawshank Redemption'} 

{'Silence of the Lambs', 'Pulp Fiction (1994)'} 

{'Shawshank Redemption', 'Pulp Fiction (1994)'} 

{'Silence of the Lambs', 'Shawshank Redemption'} 

{' The (1991)', ' The (1994)', 'Pulp Fiction (1994)'} 

{' The (1991)', ' The (1994)', 'Silence of the Lambs'} 

{' The (1991)', ' The (1994)', 'Shawshank Redemption'} 

{' The (1994)', 'Pulp Fiction (1994)', 'Silence of the Lambs'} 

{' The (1994)', 'Shawshank Redemption', 'Pulp Fiction (1994)'} 

{' The (1994)', 'Shawshank Redemption', 'Silence of the Lambs'} 

{' The (1991)', 'Pulp Fiction (1994)', 'Silence of the Lambs'} 

{' The (1991)', 'Sh

In [None]:
len(movies_all_freq_itemsets)

31

# Convert the frequent itemsets into features for clustering

In [None]:
# Reference code
import pandas as pd
from itertools import combinations

# Sample data
data = [
    {"record_id": 1, "items": {"banana", "carrot"}},
    {"record_id": 2, "items": {"banana", "edamame"}},
    {"record_id": 3, "items": {"carrot", "edamame"}},
]

# Create a set of all unique items
all_items = set()
for entry in data:
    all_items.update(entry["items"])

# Generate all possible combinations of items
combinations_list = []
for r in range(1, len(all_items) + 1):
    item_combinations = combinations(all_items, r)
    combinations_list.extend(item_combinations)

# Create a DataFrame with features for each combination
df = pd.DataFrame(data)
for combination in combinations_list:
    feature_name = " & ".join(sorted(list(combination)))
    df[feature_name] = df["items"].apply(lambda x: int(set(combination).issubset(x)))

# Fill NaN values with 0
df.fillna(0, inplace=True)

# Set the "record_id" as the DataFrame index
df.set_index("record_id", inplace=True)

# Print the resulting DataFrame
print(df)


                       items  edamame  carrot  banana  carrot & edamame  \
record_id                                                                 
1           {carrot, banana}        0       1       1                 0   
2          {edamame, banana}        1       0       1                 0   
3          {edamame, carrot}        1       1       0                 1   

           banana & edamame  banana & carrot  banana & carrot & edamame  
record_id                                                                
1                         0                1                          0  
2                         1                0                          0  
3                         0                0                          0  


In [None]:
def convert_to_transdf(data):
    # Create a set of all unique items
    all_items = set()
    for entry in data:
      print("entry:", entry)
      all_items.update(entry["items"])

    # Generate all possible combinations of items
    combinations_list = []
    for r in range(1, len(all_items) + 1):
      item_combinations = combinations(all_items, r)
      combinations_list.extend(item_combinations)

    # Create a DataFrame with features for each combination
    df = pd.DataFrame(data)
    for combination in combinations_list:
      feature_name = " & ".join(sorted(list(combination)))
      df[feature_name] = df["items"].apply(lambda x: int(set(combination).issubset(x)))

    # Fill NaN values with 0
    df.fillna(0, inplace=True)

    # Set the "record_id" as the DataFrame index
    df.set_index("record_id", inplace=True)

    # Print the resulting DataFrame
    print(df)

    return df



In [None]:
print(dict(freq_item_sets['item_sets']))

{57: ('other vegetables', 'rolls/buns', 'sausage', 'rolls/buns', 'whole milk'), 66: ('other vegetables', 'rolls/buns', 'soda', 'rolls/buns', 'whole milk'), 73: ('other vegetables', 'rolls/buns', 'whole milk', 'rolls/buns', 'yogurt'), 88: ('other vegetables', 'soda', 'whole milk', 'soda', 'yogurt')}


In [None]:
input_data=dict(freq_item_sets['item_sets'])

transformed_data = []
record_id = 1

for _, items_tuple in input_data.items():
    items_set = set(items_tuple)
    transformed_data.append({"record_id": record_id, "items": items_set})
    record_id += 1

print(transformed_data)

[{'record_id': 1, 'items': {'rolls/buns', 'whole milk', 'sausage', 'other vegetables'}}, {'record_id': 2, 'items': {'rolls/buns', 'whole milk', 'soda', 'other vegetables'}}, {'record_id': 3, 'items': {'rolls/buns', 'whole milk', 'yogurt', 'other vegetables'}}, {'record_id': 4, 'items': {'whole milk', 'yogurt', 'soda', 'other vegetables'}}]


In [None]:
convert_to_transdf(transformed_data)

entry: {'record_id': 1, 'items': {'rolls/buns', 'whole milk', 'sausage', 'other vegetables'}}
entry: {'record_id': 2, 'items': {'rolls/buns', 'whole milk', 'soda', 'other vegetables'}}
entry: {'record_id': 3, 'items': {'rolls/buns', 'whole milk', 'yogurt', 'other vegetables'}}
entry: {'record_id': 4, 'items': {'whole milk', 'yogurt', 'soda', 'other vegetables'}}
                                                       items  rolls/buns  \
record_id                                                                  
1          {rolls/buns, whole milk, sausage, other vegeta...           1   
2           {rolls/buns, whole milk, soda, other vegetables}           1   
3          {rolls/buns, whole milk, yogurt, other vegetab...           1   
4               {whole milk, yogurt, soda, other vegetables}           0   

           sausage  other vegetables  soda  whole milk  yogurt  \
record_id                                                        
1                1                 1     0    

Unnamed: 0_level_0,items,rolls/buns,sausage,other vegetables,soda,whole milk,yogurt,rolls/buns & sausage,other vegetables & rolls/buns,rolls/buns & soda,...,other vegetables & sausage & whole milk & yogurt,sausage & soda & whole milk & yogurt,other vegetables & soda & whole milk & yogurt,other vegetables & rolls/buns & sausage & soda & whole milk,other vegetables & rolls/buns & sausage & soda & yogurt,other vegetables & rolls/buns & sausage & whole milk & yogurt,rolls/buns & sausage & soda & whole milk & yogurt,other vegetables & rolls/buns & soda & whole milk & yogurt,other vegetables & sausage & soda & whole milk & yogurt,other vegetables & rolls/buns & sausage & soda & whole milk & yogurt
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"{rolls/buns, whole milk, sausage, other vegeta...",1,1,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"{rolls/buns, whole milk, soda, other vegetables}",1,0,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,"{rolls/buns, whole milk, yogurt, other vegetab...",1,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"{whole milk, yogurt, soda, other vegetables}",0,0,1,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
movie_transactions

userId
1      [Toy Story (1995), Grumpier Old Men (1995), He...
2      [Tommy Boy (1995), Gladiator (2000), Good Will...
3      [Conan the Barbarian (1982), Road Warrior,  Th...
4      [Star Wars: Episode IV - A New Hope (1977), Fu...
5      [Toy Story (1995), Usual Suspects,  The (1995)...
                             ...                        
606    [Usual Suspects,  The (1995), Canadian Bacon (...
607    [Toy Story (1995), Braveheart (1995), Fugitive...
608    [Seven (a.k.a. Se7en) (1995), Usual Suspects, ...
609    [Forrest Gump (1994), Fugitive,  The (1993), D...
610    [Toy Story (1995), Heat (1995), Seven (a.k.a. ...
Name: title, Length: 609, dtype: object

In [None]:

len(movie_transactions[0])

NameError: ignored

In [None]:
# Convert the series to the desired format
movie_data = [{'user_id': i, 'movies': set(items)} for i, items in enumerate(movie_transactions, start=1)]


In [None]:
# Create a set of all unique items
all_items = set()
for entry in movie_data:
    all_items.update(entry["movies"])

# Generate all possible combinations of items
combinations_list = []
for r in range(1, len(all_items) + 1):
    item_combinations = combinations(all_items, r)
    combinations_list.extend(item_combinations)

# Create a DataFrame with features for each combination
df = pd.DataFrame(data)
for combination in movie_f_sets:
    feature_name = " & ".join(sorted(list(combination)))
    df[feature_name] = df["movies"].apply(lambda x: int(set(combination).issubset(x)))

# Fill NaN values with 0
df.fillna(0, inplace=True)

# Set the "record_id" as the DataFrame index
df.set_index("user_id", inplace=True)

# Print the resulting DataFrame
print(df.head())