In [1]:
# !pip install mlxtend

In [2]:
import pprint

import numpy as np
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [3]:
# with is used as good practise as it is house cleaner type of thing (Don't remember exact word)
data = []
with open('./categories.txt') as file_handler:  # In default open opens file in read mode

    for line in file_handler:
        splitted_line = line.strip('\n').split(';')  # .strip(..) returns string without \n so can perform split operation
        data.append(splitted_line)

pprint.pprint(data[:100])        

[['Breakfast & Brunch', 'American (Traditional)', 'Restaurants'],
 ['Sandwiches', 'Restaurants'],
 ['Local Services', 'IT Services & Computer Repair'],
 ['Restaurants', 'Italian'],
 ['Food', 'Coffee & Tea'],
 ['Fast Food', 'Restaurants'],
 ['Mortgage Brokers', 'Home Services', 'Real Estate'],
 ['Brasseries', 'Restaurants'],
 ['Bars',
  'Sports Bars',
  'Nightlife',
  'American (New)',
  'Chicken Wings',
  'Restaurants'],
 ['Automotive',
  'Windshield Installation & Repair',
  'Auto Detailing',
  'Wheel & Rim Repair'],
 ['Automotive', 'Auto Parts & Supplies'],
 ['Food', 'Grocery', 'CSA', 'Farmers Market'],
 ['Specialty Schools', 'CPR Classes', 'First Aid Classes', 'Education'],
 ['Event Planning & Services', 'Venues & Event Spaces'],
 ['Shopping', 'Home Decor', 'Home & Garden', 'Furniture Stores'],
 ['Books, Mags, Music & Video', 'Shopping', 'Bookstores'],
 ['Auto Repair', 'Automotive'],
 ['Local Services', 'Dry Cleaning & Laundry'],
 ['Burgers', 'American (New)', 'Restaurants'],
 ['Piz

In [4]:
# Transforming categories two dimensional data into Transaction encoded dataframe

transaction_encoded = TransactionEncoder()

transaction_encoded_array = transaction_encoded.fit_transform(data)

df = pd.DataFrame(transaction_encoded_array, columns=transaction_encoded.columns_)
df.head()

Unnamed: 0,ATV Rentals/Tours,Accessories,Accountants,Active Life,Acupuncture,Adult,Adult Education,Adult Entertainment,Advertising,Afghan,...,Wine Bars,Wine Tasting Classes,Wine Tasting Room,Wine Tours,Wineries,Wok,Women's Clothing,Yelp Events,Yoga,Zoos
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77185 entries, 0 to 77184
Columns: 892 entries, ATV Rentals/Tours to Zoos
dtypes: bool(892)
memory usage: 65.7 MB


In [6]:
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True) # min_support is 0.01 as default value is 0.5 and it should be a value between 0 and 1, use_colnames=True uses the columns for the returning dataframe
frequent_itemsets.head() # support is between 0 and 1 and support is calculated as  transactions_where_item(s)_occur / total_transactions  so thats why ..

Unnamed: 0,support,itemsets
0,0.040202,(Active Life)
1,0.020639,(American (New))
2,0.031301,(American (Traditional))
3,0.029423,(Arts & Entertainment)
4,0.022232,(Auto Repair)


In [7]:
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   101 non-null    float64
 1   itemsets  101 non-null    object 
dtypes: float64(1), object(1)
memory usage: 1.7+ KB


In [8]:
# length column
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda categories: len(categories))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.040202,(Active Life),1
1,0.020639,(American (New)),1
2,0.031301,(American (Traditional)),1
3,0.029423,(Arts & Entertainment),1
4,0.022232,(Auto Repair),1
...,...,...,...
96,0.010598,"(Sports Bars, Nightlife, Bars)",3
97,0.010028,"(Fast Food, Restaurants, Burgers)",3
98,0.010663,"(Dentists, Health & Medical, General Dentistry)",3
99,0.018540,"(Hotels, Hotels & Travel, Event Planning & Ser...",3


In [9]:
# Fixing support column so we can understand it and casting it as int
frequent_itemsets['support'] = len(df) * frequent_itemsets['support']  # As support is  transactions_where_item(s)_occur / total_transactions  so thats why multiplying with total_transactions
frequent_itemsets['support'] = frequent_itemsets['support'].astype(int)

frequent_itemsets

Unnamed: 0,support,itemsets,length
0,3103,(Active Life),1
1,1593,(American (New)),1
2,2416,(American (Traditional)),1
3,2271,(Arts & Entertainment),1
4,1716,(Auto Repair),1
...,...,...,...
96,818,"(Sports Bars, Nightlife, Bars)",3
97,774,"(Fast Food, Restaurants, Burgers)",3
98,823,"(Dentists, Health & Medical, General Dentistry)",3
99,1430,"(Hotels, Hotels & Travel, Event Planning & Ser...",3


In [10]:
# length 1 frequent itemsets
frequent_itemsets_len_1 = frequent_itemsets[ frequent_itemsets['length']==1 ]
frequent_itemsets_len_1

Unnamed: 0,support,itemsets,length
0,3103,(Active Life),1
1,1593,(American (New)),1
2,2416,(American (Traditional)),1
3,2271,(Arts & Entertainment),1
4,1716,(Auto Repair),1
5,4208,(Automotive),1
6,1115,(Bakeries),1
7,4328,(Bars),1
8,6583,(Beauty & Spas),1
9,1369,(Breakfast & Brunch),1


In [15]:
# Removing length column as required and casting itemsets into string as its frozenset
frequent_itemsets_len_1.drop(columns='length', inplace=True)
frequent_itemsets_len_1['itemsets'] = frequent_itemsets_len_1['itemsets'].apply(lambda itemset: str(itemset))

frequent_itemsets_len_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequent_itemsets_len_1['itemsets'] = frequent_itemsets_len_1['itemsets'].apply(lambda itemset: str(itemset))


Unnamed: 0,support,itemsets
0,3103,Active Life
1,1593,American (New)
2,2416,American (Traditional)
3,2271,Arts & Entertainment
4,1716,Auto Repair
5,4208,Automotive
6,1115,Bakeries
7,4328,Bars
8,6583,Beauty & Spas
9,1369,Breakfast & Brunch


In [13]:
# Exporting frequent itemsets with length 1 as text file
frequent_itemsets_len_1.to_csv('./patterns.txt', index=None, header=None, sep=':')

In [18]:
# Removing length column as required and casting itemsets into ; separated string as its frozenset
frequent_itemsets.drop(columns='length', inplace=True)
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda itemset: ';'.join(itemset))

frequent_itemsets

Unnamed: 0,support,itemsets
0,3103,Active Life
1,1593,American (New)
2,2416,American (Traditional)
3,2271,Arts & Entertainment
4,1716,Auto Repair
...,...,...
96,818,Sports Bars;Nightlife;Bars
97,774,Fast Food;Restaurants;Burgers
98,823,Dentists;Health & Medical;General Dentistry
99,1430,Hotels;Hotels & Travel;Event Planning & Services


In [20]:
# Exporting frequent itemsets as text file
frequent_itemsets.to_csv('./patterns-all.txt', index=None, header=None, sep=':')