#### Importing Libraries
This block imports the necessary libraries for data manipulation (`pandas`) and for market basket analysis (`apriori` and `association_rules` from `mlxtend.frequent_patterns`).

In [1]:
# importing the necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

#### Reading and Preparing Data
This block reads the online retail data from an Excel file into a DataFrame, removes rows with missing invoice numbers, and converts the invoice numbers to string type. It also displays the first few rows of invoices that contain the letter 'C' and then removes these rows from the DataFrame.

In [None]:
# loading the Online Retail workbook from disk into the DataFrame `df`
df = pd.read_excel(io='data/Online Retail.xlsx')

In [None]:
# removing rows with missing 'InvoiceNo' values
# (axis=0 drops rows; inplace=True mutates `df` itself rather than returning a new frame)
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

In [None]:
# casting 'InvoiceNo' to strings so that string methods can be applied to it
df['InvoiceNo'] = df['InvoiceNo'].astype(str)

In [None]:
# previewing the first rows whose 'InvoiceNo' contains the letter 'C'
# (na=False treats missing values as "does not contain")
df.loc[df['InvoiceNo'].str.contains('C', na=False)].head()

In [None]:
# discarding every row whose 'InvoiceNo' contains the letter 'C'
df = df.loc[~df.InvoiceNo.str.contains('C')]

#### Filtering and Reshaping Data
This block filters the data to include only transactions made in the United Kingdom. It groups the data by invoice number and product description, sums the quantities, and then reshapes the data to have each product as a separate column. Missing values are filled with 0, and the invoice number is set as the index.

In [None]:
# restricting the data to United Kingdom transactions and selecting 'Quantity'
# grouped per ('InvoiceNo', 'Description') pair; the groups are aggregated in the following cell
market_basket = (
    df.loc[df.Country == "United Kingdom"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
)

In [None]:
# pivoting to one row per invoice and one column per product description:
# sum the grouped quantities, unstack the descriptions into columns,
# replace NaN with 0 and index the result by 'InvoiceNo'
market_basket = (
    market_basket
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [None]:
# previewing the first five rows of the reshaped DataFrame
market_basket.head(n=5)

#### Encoding Data
This block defines a function to encode the quantities into binary values: 0 if the quantity is 0 or less, and 1 if the quantity is 1 or more. It then applies this encoding function to the entire DataFrame.

In [None]:
# defining a function to encode the data: 1 if the quantity is 1 or more, 0 otherwise
def encode_data(datapoint):
    """Encode a quantity as a binary purchase flag.

    Args:
        datapoint: Numeric quantity of one product in one invoice.

    Returns:
        1 if ``datapoint`` is at least 1, otherwise 0.  Values strictly
        between 0 and 1, and NaN, also map to 0, so the function always
        returns an int instead of falling through and returning ``None``.
    """
    return 1 if datapoint >= 1 else 0

In [None]:
# applying the encoding function to every cell of the market_basket DataFrame
# (column-wise Series.map is used because DataFrame.applymap is deprecated since pandas 2.1)
market_basket = market_basket.apply(lambda column: column.map(encode_data))

#### Applying Apriori and Generating Association Rules
This block applies the apriori algorithm to the encoded DataFrame to find frequent itemsets with a minimum support of 0.03. It then generates association rules from these itemsets with a minimum lift of 0.5.

In [None]:
# running apriori on the encoded baskets with a minimum support of 0.03;
# use_colnames=True labels the itemsets with column names instead of column positions
itemsets = apriori(
    market_basket,
    use_colnames=True,
    min_support=0.03,
)

In [None]:
# deriving association rules from the frequent itemsets, keeping rules whose lift is at least 0.5
rules = association_rules(
    itemsets,
    min_threshold=0.5,
    metric="lift",
)

### MBA without MLxtend

#### Importing Libraries
This block imports the necessary libraries for generating combinations, grouping data, and counting occurrences.

In [None]:
# importing necessary libraries for combination generation, grouping, and counting
from itertools import combinations, groupby
from collections import Counter

#### Filtering Data for the United Kingdom
This block filters the DataFrame to include only transactions made in the United Kingdom, sets the index to 'InvoiceNo', and selects the 'StockCode' column to analyze the items in each order.

In [None]:
# keeping only the rows whose 'Country' is the United Kingdom
df_manual = df.loc[df.Country == "United Kingdom"]

In [None]:
# building a Series of 'StockCode' values indexed by 'InvoiceNo' (one entry per invoice line)
orders = df_manual.set_index('InvoiceNo').loc[:, 'StockCode']

#### Calculating Frequency and Support
This block calculates the frequency and support for each item. Support is the proportion of transactions that include a particular item; here it is expressed as a percentage (the ratio is multiplied by 100).

In [None]:
# counting how often each stock code (item) occurs and storing the counts in a 'frequency' column
statistics = orders.value_counts().rename("frequency").to_frame()

In [None]:
# calculating the support of each item, in percent of the distinct invoices
# (the 'frequency' column is selected explicitly so that a Series, not a whole DataFrame, is assigned)
statistics['support'] = statistics['frequency'] / orders.index.nunique() * 100

#### Filtering Items Above Support Threshold
This block sets a minimum support threshold and filters the items and orders to include only those that meet this threshold.

In [None]:
# setting the minimum support threshold
# The support columns in this section are expressed in percent (ratio * 100),
# so the fraction 0.03 passed to apriori above corresponds to 3.0 here.
min_support = 3.0

In [None]:
# collecting the index labels (stock codes) of items whose support reaches the threshold
items_above_support = statistics.loc[statistics.support >= min_support].index

In [None]:
# keeping only the order lines whose item is among the items above the support threshold
orders_above_support = orders.loc[orders.isin(items_above_support)]

#### Filtering Orders with Multiple Items
This block counts the number of items in each order, filters to keep only orders with two or more items, and recalculates the frequency and support for these filtered orders.

In [None]:
# counting the number of orders per InvoiceNo
order_counts = orders.index.value_counts()

In [None]:
# filtering orders that have two or more items
orders_over_two_index = order_counts[order_counts >= 2].index

In [None]:
# keeping only orders that have two or more items
orders_over_two = orders[orders.index.isin(orders_over_two_index)]

In [None]:
# recalculating the frequency and support (in percent of distinct invoices) for the filtered orders
statistics = orders_over_two.value_counts().to_frame("frequency")
# the 'frequency' column is selected explicitly so that a Series, not a whole DataFrame, is assigned
statistics['support'] = statistics['frequency'] / orders_over_two.index.nunique() * 100

#### Generating Item Pairs
This block defines a function to generate pairs of items from each order, counts the frequency of these pairs, and calculates their support. It then filters pairs that meet the minimum support threshold.

In [None]:
# function to generate item pairs from orders
def itemset_generator(orders):
    """Yield every 2-item combination found within each order.

    Args:
        orders: pandas Series of items indexed by order id; after
            ``reset_index()`` the first column is the order id and the
            second column is the item.

    Yields:
        Tuples ``(item_a, item_b)`` in the order the items appear within
        their order.  Orders are processed in order of first appearance.

    Rows are collected per order id in a dict rather than with
    ``itertools.groupby``, which only groups *consecutive* equal keys and
    would split an order whose rows are not contiguous.

    NOTE: an item listed several times in one order still produces repeated
    pairs, and (A, B) / (B, A) are distinct tuples depending on row order.
    """
    baskets = {}
    for row in orders.reset_index().values:
        baskets.setdefault(row[0], []).append(row[1])
    for item_list in baskets.values():
        yield from combinations(item_list, 2)

In [None]:
# creating the (lazy) generator of item pairs over the filtered orders
itemsets_gen = itemset_generator(orders=orders_over_two)

In [None]:
# tallying how often each item pair occurs and storing the counts in a 'frequencyAC' column
itemsets = pd.Series(Counter(itemsets_gen)).rename("frequencyAC").to_frame()

In [None]:
# computing the support of each pair as a percentage of the orders with two or more items
itemsets['supportAC'] = itemsets['frequencyAC'].div(len(orders_over_two_index)).mul(100)

In [None]:
# dropping the pairs whose support falls below the minimum support threshold
itemsets = itemsets.loc[itemsets.supportAC >= min_support]

In [None]:
# turning the two index levels into regular columns named 'antecedents' and 'consequents'
itemsets = (
    itemsets
    .reset_index()
    .rename(columns={'level_0': 'antecedents', 'level_1': 'consequents'})
)

#### Merging Statistics and Calculating Metrics
This block merges statistics for antecedents and consequents, calculates confidence and lift metrics for the association rules, and selects relevant columns for the final rules DataFrame.

In [None]:
# attaching the per-item statistics twice: first for the antecedent item ...
itemsets = itemsets.merge(
    statistics.rename(columns={'frequency': 'freqA', 'support': 'antecedent support'}),
    left_on='antecedents',
    right_index=True,
)
# ... and then for the consequent item
itemsets = itemsets.merge(
    statistics.rename(columns={'frequency': 'freqC', 'support': 'consequents support'}),
    left_on='consequents',
    right_index=True,
)

In [None]:
# calculating confidence and lift metrics for the association rules
# confidence is a ratio of two percentages, so the percent scaling cancels out
itemsets['confidenceAtoC'] = itemsets['supportAC'] / itemsets['antecedent support']
itemsets['confidenceCtoA'] = itemsets['supportAC'] / itemsets['consequents support']
# lift = P(A and C) / (P(A) * P(C)); all three supports are stored in percent,
# so one factor of 100 must be restored, otherwise the lift is 100 times too small
itemsets['lift'] = itemsets['supportAC'] * 100 / (itemsets['antecedent support'] * itemsets['consequents support'])

In [None]:
# narrowing the frame down to the columns reported in the final rules
itemsets = itemsets[[
    'antecedents',
    'consequents',
    'antecedent support',
    'consequents support',
    'confidenceAtoC',
    'lift',
]]

#### Finalizing and Sorting Rules
This block stores the final rules DataFrame, filters rules with confidence greater than 0.50, resets the index for readability, and sorts the rules by lift in descending order.

In [None]:
# storing the final rules DataFrame
# (plain assignment: `rules` refers to the same object as `itemsets`, and it
# replaces the `rules` produced earlier by mlxtend's association_rules)
rules = itemsets

In [None]:
# keeping the rules whose antecedent-to-consequent confidence exceeds 0.50
rules_over_50 = rules[rules['confidenceAtoC'] > 0.50]

In [None]:
# renumbering the rows: 'antecedents' is moved into the index and straight back out,
# which leaves it as the first column and gives the frame a fresh default index
rules_over_50 = rules_over_50.set_index('antecedents').reset_index()

In [None]:
# ordering the rules from highest to lowest lift
rules_over_50 = rules_over_50.sort_values(by='lift', ascending=False)