# Data Science Basics - Apriori Algorithm - Online Retail Dataset

In [None]:
import pandas as pd

# Install mlxtend into a specific Anaconda environment by running the
# following command in a terminal:
# conda install --name datascience -c conda-forge mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

# Read the retail workbook from the local data directory into a DataFrame.
df = pd.read_excel(io="data/Online_Retail.xlsx")
# Preview the first five rows.
df.head(n=5)

## Cleaning the data: strip whitespace from the descriptions, drop rows that have no invoice number, and remove negative-quantity transactions (invoices whose number starts with "C")

In [None]:
# Trim leading/trailing whitespace from the product descriptions.
df["Description"] = df["Description"].str.strip()
# Discard rows that carry no invoice number.
df.dropna(axis=0, subset=["InvoiceNo"], inplace=True)
# Cast to str so the prefix test below applies to every invoice number.
df["InvoiceNo"] = df["InvoiceNo"].astype("str")
# Keep only invoices whose number does not start with "C". `~` is the
# logical NOT for a boolean mask, and startswith anchors the match at the
# beginning of the invoice number instead of matching "C" anywhere in it.
df = df[~df["InvoiceNo"].str.startswith("C")]
df

## Consolidating the items into one row per transaction, with one column per product; to keep the data small, we only look at sales in France

In [None]:
# Keep only the rows whose country is France.
france_only = df.loc[df["Country"] == "France"]
# Total quantity of each product description on each invoice.
item_totals = france_only.groupby(["InvoiceNo", "Description"])["Quantity"].sum()
# Spread the descriptions into columns (one row per invoice) and put 0
# wherever an invoice does not contain a product.
basket = (
    item_totals.unstack()
    .reset_index()
    .fillna(0)
    .set_index("InvoiceNo")
)
basket

## Making sure any positive quantity is converted to 1 and any zero or negative quantity is converted to 0

In [None]:
def encode_units(x):
    """Binarise a quantity: 1 if it is positive, otherwise 0.

    Args:
        x: A numeric quantity.

    Returns:
        1 when ``x`` is strictly positive; 0 when it is zero, negative or NaN.
    """
    # A single comparison guarantees every input maps to 0 or 1 (fractions
    # between 0 and 1 and NaN included) instead of falling through to an
    # implicit None.
    return 1 if x > 0 else 0


# Binarise every cell with encode_units. Series.map is applied column by
# column because DataFrame.applymap is deprecated since pandas 2.1.
basket_sets = basket.apply(lambda column: column.map(encode_units))
# Drop the POSTAGE column: it is of little use for finding associations.
basket_sets = basket_sets.drop(columns="POSTAGE")
basket_sets

## Choosing the minimum support and the lift threshold arbitrarily

In [None]:
# Mine the itemsets whose support is at least 0.07, labelled with the
# product names (column names) rather than column indices.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.07,
)
# Derive association rules from those itemsets, keeping rules with lift >= 1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules.head()

## Filtering the rules with additional constraints on lift and confidence

In [None]:
# Show only the rules with lift of at least 6 and confidence of at least 0.8.
rules.loc[rules["lift"].ge(6) & rules["confidence"].ge(0.8)]