# **ASSOCIATION - MARKET BASKET ANALYSIS**

This file contains information about customers buying different grocery items at a mall.

## 1. Install & import libraries 

In [None]:
# Need to install apyori first
!pip install apyori

In [None]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from apyori import apriori
from wordcloud import WordCloud

## 2. Open our dataset

In [None]:
# Make the dataset available in the working directory.
# The upload prompt is only shown when the file is missing, so re-running the
# notebook does not block on an interactive widget every time.
import os

try:
    from google.colab import files  # only importable inside Google Colab
except ImportError:
    files = None

# Must match the file name read by pd.read_csv in the next cell.
DATASET_FILENAME = 'Market_Basket_Optimisation.csv'

if os.path.exists(DATASET_FILENAME):
    # Nothing is uploaded in this run; the file is already present.
    uploaded = {}
elif files is not None:
    uploaded = files.upload()
else:
    raise FileNotFoundError(
        f"{DATASET_FILENAME} not found: place it in the working directory, "
        "or run this notebook in Google Colab to upload it."
    )

In [None]:
# Load the basket file; header=None because the CSV has no header row,
# so the columns get integer labels.
csv_path = 'Market_Basket_Optimisation.csv'
grocery_item = pd.read_csv(csv_path, header=None)

In [None]:
# Number of rows and columns, shown as a (rows, columns) tuple
len(grocery_item.index), len(grocery_item.columns)

The dataset has 7501 rows and 20 columns.

In [None]:
# Preview 10 randomly chosen rows of the dataset.
# A fixed random_state keeps the preview identical across "Restart & Run All".
grocery_item.sample(10, random_state=42)

## 3. Data visualisation

1. Wordcloud 

In [None]:
# Word cloud of the items found in the first column (column 0) of the dataset.
#
# The cloud is built from the actual value counts of that column. Passing
# str(grocery_item[0]) instead would hand WordCloud the console repr of the
# Series, which for a long Series is truncated to a few rows and also contains
# the "Name", "Length" and "dtype" footer words rather than the data itself.
# Using frequencies also keeps multi-word item names together as one label.
# (plt and WordCloud are already imported in the import cell at the top.)
first_item_counts = grocery_item[0].value_counts().to_dict()

plt.rcParams['figure.figsize'] = (15, 15)
wordcloud = WordCloud(background_color = 'white', width = 1200,  height = 1200, max_words = 121).generate_from_frequencies(first_item_counts)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Items',fontsize = 15)
plt.show()

The word cloud is still subjective, so let's check a bar chart.

2. Bar chart

In [None]:
# Bar chart of the 40 most frequent values in the first column (column 0).
plt.rcParams['figure.figsize'] = (18, 7)
# one copper shade per bar
color = plt.cm.copper(np.linspace(0, 1, 40))
top_items = grocery_item[0].value_counts().head(40)
ax = top_items.plot.bar(color=color)
ax.set_title('frequency of most popular items', fontsize=20)
ax.tick_params(axis='x', labelrotation=90)
ax.grid()
plt.show()

3. Tree Map

In [None]:
#install suarify 
! pip install squarify


In [None]:
# Tree map of the 50 most frequent values in the first column (column 0).
import squarify

y = grocery_item[0].value_counts().head(50).to_frame()
# y is a one-column frame, so y.values is 2-D; squarify works on a flat
# sequence of sizes, so take the single count column as a 1-D array.
sizes = y.iloc[:, 0].to_numpy()

# plotting a tree map
plt.rcParams['figure.figsize'] = (20,10)
# one colour per rectangle, spanning the whole colormap
color = plt.cm.cool(np.linspace(0, 1, len(sizes)))
squarify.plot(sizes = sizes, label = y.index, alpha=.8, color = color)
plt.title('Tree Map for Popular Items in our grocery',fontsize = 20)
plt.axis('off')
plt.show()

## Data preprocessing

In [None]:
# Data preprocessing: turn every row of the frame into a list of strings.
#
# The dimensions come from the data itself instead of a hard-coded 7501 x 20,
# and the underlying array is fetched once rather than on every cell access.
# NOTE(review): str() turns a missing cell (NaN) into the literal string 'nan',
# which later steps will treat as an ordinary item - confirm this is intended.
item_matrix = grocery_item.values
transactions = [[str(item) for item in row] for row in item_matrix]

In [None]:
# Convert the list of transactions into a NumPy array
transactions = np.asarray(transactions)

In [None]:
# Print the dimensions of the transactions array
print(np.shape(transactions))

Use `TransactionEncoder` to encode the transactions; the encoded data has 7501 rows and 121 columns.

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Fit the encoder on the transactions, then encode the same transactions.
te = TransactionEncoder()
encoded = te.fit(transactions).transform(transactions)
# One column per item learned by the encoder (te.columns_).
data = pd.DataFrame(encoded, columns=te.columns_)
# (rows, columns) of the encoded frame
data.shape

121 columns is too many, so we reduce them to 40.

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Relationships across all 121 encoded columns would be messy,
# so keep only a hand-picked subset of 40 items.
selected_items = [
    'mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
    'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
    'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
    'escalope', 'fresh tuna', 'red wine', 'ham', 'cake',
    'green tea', 'whole wheat pasta', 'pancakes', 'soup', 'muffins',
    'energy bar', 'olive oil', 'champagne', 'avocado', 'pepper',
    'butter', 'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
    'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie', 'yogurt cake',
]
data = data[selected_items]
# (rows, columns) after the reduction
data.shape

In [None]:
# Show the column labels that remain after the reduction
# (DataFrame.keys() returns the column index).
data.keys()

## Apriori algorithm (association rules)

In [None]:
# Note: this import rebinds the name `apriori`, which the import cell at the
# top of the notebook bound to the function from apyori.
from mlxtend.frequent_patterns import apriori

# Items and itemsets whose support is at least 0.01, labelled by column name
apriori(df=data, min_support=0.01, use_colnames=True)

Select the itemsets of length 2 with a support of at least 0.05 (the `min_support` passed to `apriori` below is the effective threshold).

In [None]:
# Frequent itemsets with a support of at least 5%, annotated with their size.
MIN_SUPPORT = 0.05
frequent_itemsets = apriori(data, min_support = MIN_SUPPORT, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Keep the itemsets of length 2. The support condition uses the same
# MIN_SUPPORT as the mining step: a lower literal here (such as 0.01) could
# never filter anything, because apriori has already dropped every itemset
# below MIN_SUPPORT, and it would misstate the effective cut-off.
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= MIN_SUPPORT) ]

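We get three itemsets; the one with the highest support (0.059) is mineral water & spaghetti. This suggests that customers who buy spaghetti often also buy mineral water, although support only measures how often the two items appear together; confidence or lift would be needed to confirm the direction of the rule.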