# Step 1: Importing Required Libraries..

In [None]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Step 2: Dataset Loading & Preprocessing

In [None]:
# Load the raw transactions. header=None means the first line is treated as
# data and the columns are numbered from 0.
# NOTE(review): sep=";" presumably keeps each comma-separated basket in a single
# column (later cells split column 0 on ","); confirm against the file.
dataset = pd.read_csv("../input/groceries/groceries.csv", sep=";", header= None)

# Setting header=None tells pandas that the CSV file has no header row.

In [None]:
dataset.head()

for above output:

* Each row is one transaction.

* The products in each row are the items purchased by one buyer/customer.

 ### Finding out all unique items available at grocery.

In [None]:
# Collect every distinct item name, in order of first appearance.
unique_items_list = []

# Companion set: O(1) membership tests instead of scanning the growing list.
seen_items = set()

for index, row in dataset.iterrows():

    # A row may hold one or more cells; missing cells (NaN) carry no items
    # and are skipped instead of breaking the split below.
    for cell in row.dropna():

        # Each cell is one comma-separated basket of item names.
        for item in str(cell).split(','):

            # Keep only the first occurrence of each item.
            if item not in seen_items:
                seen_items.add(item)
                unique_items_list.append(item)

 ### Now, Generating empty Dataframe with unique_items_list elements as column names.

In [None]:
# Empty frame (no rows yet) with one column per distinct item name.
df_apriori = pd.DataFrame(columns=unique_items_list)

In [None]:
df_apriori

In [None]:
# Work on a copy so that df_apriori itself stays an empty column template.
dataset1 =df_apriori.copy()

 ### Going through the items of the main dataset again & assigning them to their respective columns.

In [None]:
## One-hot encode every transaction: a cell is 1 when the item named by the
## column occurs in that row's basket, and 0 otherwise.

# Map each item name to its column position once, so a lookup replaces the
# scan over all columns for every item.
column_position = {name: pos for pos, name in enumerate(dataset1.columns)}

# Fill one integer matrix and build the frame in a single step. Assigning row
# by row through `.at` (a scalar accessor) is slow and leaves object dtype.
one_hot_matrix = np.zeros((len(dataset), len(dataset1.columns)), dtype=int)

for row_number, basket in enumerate(dataset[0]):
    for item_name in str(basket).split(','):
        pos = column_position.get(item_name)
        # Names that are not a known column are ignored.
        if pos is not None:
            one_hot_matrix[row_number, pos] = 1

# Keep the transaction index of the source frame and the existing item columns.
dataset1 = pd.DataFrame(one_hot_matrix, index=dataset.index, columns=dataset1.columns)

# The TransactionEncoder (from mlxtend) is a faster way to do all of this.

In [None]:
dataset1.head()

In [None]:
# shape of the dataset1

dataset1.shape

In [None]:
dataset1.info()

In [None]:
# Store the 0/1 flags as unsigned 8-bit integers (the frame may still hold them
# with the generic 'object' dtype at this point), then re-check the dtypes.

dataset1 = dataset1.astype('uint8')
dataset1.info()

# Step 3: EDA

In [None]:
dataset1.head()

 ### Checking for Purchased and not purchased item qty. details to get insights

In [None]:
# Per-item counts: `one` = transactions containing the item, `zero` = the rest.
# Summing the 0/1 column counts the ones directly. value_counts() is ordered by
# frequency, not by value, so indexing its result by position only works while
# 0 is the majority, and fails when a column holds a single distinct value.
total_transactions = len(dataset1)
one = [int(dataset1[i].sum()) for i in df_apriori.columns]
zero = [total_transactions - purchased for purchased in one]

In [None]:
# Two-row summary table: first row the `zero` counts, second row the `one`
# counts, with one column per item.
count_df = pd.DataFrame(data=[zero, one], columns=df_apriori.columns)

In [None]:
count_df.head()

In [None]:
# Give the two rows readable labels: row 0 holds the `zero` counts and
# row 1 holds the `one` counts.

count_df.index = ['Not_Purchased', 'Purchased']
count_df

The grocery shop stocks a total of 169 distinct items.

In [None]:
# Report the most and the least frequently purchased products.
# Both figures are read from the 'Purchased' row by label, so the number printed
# beside each item is that item's own purchase count.

purchased_counts = count_df.loc['Purchased']
print('maximum purchased item:',purchased_counts.idxmax(),':',purchased_counts.max())
print('minimum purchased item:',purchased_counts.idxmin(),':',purchased_counts.min())

In [None]:
# Order the item columns by purchase count (largest first), then flip the table
# so that each item becomes a row.

sorted_df = count_df.sort_values(by='Purchased', axis=1, ascending=False).T
sorted_df.head(20)

In [None]:
# Add a 'Purchased%' column to sorted_df: each item's purchase count divided by
# the total of all purchase counts.

sorted_df['Purchased%'] = sorted_df['Purchased'].div(sorted_df['Purchased'].sum())
sorted_df.head()

In [None]:
# Average of the 'Purchased%' column, used as a rough guide for choosing min_support.
# NOTE(review): 'Purchased%' is each item's share of all item purchases, so the shares
# sum to 1 and this mean is simply 1 / (number of items). Support is measured against
# the number of transactions instead; confirm this is the intended yardstick.

np.mean(sorted_df['Purchased%'])

In [None]:
# Bar chart of the 50 most purchased products.

# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on that Axes
# explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(20,10))
purchased = sorted_df['Purchased'].head(50)
purchased.plot(kind='bar', fontsize=16, ax=ax)
ax.set_title('Purchased top Count', fontsize=30)
ax.set_xlabel('Products', fontsize=20)
ax.set_ylabel('total qty. purchased', fontsize=20)
plt.show()

 ### The most-purchased products are daily-need items; the number of transactions for each of them is above 1000.

'whole milk', 'other vegetables', 'rolls/buns', 'soda', 'yogurt','bottled water', 'root vegetables', 'tropical fruit'

# Step 4: Apriori Rule
***

ref. used: https://www.kdnuggets.com/2016/04/association-rules-apriori-algorithm-tutorial.html

## 4.1 Measure 1: Support 
***

 #### Concept:

![](https://github.com/ShrikantUppin/Association_Rules/blob/main/measure1_formula.png?raw=true)

<img src="https://github.com/ShrikantUppin/Association_Rules/blob/main/measure1.png?raw=true" width="300" height="300">


 This says how popular an itemset is, as measured by the proportion of transactions in which an itemset appears. In Table 1 below, the support of {apple} is 4 out of 8, or 50%. Itemsets can also contain multiple items. For instance, the support of {apple, beer, rice} is 2 out of 8, or 25%.


***

 ### Finding the support of each product and of each product set present in the transaction dataframe (dataset1)

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine the frequent itemsets of the one-hot transaction table.
# min_support=0.02 keeps itemsets found in at least 2% of the transactions;
# use_colnames=True reports item names rather than column positions.
freq_items = apriori(dataset1, min_support=0.02, use_colnames=True, max_len=5)

# min_support can be chosen to suit the user/business need.
# max_len caps the itemset size: here no itemset may contain more than 5 items.

In [None]:
freq_items.shape

<p align='justify'> This says how popular an itemset is, as measured by the proportion of transactions in which an itemset appears.</p> 

For example:

In the table sorted_df, the support of {whole milk} is 2513 out of a total of 9835 transactions, i.e. 25.55%. 


Itemsets can also contain multiple items. For instance, the support of {bottled water, soda} is 285 out of 9835, or 2.89%

In [None]:
# checking first 10 rows

freq_items.head(10)

In [None]:
# checking last 10 rows 

freq_items.tail(10)

If you discover that sales of items beyond a certain proportion tend to have a significant impact on your profits, you might consider using that proportion as your support threshold. You may then identify itemsets with support values above this threshold as significant itemsets.

***
***

## 4.2 Measure 2: Confidence
***
***

 #### Concept:

 This says how likely item Y is purchased when item X is purchased, expressed as {X -> Y}. This is measured by the proportion of transactions with item X, in which item Y also appears. In Table 1, the confidence of {apple -> beer} is 3 out of 4, or 75%.



<img src="https://github.com/ShrikantUppin/Association_Rules/blob/main/measure2.png?raw=true" >

***
* Drawbacks of Confidence measure:
***

* it might misrepresent the importance of an association. 

* This is because it only accounts for how popular apples are, but not beers. If beers are also very popular in general, there will be a higher chance that a transaction containing apples will also contain beers, thus inflating the confidence measure. 


Note: To account for the base popularity of both constituent items, we use a third measure called lift.

 ### Building Association rules using confidence metrics.

In [None]:
# The rules are derived from the frequent-itemset (support) table built for
# measure 1, i.e. freq_items.

confidence_association = association_rules(freq_items, metric='confidence', min_threshold=0.2)

# min_threshold is the minimum value of the chosen metric. Here 20% is chosen,
# so a rule is kept only when its confidence is at least 20%.

In [None]:
# checking combination in first 10 rows from dataset

confidence_association.head(10)

In [None]:
# Scratch calculation: express the value 0.028978 as a percentage.
# NOTE(review): presumably the {bottled water, soda} support quoted in the notes — confirm.
0.028978*100

In [None]:
# checking combination in last 10 rows from dataset

confidence_association.tail(10)

 ### Understanding terminologies:
 ***
 
 [Basic Terminology](https://michael.hahsler.net/research/recommender/associationrules.html#:~:text=Leverage%20measures%20the%20difference%20of,expected%20from%20the%20independent%20sells)

 #### 1 . Antecedent and Consequent
 
The IF component of an association rule is known as the antecedent. The THEN component is known as the consequent. The antecedent and the consequent are disjoint; they have no items in common.


 #### 2. antecedent support
 
 The fraction of all transactions that contain the antecedent.
 
 
 #### 3. consequent support

 The fraction of all transactions that contain the consequent.
 
 
 #### 4. Support:
 
 Here support is considered for antecedent+consequent combination.
 
 
 #### 5. confidence
 
 Confidence is the fraction of the transactions containing the antecedent item (or item set) that also contain the consequent item (or item combination).
 
 
 #### 6. lift
 
Lift measures how many times more often X and Y occur together than expected if they were statistically independent. Lift is not downward closed and does not suffer from the rare item problem.
 
 In short, it indicates how much more likely the consequent is to be bought when the antecedent item is purchased by the customer, compared with how often the consequent is bought in general.
 
 
 #### 7. Leverage
 
 Leverage measures the difference between how often X and Y appear together in the data set and what would be expected if X and Y were statistically independent. The rationale in a sales setting is to find out how many more units (items X and Y together) are sold than expected from independent sales.
 
 leverage also can suffer from the rare item problem.
 
 leverage(X -> Y) = P(X and Y) - (P(X)P(Y))
 
 
 #### 8. conviction
 
 conviction(X -> Y) = P(X)P(not Y)/P(X and not Y)=(1-sup(Y))/(1-conf(X -> Y))

Conviction compares the probability that X appears without Y if they were independent with the actual frequency of the appearance of X without Y. In that respect it is similar to lift (see the section about lift on this page); however, in contrast to lift it is a directed measure. Furthermore, conviction is monotone in confidence and lift.


#### 9. Coverage

coverage(X) = P(X) = sup(X)

A simple measure of how often a item set appears in the data set.

## 4.3 Measure 3: Lift 
***

 #### Concept:
    
This says how likely item Y is purchased when item X is purchased, while controlling for how popular item Y is. In Table 1, the lift of {apple -> beer} is 1,which implies no association between items. A lift value greater than 1 means that item Y is likely to be bought if item X is bought, while a value less than 1 means that item Y is unlikely to be bought if item X is bought.


<img src="https://github.com/ShrikantUppin/Association_Rules/blob/main/measure3.png?raw=true" >

 ### Building Association rules using the lift metric.

In [None]:
# Derive rules from the same frequent itemsets, this time filtering on lift:
# min_threshold=1 keeps the rules whose lift is at least 1.
lift_association = association_rules(freq_items, metric="lift", min_threshold=1)

In [None]:
lift_association.shape

In [None]:
lift_association.head(10)

In [None]:
lift_association.tail(5)

## 4.4 Eliminating redundant rule sets...
 

In [None]:
# In the output above the same item set shows up twice with the roles swapped
# (for example A-->B in one row and B-->A in the next). Both rows share the
# same leverage and lift; only the confidence differs. Such mirrored rules are
# redundant. As a first step, order the rules by leverage and then by
# confidence (both descending) and renumber the rows from 0.

redundancy = (
    lift_association
    .sort_values(by=['leverage', 'confidence'], axis=0, ascending=False)
    .reset_index(drop=True)
)
redundancy.shape
redundancy.head()

In [None]:
redundancy.tail()

In [None]:
# Mirrored rules (A-->B and B-->A) have the same leverage and lift, so only one
# of each pair is kept. `redundancy` is ordered by leverage and then confidence
# (descending), so the first rule met for a given pair has the higher confidence.
#
# Mirrors are identified by their unordered {antecedent, consequent} pair rather
# than by row position. Taking every second row only works while each pair sits
# on adjacent rows, and a leverage tie between different pairs breaks that.

seen_pairs = set()
keep_rule = []
for antecedent, consequent in zip(redundancy['antecedents'], redundancy['consequents']):
    # Same key for A-->B and B-->A.
    pair = frozenset((antecedent, consequent))
    keep_rule.append(pair not in seen_pairs)
    seen_pairs.add(pair)

unique_rules = redundancy[keep_rule]
unique_rules.shape

In [None]:
unique_rules.head(10)

 ### Summary:
 
 
 * freq_items = apriori(dataset1, min_support=0.02, use_colnames=True, max_len=5)
 
 

 * confidence_association = association_rules(freq_items, metric='confidence', min_threshold=0.2)
 
 
 * lift_association = association_rules(freq_items, metric="lift", min_threshold=1)
 

# Step 5: Generated Rules analysis/Processing

We have obtained unique_rules with metric='lift'. Now, this unique_rules dataframe will be used for analysis..just filtering as per threshold value set/required & obtaining diff. pairs of item sets.


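Note: the lift threshold was set to 1 in the previous code. A lift equal to 1 means the antecedent and the consequent are independent, while a lift greater than 1 means the customer is more likely to pick the consequent items when the antecedent is bought.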

In [None]:
# Keep only the rules whose lift exceeds 1.5.
top_20 = unique_rules.loc[unique_rules['lift'] > 1.5]

 ### Top combinations w.r.t. lift: all rules with lift greater than 1.5

In [None]:
# Highest lift first; the previous row labels move into an 'index' column.
top_20_sort = top_20.sort_values(by='lift', ascending=False).reset_index()

In [None]:
# Discard the 'index' column that holds the old row labels.
top_20_sort = top_20_sort.drop(columns=['index'])

In [None]:
top_20_sort.head()

 ### Unique item names from the top lift combinations..

In [None]:
# Only the two item-set columns are needed to list the items involved.
x = top_20_sort[['antecedents','consequents']]

In [None]:
# Flatten the antecedent item sets into one list of item names
# (an item is repeated when it occurs in several rules).
item_list = [
    product
    for antecedent in x['antecedents'].to_list()
    for product in set(antecedent)
]

In [None]:
# Append the item names of every consequent item set to the same list.
item_list.extend(
    product
    for consequent in x['consequents'].to_list()
    for product in set(consequent)
)

In [None]:
def unique(list1):
    """Print the distinct elements of *list1* as a list.

    Duplicates are removed and the remaining elements are printed in the
    order in which they first appear, so the output is the same on every run.

    Parameters
    ----------
    list1 : iterable of hashable
        Items to de-duplicate.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # dict keys are unique and keep insertion order, unlike a set.
    top_items = list(dict.fromkeys(list1))
    print(top_items)

In [None]:
unique(item_list)

The items listed above are the products that appear in the top lift rules; combinations of these items give the highest lift results.

***
***