In [24]:
from sklearn.datasets import load_wine
import pandas as pd

from mlxtend.frequent_patterns import apriori

In [23]:
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

df = df.sample(frac=1)

df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0,0
119,12.0,3.43,2.0,19.0,87.0,2.0,1.64,0.37,1.87,1.28,0.93,3.05,564.0,1
117,12.42,1.61,2.19,22.5,108.0,2.0,2.09,0.34,1.61,2.06,1.06,2.96,345.0,1
42,13.88,1.89,2.59,15.0,101.0,3.25,3.56,0.17,1.7,5.43,0.88,3.56,1095.0,0
96,11.81,2.12,2.74,21.5,134.0,1.6,0.99,0.14,1.56,2.5,0.95,2.26,625.0,1


In [27]:
for col in data.feature_names:
    df[col] = df[col] > df[col].median()
    
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
8,True,False,False,False,False,True,True,False,True,True,True,True,True,0
119,False,True,False,False,False,False,False,True,True,False,False,True,False,1
117,False,False,False,True,True,False,False,False,True,False,True,True,False,1
42,True,True,True,False,True,True,True,False,True,True,False,True,True,0
96,False,True,True,True,True,False,False,False,True,False,False,False,False,1


In [29]:
target = df['target']
df = df.drop('target', axis=1)

In [32]:
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True)

In [33]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.477528,(alcohol)
1,0.5,(malic_acid)
2,0.483146,(ash)
3,0.466292,(alcalinity_of_ash)
4,0.460674,(magnesium)
5,0.5,(total_phenols)
6,0.5,(flavanoids)
7,0.466292,(nonflavanoid_phenols)
8,0.5,(proanthocyanins)
9,0.5,(color_intensity)


In [40]:
from itertools import chain, combinations

# Function to get all subsets of a set
def get_subsets(s):
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)+1))


# Generate the new list
new_list = set()
for item in frequent_itemsets['itemsets']:
    if len(item) == 1:
        new_list.add(item)
    else:
        subsets = map(frozenset, get_subsets(item))
        new_list.update(subsets)
        
new_list = list(new_list)
new_list = [list(item) for item in new_list]
new_list

[['hue'],
 ['nonflavanoid_phenols'],
 ['od280/od315_of_diluted_wines'],
 ['magnesium'],
 ['total_phenols', 'flavanoids'],
 ['alcalinity_of_ash'],
 ['proline'],
 ['alcohol'],
 ['total_phenols'],
 ['flavanoids'],
 ['proanthocyanins'],
 ['malic_acid'],
 ['od280/od315_of_diluted_wines', 'flavanoids'],
 ['color_intensity'],
 ['ash']]

In [42]:
univar_list, multivar_list = [], []
for item in new_list:
    if len(item) == 1:
        univar_list.append(item[0])
    else:
        multivar_list.append(item)

univar_list, multivar_list

(['hue',
  'nonflavanoid_phenols',
  'od280/od315_of_diluted_wines',
  'magnesium',
  'alcalinity_of_ash',
  'proline',
  'alcohol',
  'total_phenols',
  'flavanoids',
  'proanthocyanins',
  'malic_acid',
  'color_intensity',
  'ash'],
 [['total_phenols', 'flavanoids'],
  ['od280/od315_of_diluted_wines', 'flavanoids']])

In [44]:
df = df[univar_list]

In [45]:
for item in multivar_list:
    # set it true if all the items in the itemset are true
    df[str(item)] = df[item].all(axis=1)

df.head()

Unnamed: 0,hue,nonflavanoid_phenols,od280/od315_of_diluted_wines,magnesium,alcalinity_of_ash,proline,alcohol,total_phenols,flavanoids,proanthocyanins,malic_acid,color_intensity,ash,"['total_phenols', 'flavanoids']","['od280/od315_of_diluted_wines', 'flavanoids']"
8,True,False,True,False,False,True,True,True,True,True,False,True,False,True,True
119,False,True,True,False,False,False,False,False,False,True,True,False,False,False,False
117,True,False,True,True,True,False,False,False,False,True,False,False,False,False,False
42,False,False,True,True,False,True,True,True,True,True,True,True,True,True,True
96,False,False,False,True,True,False,False,False,False,True,True,False,True,False,False


In [48]:
# importing necessary libraries 
import pandas as pd 
import numpy as np 
# !pip install kmodes 
from kmodes.kmodes import KModes 
import matplotlib.pyplot as plt 
%matplotlib inline 

# Building the model with 3 clusters 
kmode = KModes(n_clusters=3, init = "random", n_init = 5, verbose=1) 
clusters = kmode.fit_predict(df) 
# clusters

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 26, cost: 557.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 23, cost: 557.0
Run 2, iteration: 2/100, moves: 2, cost: 557.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 37, cost: 552.0
Run 3, iteration: 2/100, moves: 1, cost: 552.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 7, cost: 623.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 66, cost: 557.0
Run 5, iteration: 2/100, moves: 4, cost: 557.0
Best run was number 3


In [51]:
clusters, target.values

(array([0, 1, 1, 0, 2, 2, 2, 1, 2, 1, 1, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1, 0,
        2, 2, 2, 0, 1, 0, 1, 2, 0, 2, 2, 2, 0, 0, 1, 0, 1, 2, 0, 2, 2, 1,
        0, 2, 2, 1, 2, 0, 2, 2, 2, 0, 2, 0, 0, 1, 1, 2, 0, 0, 0, 2, 2, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 2, 0, 0, 1, 2, 0, 0, 1, 1, 1, 0, 2, 2, 0,
        0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 1, 2,
        1, 2, 2, 0, 2, 2, 0, 0, 0, 1, 2, 2, 0, 0, 1, 2, 1, 2, 2, 0, 1, 2,
        0, 0, 2, 0, 0, 1, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 0, 2, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 2,
        1, 0], dtype=uint16),
 array([0, 1, 1, 0, 1, 2, 1, 1, 2, 1, 1, 0, 2, 2, 1, 0, 1, 1, 0, 1, 1, 1,
        2, 2, 2, 0, 1, 1, 0, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, 2, 1, 1, 2, 1,
        1, 2, 2, 2, 2, 0, 1, 2, 2, 0, 2, 1, 0, 1, 1, 1, 1, 0, 1, 2, 2, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 2, 0, 0, 1, 1, 1, 0, 1, 2, 0,
        0, 1, 0, 2, 0, 0, 1, 2, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 1, 2,
        

In [50]:
## mapping the clusters to target variable
# new_clusters = []
# for i in clusters:
#     if i == 0:
#         new_clusters.append(1)
#     elif i == 1:
#         new_clusters.append(0)
#     else:
#         new_clusters.append(2)

In [52]:
(clusters == target.values).sum() / len(target.values)

0.8033707865168539