# Veri temizliği-veriyi anlamak

In [1]:
import pandas as pd
import collections

In [2]:
df = pd.read_csv("dataset.csv")
df = df.drop("Date", axis=1)
df = df.rename(columns={"Member_number":"member","itemDescription": "item"})

In [3]:
df

Unnamed: 0,member,item
0,1808,tropical fruit
1,2552,whole milk
2,2300,pip fruit
3,1187,other vegetables
4,3037,whole milk
...,...,...
38760,4471,sliced cheese
38761,2022,candy
38762,1097,cake bar
38763,1510,fruit/vegetable juice


In [4]:
print("Unique member sayısı: ",len(df["member"].unique()))
print("Unique item sayısı: ", len(df["item"].unique()))

Unique member sayısı:  3898
Unique item sayısı:  167


# Item'ların gruplanması

In [5]:
data = df.groupby("member")["item"].apply(lambda x: ",".join(x)).reset_index()
data["item"] = data["item"].apply(lambda x: sorted(x.split(',')))
data = data.drop("member", axis=1)

In [6]:
data

Unnamed: 0,item
0,"[canned beer, hygiene articles, misc. beverage..."
1,"[beef, curd, frankfurter, frankfurter, rolls/b..."
2,"[butter, butter milk, frozen vegetables, other..."
3,"[dental care, detergent, frozen meals, rolls/b..."
4,"[canned beer, chocolate, chocolate, cling film..."
...,...
3893,"[bottled beer, bottled water, decalcifier, des..."
3894,"[canned beer, curd, grapes, tropical fruit, wh..."
3895,"[curd, rolls/buns]"
3896,"[berries, berries, bottled water, butter milk,..."


# Minimum support (minsup) ve minimum confidence (minconf) değerleri giriş parametreleri olarak kullanıcı tarafından belirlenecektir.

In [7]:
min_frequency = int(input("Enter minimum frequency: "))
min_support = int(input("Enter minimum support: "))

Enter minimum frequency: 15
Enter minimum support: 20


# 1. Yaygın öğeler/sık satılan ürünler (frequent itemsets problem) problemini çözen bir Brute-Force yaklaşımı geliştiriniz

In [8]:
import itertools

#sepetlerin list of lists şekline getirilmesi
transactions = data['item'].tolist()

#listelerin stringe dönüştürülmesi
transactions = [[str(item) for item in sublist] for sublist in transactions]

#dataframe'deki tüm unique değerleirn bulunması
unique_items = set([item for sublist in transactions for item in sublist])

#Set a minimum support threshold
min_support = 20

#ürün -> ürün şeklinde kurallar oluşturmak için 2li kombinasyonların bulunması
possible_rules = list(itertools.combinations(unique_items, 2))

#kuralların tutulması
rules = []

#min_support ve min_confindence değerleri dikkate alınarak olası tüm kuralların çıkarılması
for rule in possible_rules:
    first = rule[0]
    second = rule[1]
    count = 0
    
#geçerli kuralın tüm sepetlerde bulunma sıklığı
    for transaction in transactions:
        if (first in transaction) and (second in transaction):
            count += 1
            
#minimum support değerini sağlayan setlerin confidence değerinin hesaplanması
    if count >= min_support:
        confidence = count / len([transaction for transaction in transactions if first in transaction])
        rules.append((first, second, count, confidence))

#kuraların kerana yazdırılması
for rule in rules:
    first = rule[0]
    second = rule[1]
    count = rule[2]
    confidence = rule[3]
    print("Rule: " + first + " -> " + second)
    print("Support: " + str(count))
    print("Confidence: " + str(confidence))
    print("=====================================")

Rule: flour -> bottled beer
Support: 23
Confidence: 0.1619718309859155
Rule: flour -> white bread
Support: 20
Confidence: 0.14084507042253522
Rule: flour -> domestic eggs
Support: 24
Confidence: 0.16901408450704225
Rule: flour -> newspapers
Support: 27
Confidence: 0.19014084507042253
Rule: flour -> pastry
Support: 26
Confidence: 0.18309859154929578
Rule: flour -> soda
Support: 46
Confidence: 0.323943661971831
Rule: flour -> brown bread
Support: 30
Confidence: 0.2112676056338028
Rule: flour -> canned beer
Support: 27
Confidence: 0.19014084507042253
Rule: flour -> pork
Support: 28
Confidence: 0.19718309859154928
Rule: flour -> yogurt
Support: 55
Confidence: 0.3873239436619718
Rule: flour -> root vegetables
Support: 34
Confidence: 0.23943661971830985
Rule: flour -> whipped/sour cream
Support: 37
Confidence: 0.2605633802816901
Rule: flour -> frankfurter
Support: 22
Confidence: 0.15492957746478872
Rule: flour -> citrus fruit
Support: 27
Confidence: 0.19014084507042253
Rule: flour -> coffee


# ==============

# 2. Bu problemin çözümü için Apriori Algoritmasının aşağıdaki adımlarını uygulayınız.

In [9]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [10]:
df = pd.read_csv("dataset.csv")

In [11]:
apriori_data = df.groupby(["Member_number","Date"])["itemDescription"].unique()
apriori_data

Member_number  Date      
1000           15-03-2015    [sausage, whole milk, semi-finished bread, yog...
               24-06-2014                    [whole milk, pastry, salty snack]
               24-07-2015                       [canned beer, misc. beverages]
               25-11-2015                          [sausage, hygiene articles]
               27-05-2015                           [soda, pickled vegetables]
                                                   ...                        
4999           24-01-2015    [tropical fruit, berries, other vegetables, yo...
               26-12-2015                               [bottled water, herbs]
5000           09-03-2014                      [fruit/vegetable juice, onions]
               10-02-2015         [soda, root vegetables, semi-finished bread]
               16-11-2014                     [bottled beer, other vegetables]
Name: itemDescription, Length: 14963, dtype: object

In [12]:
transactions_encoder = TransactionEncoder()
transactions_encoder_array = transactions_encoder.fit(apriori_data).transform(apriori_data)
transactions = pd.DataFrame(transactions_encoder_array, columns=transactions_encoder.columns_)

In [13]:
transactions_encoder_array

array([[False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [14]:
transactions_encoder.columns_

['Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese ',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen meals',
 'froze

In [15]:
transactions

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
items = apriori(transactions, min_support=0.003, use_colnames=True, verbose=1)
a = items.sort_values("support",ascending = False)
a.head(10)

Processing 1530 combinations | Sampling itemset size 32


Unnamed: 0,support,itemsets
108,0.157923,(whole milk)
67,0.122101,(other vegetables)
82,0.110005,(rolls/buns)
91,0.097106,(soda)
109,0.085879,(yogurt)
83,0.069572,(root vegetables)
101,0.067767,(tropical fruit)
7,0.060683,(bottled water)
86,0.060349,(sausage)
22,0.053131,(citrus fruit)


In [17]:
rules = association_rules(items, metric="confidence", min_threshold=0.001)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(beef),(whole milk),0.03395,0.157923,0.004678,0.137795,0.872548,-0.000683,0.976656
1,(whole milk),(beef),0.157923,0.03395,0.004678,0.029623,0.872548,-0.000683,0.995541
2,(other vegetables),(bottled beer),0.122101,0.045312,0.004678,0.038314,0.845568,-0.000854,0.992724
3,(bottled beer),(other vegetables),0.045312,0.122101,0.004678,0.103245,0.845568,-0.000854,0.978973
4,(bottled beer),(rolls/buns),0.045312,0.110005,0.00401,0.088496,0.804471,-0.000975,0.976403


In [18]:
rules.sort_values('confidence', ascending = False, inplace=True)
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(bottled beer),(whole milk),0.045312,0.157923,0.007151,0.157817,0.99933,-5e-06,0.999874
182,(sausage),(whole milk),0.060349,0.157923,0.008955,0.148394,0.939663,-0.000575,0.988811
101,(newspapers),(whole milk),0.038896,0.157923,0.005614,0.14433,0.913926,-0.000529,0.984114
70,(domestic eggs),(whole milk),0.037091,0.157923,0.00528,0.142342,0.901341,-0.000578,0.981834
88,(hamburger meat),(whole milk),0.021854,0.157923,0.003074,0.140673,0.890769,-0.000377,0.979926
78,(frankfurter),(whole milk),0.03776,0.157923,0.00528,0.139823,0.885388,-0.000683,0.978958
0,(beef),(whole milk),0.03395,0.157923,0.004678,0.137795,0.872548,-0.000683,0.976656
73,(frankfurter),(other vegetables),0.03776,0.122101,0.005146,0.136283,1.11615,0.000536,1.01642
82,(frozen vegetables),(whole milk),0.028002,0.157923,0.003809,0.136038,0.861422,-0.000613,0.974669
149,(pork),(whole milk),0.037091,0.157923,0.005012,0.135135,0.855703,-0.000845,0.973652


In [19]:
rules.sort_values('lift', ascending = False, inplace=True)
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
6,(sausage),(bottled beer),0.060349,0.045312,0.003342,0.055371,1.222,0.000607,1.010649
7,(bottled beer),(sausage),0.045312,0.060349,0.003342,0.073746,1.222,0.000607,1.014464
73,(frankfurter),(other vegetables),0.03776,0.122101,0.005146,0.136283,1.11615,0.000536,1.01642
72,(other vegetables),(frankfurter),0.122101,0.03776,0.005146,0.042146,1.11615,0.000536,1.004579
184,(sausage),(yogurt),0.060349,0.085879,0.005748,0.095238,1.108986,0.000565,1.010345
185,(yogurt),(sausage),0.085879,0.060349,0.005748,0.066926,1.108986,0.000565,1.007049
129,(pastry),(sausage),0.051728,0.060349,0.003208,0.062016,1.027617,8.6e-05,1.001777
128,(sausage),(pastry),0.060349,0.051728,0.003208,0.053156,1.027617,8.6e-05,1.001509
179,(soda),(sausage),0.097106,0.060349,0.005948,0.061253,1.014975,8.8e-05,1.000963
178,(sausage),(soda),0.060349,0.097106,0.005948,0.09856,1.014975,8.8e-05,1.001613


In [20]:
rules.sort_values('support', ascending = False, inplace=True)
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
122,(other vegetables),(whole milk),0.122101,0.157923,0.014837,0.121511,0.76943,-0.004446,0.958551
123,(whole milk),(other vegetables),0.157923,0.122101,0.014837,0.093948,0.76943,-0.004446,0.968928
162,(whole milk),(rolls/buns),0.157923,0.110005,0.013968,0.088447,0.804028,-0.003404,0.97635
163,(rolls/buns),(whole milk),0.110005,0.157923,0.013968,0.126974,0.804028,-0.003404,0.96455
196,(whole milk),(soda),0.157923,0.097106,0.011629,0.073635,0.758296,-0.003707,0.974663
197,(soda),(whole milk),0.097106,0.157923,0.011629,0.119752,0.758296,-0.003707,0.956636
209,(whole milk),(yogurt),0.157923,0.085879,0.011161,0.070673,0.82294,-0.002401,0.983638
208,(yogurt),(whole milk),0.085879,0.157923,0.011161,0.129961,0.82294,-0.002401,0.967861
109,(rolls/buns),(other vegetables),0.110005,0.122101,0.010559,0.09599,0.786154,-0.002872,0.971117
108,(other vegetables),(rolls/buns),0.122101,0.110005,0.010559,0.086481,0.786154,-0.002872,0.974249


# Brute force mantığı
-Önce olası tüm item kombinasyonları belirlenmeli

bir perakende mağazasının satış verilerinde elma öğesi 1000 kez görüldüyse, elma öğesi için support değeri 1000 olur.

"eğer bir müşteri elma satın alırsa, o müşterinin portakal da satın alma olasılığı yüksektir" gibi bir ilişki kuralında, confident değeri, elma satın alan müşterilerin portakal da satın alma olasılığını gösterir. Confident değeri, 0 ile 1 arasında bir değer alır, ve 1'e ne kadar yakınsarsa ilişki kuralının doğruluğu o kadar yüksektir.

In [21]:
from itertools import chain, combinations

# Veri kümesi
data = [('MILK', 'BREAD', 'BISCUIT'),        ('BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'),        ('BREAD', 'TEA', 'BISCUIT'),        ('JAM', 'BREAD', 'TEA', 'MILK'),        ('TEA', 'MILK', 'BISCUIT')]

# Öğe listelerini oluştur
item_lists = []
for basket in data:
  item_lists.append(list(basket))

# Tüm olası öğe kümelerini oluştur
item_sets = []
for items in item_lists:
  for i in range(1, len(items)+1):
    item_sets.extend(list(combinations(items, i)))

# Öğe kümelerinin sıklıklarını say
frequencies = {}
for item_set in item_sets:
  if item_set in frequencies:
    frequencies[item_set] += 1
  else:
    frequencies[item_set] = 1

# En sık tekrar eden öğe kümelerini bul
max_frequency = max(frequencies.values())
frequent_itemsets = [item_set for item_set, frequency in frequencies.items() if frequency == max_frequency]

# Sonuçları yazdır
print("En sık tekrar eden öğe kümeleri:")
for itemset in frequent_itemsets:
  print(itemset)


En sık tekrar eden öğe kümeleri:
('MILK',)
('BREAD',)
('BISCUIT',)


In [22]:
baskets = []

for b in range(1, 101):
    basket = []
    for i in range(1, 101):
        if b % i == 0:
            basket.append(i)
            baskets.append(basket)

baskets


[[1],
 [1, 2],
 [1, 2],
 [1, 3],
 [1, 3],
 [1, 2, 4],
 [1, 2, 4],
 [1, 2, 4],
 [1, 5],
 [1, 5],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 7],
 [1, 7],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 3, 9],
 [1, 3, 9],
 [1, 3, 9],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 11],
 [1, 11],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 13],
 [1, 13],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 17],
 [1, 17],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 19],
 [1, 19],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2,