# Veri temizliği-veriyi anlamak

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("dataset.csv")
df = df.drop("Date", axis=1)
df = df.rename(columns={"Member_number":"member","itemDescription": "item"})

In [3]:
df

Unnamed: 0,member,item
0,1808,tropical fruit
1,2552,whole milk
2,2300,pip fruit
3,1187,other vegetables
4,3037,whole milk
...,...,...
38760,4471,sliced cheese
38761,2022,candy
38762,1097,cake bar
38763,1510,fruit/vegetable juice


In [4]:
print("Unique member sayısı: ",len(df["member"].unique()))
print("Unique item sayısı: ", len(df["item"].unique()))

Unique member sayısı:  3898
Unique item sayısı:  167


# Item'ların gruplanması

In [5]:
data = df.groupby("member")["item"].apply(lambda x: ",".join(x)).reset_index()
data["item"] = data["item"].apply(lambda x: sorted(x.split(',')))
data = data.drop("member", axis=1)

In [9]:
data

Unnamed: 0,item
0,"[canned beer, hygiene articles, misc. beverage..."
1,"[beef, curd, frankfurter, frankfurter, rolls/b..."
2,"[butter, butter milk, frozen vegetables, other..."
3,"[dental care, detergent, frozen meals, rolls/b..."
4,"[canned beer, chocolate, chocolate, cling film..."
...,...
3893,"[bottled beer, bottled water, decalcifier, des..."
3894,"[canned beer, curd, grapes, tropical fruit, wh..."
3895,"[curd, rolls/buns]"
3896,"[berries, berries, bottled water, butter milk,..."


In [11]:
import pandas as pd
import collections

# load the dataframe

# convert the column containing lists of items into a single list
items = [item for sublist in data['item'].tolist() for item in sublist]

# use the Counter class from the collections module to count the frequency of each item
counter = collections.Counter(items)

# create a dataframe with the items and their frequencies
frequent_itemsets = pd.DataFrame(counter.items(), columns=['item', 'frequency'])

# filter the dataframe to only show itemsets that meet the minimum frequency threshold
min_frequency = 2
frequent_itemsets = frequent_itemsets[frequent_itemsets['frequency'] >= min_frequency]

# print the resulting frequent itemsets
print(frequent_itemsets)


                     item  frequency
0             canned beer        717
1        hygiene articles        208
2         misc. beverages        238
3                  pastry        785
4      pickled vegetables        134
..                    ...        ...
162        frozen chicken          5
163        salad dressing          6
164  specialty vegetables         11
165        toilet cleaner          5
166       rubbing alcohol          5

[165 rows x 2 columns]


In [18]:
import itertools

# load the dataframe


# convert the column containing lists of items into a list of lists
transactions = df['item'].tolist()

# Find all unique items in the dataframe
unique_items = set([item for sublist in transactions for item in sublist])

# Set a minimum support threshold
min_support = 2

# Generate all possible rules with a length of 2
possible_rules = list(itertools.combinations(unique_items, 2))

# Create an empty list to store the rules that meet the minimum support threshold
rules = []

# Iterate through the possible rules
for rule in possible_rules:
    antecedent = rule[0]
    consequent = rule[1]
    count = 0
    
    # Count the number of transactions that contain the rule
    for transaction in transactions:
        if (antecedent in transaction) and (consequent in transaction):
            count += 1
            
    # If the rule meets the minimum support threshold, calculate the confidence
    if count >= min_support:
        confidence = count / len([transaction for transaction in transactions if antecedent in transaction])
        rules.append((antecedent, consequent, count, confidence))

# Print the resulting association rules
for rule in rules:
    antecedent = rule[0]
    consequent = rule[1]
    count = rule[2]
    confidence = rule[3]
    print("Rule: " + antecedent + " -> " + consequent)
    print("Support: " + str(count))
    print("Confidence: " + str(confidence))
    print("=====================================")


Rule: t ->  
Support: 13195
Confidence: 0.7273579185270933
Rule: t -> e
Support: 12200
Confidence: 0.6725097844661264
Rule: t -> h
Support: 4164
Confidence: 0.22953530676368447
Rule: t -> k
Support: 1909
Confidence: 0.10523124414310127
Rule: t -> u
Support: 7459
Confidence: 0.4111680723223637
Rule: t -> I
Support: 60
Confidence: 0.00330742516950554
Rule: t -> r
Support: 14922
Confidence: 0.8225566396560278
Rule: t -> b
Support: 7923
Confidence: 0.4367454936332065
Rule: t -> c
Support: 6232
Confidence: 0.34353122760597543
Rule: t -> .
Support: 29
Confidence: 0.0015985888319276775
Rule: t -> y
Support: 3564
Confidence: 0.19646105506862907
Rule: t -> l
Support: 9674
Confidence: 0.5332671848299432
Rule: t -> n
Support: 3107
Confidence: 0.1712695000275619
Rule: t -> d
Support: 4718
Confidence: 0.2600738658287856
Rule: t -> j
Support: 518
Confidence: 0.028554103963397828
Rule: t -> p
Support: 4823
Confidence: 0.2658618598754203
Rule: t -> v
Support: 4268
Confidence: 0.2352681770574941
Rule: 

In [21]:
import itertools

# load the dataframe


# convert the column containing lists of items into a list of lists
transactions = data['item'].tolist()

# convert the items in the list to string format
transactions = [[str(item) for item in sublist] for sublist in transactions]

# Find all unique items in the dataframe
unique_items = set([item for sublist in transactions for item in sublist])

# Set a minimum support threshold
min_support = 20

# Generate all possible rules with a length of 2
possible_rules = list(itertools.combinations(unique_items, 2))

# Create an empty list to store the rules that meet the minimum support threshold
rules = []

# Iterate through the possible rules
for rule in possible_rules:
    antecedent = rule[0]
    consequent = rule[1]
    count = 0
    
    # Count the number of transactions that contain the rule
    for transaction in transactions:
        if (antecedent in transaction) and (consequent in transaction):
            count += 1
            
    # If the rule meets the minimum support threshold, calculate the confidence
    if count >= min_support:
        confidence = count / len([transaction for transaction in transactions if antecedent in transaction])
        rules.append((antecedent, consequent, count, confidence))

# Print the resulting association rules
for rule in rules:
    antecedent = rule[0]
    consequent = rule[1]
    count = rule[2]
    confidence = rule[3]
    print("Rule: " + antecedent + " -> " + consequent)
    print("Support: " + str(count))
    print("Confidence: " + str(confidence))
    print("=====================================")


Rule: frozen meals -> bottled water
Support: 60
Confidence: 0.24489795918367346
Rule: frozen meals -> butter
Support: 38
Confidence: 0.15510204081632653
Rule: frozen meals -> newspapers
Support: 41
Confidence: 0.1673469387755102
Rule: frozen meals -> brown bread
Support: 43
Confidence: 0.17551020408163265
Rule: frozen meals -> frankfurter
Support: 35
Confidence: 0.14285714285714285
Rule: frozen meals -> root vegetables
Support: 57
Confidence: 0.23265306122448978
Rule: frozen meals -> pip fruit
Support: 46
Confidence: 0.18775510204081633
Rule: frozen meals -> sausage
Support: 62
Confidence: 0.2530612244897959
Rule: frozen meals -> yogurt
Support: 81
Confidence: 0.3306122448979592
Rule: frozen meals -> citrus fruit
Support: 56
Confidence: 0.22857142857142856
Rule: frozen meals -> beef
Support: 31
Confidence: 0.12653061224489795
Rule: frozen meals -> frozen vegetables
Support: 34
Confidence: 0.13877551020408163
Rule: frozen meals -> sugar
Support: 22
Confidence: 0.08979591836734693
Rule: 

In [26]:
from apyori import apriori

# load the dataframe


# convert the column containing lists of items into a list of lists
transactions = data['item'].tolist()

# use the apriori function from the apyori library to find association rules
results = list(apriori(transactions, min_support=0.5, min_confidence=0.8))

# print the resulting association rules
for item in results:
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    print("Support: " + str(item[1]))

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")


### Hazırlanmış verisetinin kaydedilmesi

data.to_csv("item_data.csv", index=False)

# Brute force mantığı
-Önce olası tüm item kombinasyonları belirlenmeli

bir perakende mağazasının satış verilerinde elma öğesi 1000 kez görüldüyse, elma öğesi için support değeri 1000 olur.

"eğer bir müşteri elma satın alırsa, o müşterinin portakal da satın alma olasılığı yüksektir" gibi bir ilişki kuralında, confident değeri, elma satın alan müşterilerin portakal da satın alma olasılığını gösterir. Confident değeri, 0 ile 1 arasında bir değer alır, ve 1'e ne kadar yakınsarsa ilişki kuralının doğruluğu o kadar yüksektir.

In [51]:
from itertools import chain, combinations

# Veri kümesi
data = [('MILK', 'BREAD', 'BISCUIT'),        ('BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'),        ('BREAD', 'TEA', 'BISCUIT'),        ('JAM', 'BREAD', 'TEA', 'MILK'),        ('TEA', 'MILK', 'BISCUIT')]

# Öğe listelerini oluştur
item_lists = []
for basket in data:
  item_lists.append(list(basket))

# Tüm olası öğe kümelerini oluştur
item_sets = []
for items in item_lists:
  for i in range(1, len(items)+1):
    item_sets.extend(list(combinations(items, i)))

# Öğe kümelerinin sıklıklarını say
frequencies = {}
for item_set in item_sets:
  if item_set in frequencies:
    frequencies[item_set] += 1
  else:
    frequencies[item_set] = 1

# En sık tekrar eden öğe kümelerini bul
max_frequency = max(frequencies.values())
frequent_itemsets = [item_set for item_set, frequency in frequencies.items() if frequency == max_frequency]

# Sonuçları yazdır
print("En sık tekrar eden öğe kümeleri:")
for itemset in frequent_itemsets:
  print(itemset)


En sık tekrar eden öğe kümeleri:
('MILK',)
('BREAD',)
('BISCUIT',)


In [53]:
baskets = []

for b in range(1, 101):
    basket = []
    for i in range(1, 101):
        if b % i == 0:
            basket.append(i)
            baskets.append(basket)

baskets


[[1],
 [1, 2],
 [1, 2],
 [1, 3],
 [1, 3],
 [1, 2, 4],
 [1, 2, 4],
 [1, 2, 4],
 [1, 5],
 [1, 5],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 2, 3, 6],
 [1, 7],
 [1, 7],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 2, 4, 8],
 [1, 3, 9],
 [1, 3, 9],
 [1, 3, 9],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 2, 5, 10],
 [1, 11],
 [1, 11],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 2, 3, 4, 6, 12],
 [1, 13],
 [1, 13],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 2, 7, 14],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 3, 5, 15],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 2, 4, 8, 16],
 [1, 17],
 [1, 17],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 2, 3, 6, 9, 18],
 [1, 19],
 [1, 19],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2, 4, 5, 10, 20],
 [1, 2,