In [None]:
import pandas as pd
import numpy as np
import random
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split

In [20]:
#1.
faceplate = pd.read_csv('Faceplate.csv')
faceplate.columns = faceplate.columns.str.strip()
if "Transaction" in faceplate.columns:
    faceplate = faceplate.drop(columns=["Transaction"])
faceplate = faceplate.apply(pd.to_numeric, errors="coerce").fillna(0)
faceplate = (faceplate > 0).astype(int)
print("First 10 Transactions:")
print(faceplate.head(10))
support_count = ((faceplate["Red"] == 1) & (faceplate["White"] == 1)).sum()
total_transactions = len(faceplate)
support = support_count / total_transactions

print("\nSupport of {Red, White}:")
print("Support Count:", support_count)
print("Support Value:", support)

First 10 Transactions:
   Red  White  Blue  Orange  Green  Yellow
0    1      1     0       0      1       0
1    0      1     0       1      0       0
2    0      1     1       0      0       0
3    1      1     0       1      0       0
4    1      0     1       0      0       0
5    0      1     1       0      0       0
6    1      0     1       0      0       0
7    1      1     1       0      1       0
8    1      1     1       0      0       0
9    0      0     0       0      0       1

Support of {Red, White}:
Support Count: 4
Support Value: 0.4


In [22]:
#2.
data = pd.read_csv("Faceplate.csv")
data.columns = data.columns.str.strip()
if "Transaction" in data.columns:
    data = data.drop(columns=["Transaction"])
data = data.apply(pd.to_numeric, errors="coerce").fillna(0)
data = (data > 0).astype(int)
basket = data.astype(bool)
print("First five Transactions:")
print(basket.head())

# 2.1 
frequent_itemsets = apriori(basket, min_support=0.2, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False).reset_index(drop=True)

print("\nFrequent Itemsets (Support >= 0.2):")
print(frequent_itemsets)

# 2.2 
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules_sorted = rules.sort_values(by="lift", ascending=False).reset_index(drop=True)

print("\nAssociation Rules (Confidence >= 0.5), Sorted by Lift:")
print(rules_sorted)

# 2.3
top6 = rules_sorted.head(6).copy()
top6 = top6.drop(columns=[c for c in ["antecedent support", "consequent support", "conviction"] if c in top6.columns])
top6 = top6[["antecedents", "consequents", "support", "confidence", "lift", "leverage"]]
top6_rules = top6
print("\nTop 6 Rules by Lift:")
print(top6_rules)
# 2.4
best_rule = rules_sorted.iloc[0]
ante = ", ".join(sorted(list(best_rule["antecedents"])))
cons = ", ".join(sorted(list(best_rule["consequents"])))
conf_pct = best_rule["confidence"] * 100
lift_val = best_rule["lift"]
sentence = (
    f"If [{ante}] are purchased, then with confidence {conf_pct:.1f}% "
    f"[{cons}] will also be purchased. This rule has a lift ratio of {lift_val:.3f}."
)

print("\nInterpretation of Highest Lift Rule:")
print(sentence)
rule_sorted = rules_sorted
support_value = None  

First five Transactions:
     Red  White   Blue  Orange  Green  Yellow
0   True   True  False   False   True   False
1  False   True  False    True  False   False
2  False   True   True   False  False   False
3   True   True  False    True  False   False
4   True  False   True   False  False   False

Frequent Itemsets (Support >= 0.2):
    support             itemsets
0       0.7              (White)
1       0.6                (Red)
2       0.6               (Blue)
3       0.4          (Red, Blue)
4       0.4         (Red, White)
5       0.4        (Blue, White)
6       0.2             (Orange)
7       0.2              (Green)
8       0.2         (Red, Green)
9       0.2      (Orange, White)
10      0.2       (Green, White)
11      0.2   (Red, Blue, White)
12      0.2  (Red, Green, White)

Association Rules (Confidence >= 0.5), Sorted by Lift:
       antecedents   consequents  antecedent support  consequent support  \
0     (Red, White)       (Green)                 0.4                

In [23]:
#3.
# 3.1 
df = pd.read_csv("CharlesBookClub.csv")
df.columns = df.columns.str.strip()
drop_targets = {"seq#", "id#", "gender", "m", "r", "f", "firstpurch", "related purchase"}
cols_to_drop = []
for c in df.columns:
    c_norm = c.strip().lower()
    if c_norm in drop_targets or "code" in c_norm:
        cols_to_drop.append(c)

df_books = df.drop(columns=cols_to_drop, errors="ignore")
df_books = df_books.select_dtypes(include="number")
binary_matrix = (df_books > 0).astype(int)

print("First 10 rows of Binary Incidence Matrix:")
print(binary_matrix.head(10))

# 3.2 
min_support_value = 200 / len(binary_matrix)

frequent_itemsets_books = apriori(binary_matrix.astype(bool),
                            min_support=min_support_value,
                            use_colnames=True)

print("\nNumber of Frequent Itemsets Found:")
print(len(frequent_itemsets))

# 3.3 
rules_books = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

top25_books = rules_books.sort_values(by="lift", ascending=False).head(25).copy()
top25_books = top25_books[["antecedents", "consequents", "support", "confidence", "lift", "leverage"]]

print("\nTop 25 Rules by Lift:")
print(top25_books)

First 10 rows of Binary Incidence Matrix:
   ChildBks  YouthBks  CookBks  DoItYBks  RefBks  ArtBks  GeogBks  ItalCook  \
0         0         1        1         0       0       0        0         0   
1         0         0        0         0       0       0        0         0   
2         1         1        1         0       1       0        1         1   
3         0         0        0         0       0       0        0         0   
4         0         0        0         0       0       0        0         0   
5         0         0        0         0       0       0        0         0   
6         0         0        0         0       0       0        1         0   
7         1         0        0         0       0       0        0         0   
8         0         0        0         0       0       0        0         0   
9         0         0        1         0       0       0        0         0   

   ItalAtlas  ItalArt  Florence  Yes_Florence  No_Florence  
0          0        0      

In [17]:

# QUESTION 4 
#4.1
rule_highest_support = rules_sorted.sort_values(
    by="support",
    ascending=False
).iloc[0]

print("4.1 Rule with Highest Support:\n")
print("Antecedents:", ', '.join(list(rule_highest_support['antecedents'])))
print("Consequents:", ', '.join(list(rule_highest_support['consequents'])))
print("Support:", rule_highest_support['support'])
print("Confidence:", rule_highest_support['confidence'])
print("Lift:", rule_highest_support['lift'])



# 4.2 
rule_highest_lift = rules_sorted.iloc[0]

print("\n4.2 Rule with Highest Lift:\n")
print("Antecedents:", ', '.join(list(rule_highest_lift['antecedents'])))
print("Consequents:", ', '.join(list(rule_highest_lift['consequents'])))
print("Support:", rule_highest_lift['support'])
print("Confidence:", rule_highest_lift['confidence'])
print("Lift:", rule_highest_lift['lift'])

print("\nComparison:")
print("Support (Highest Lift Rule):", rule_highest_lift['support'])
print("Support (Highest Support Rule):", rule_highest_support['support'])


# 4.3 
top10_lift = rules_sorted.head(10)
lowest_conf_rule = top10_lift.sort_values(by="confidence").iloc[0]

print("\n4.3 Rule with Lowest Confidence Among Top 10 Lift Rules:\n")
print("Antecedents:", ', '.join(list(lowest_conf_rule['antecedents'])))
print("Consequents:", ', '.join(list(lowest_conf_rule['consequents'])))
print("Support:", lowest_conf_rule['support'])
print("Confidence:", lowest_conf_rule['confidence'])
print("Lift:", lowest_conf_rule['lift'])

4.1 Rule with Highest Support:

Antecedents: fcode
Consequents: mcode, rcode
Support: 1.0
Confidence: 1.0
Lift: 1.0

4.2 Rule with Highest Lift:

Antecedents: yes_florence
Consequents: florence
Support: 0.0845
Confidence: 1.0
Lift: 11.834319526627219

Comparison:
Support (Highest Lift Rule): 0.0845
Support (Highest Support Rule): 1.0

4.3 Rule with Lowest Confidence Among Top 10 Lift Rules:

Antecedents: yes_florence
Consequents: florence
Support: 0.0845
Confidence: 1.0
Lift: 11.834319526627219


In [None]:


#5.1
random.seed(0)

n_transactions = 50
items = [f"item{i}" for i in range(1, 10)]  
data = []
for _ in range(n_transactions):
    row = [1 if random.random() < 0.5 else 0 for _ in items]
    data.append(row)

basket = pd.DataFrame(data, columns=items).astype(bool)

print("5.1 Binary incidence matrix (first 10 rows):")
display(basket.head(10))


# 5.2 

min_support = 2 / n_transactions   
min_conf = 0.7

freq_itemsets = apriori(basket, min_support=min_support, use_colnames=True)
freq_itemsets = freq_itemsets.sort_values("support", ascending=False).reset_index(drop=True)

print(f"\n5.2 Frequent itemsets found (min_support={min_support:.2f}): {len(freq_itemsets)}")
display(freq_itemsets)

rules = association_rules(freq_itemsets, metric="confidence", min_threshold=min_conf)

if rules.empty:
    print(f"\nNo rules met confidence >= {min_conf}. Try lowering confidence or increasing random density.")
else:

    # 5.3 

    rules_sorted = rules.sort_values(by="lift", ascending=False).reset_index(drop=True)

    top6 = rules_sorted.head(6).copy()
    top6 = top6[['antecedents','consequents','support','confidence','lift']]

    top6['antecedents'] = top6['antecedents'].apply(lambda s: ', '.join(sorted(list(s))))
    top6['consequents'] = top6['consequents'].apply(lambda s: ', '.join(sorted(list(s))))

    print("\n5.3 Top 6 rules by uplift (lift):")
    display(top6)

    max_lift = rules_sorted['lift'].max()
    max_lift_support = rules_sorted.loc[rules_sorted['lift'].idxmax(), 'support']

    print(f"\nHighest uplift (lift) observed: {max_lift:.3f}")
    print(f"Support of that highest-lift rule: {max_lift_support:.3f}")

    if max_lift >= 2.0:
        print("\nConclusion: Yes evn with random data, sometimes you can  see very high uplift (lift) rules.")
        print("This often happens when support is low: afew coincidences can inflate lift.")
    else:
        print("\nConclusion: In this run, uplift values are not extremely high.")
        print("Random data typically produces some rules, but very high lift is less common unless low-support coincidences occur.")

5.1 Binary incidence matrix (first 10 rows):


Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,item8,item9
0,False,False,True,True,False,True,False,True,True
1,False,False,False,True,False,False,True,False,False
2,False,False,True,False,False,False,True,True,True
3,False,False,False,True,False,True,False,False,True
4,False,True,False,False,True,True,False,True,True
5,False,True,False,True,False,False,True,True,True
6,False,False,True,False,False,False,False,False,False
7,False,False,True,False,True,False,True,True,True
8,False,False,True,True,False,False,False,False,False
9,False,False,True,False,True,False,False,False,False



5.2 Frequent itemsets found (min_support=0.04): 297


Unnamed: 0,support,itemsets
0,0.58,(item8)
1,0.56,(item4)
2,0.50,(item3)
3,0.48,(item9)
4,0.48,(item6)
...,...,...
292,0.04,"(item6, item3, item4, item5, item1, item8)"
293,0.04,"(item6, item3, item4, item5, item9, item1)"
294,0.04,"(item3, item4, item5, item9, item1, item8)"
295,0.04,"(item6, item4, item5, item9, item1, item8)"



5.3 Top 6 rules by uplift (lift):


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"item1, item2, item8, item9","item4, item5",0.04,1.0,4.545455
1,"item2, item4, item5, item9","item1, item8",0.04,1.0,4.545455
2,"item1, item3, item8, item9","item4, item5",0.04,1.0,4.545455
3,"item3, item5, item6, item9","item1, item4",0.04,1.0,4.166667
4,"item3, item5, item6, item8","item1, item4",0.04,1.0,4.166667
5,"item1, item3, item7, item8","item2, item4",0.04,1.0,3.846154



Highest uplift (lift) observed: 4.545
Support of that highest-lift rule: 0.040

Conclusion: Yes — even with random data, you can sometimes see very high uplift (lift) rules.
This often happens when support is low: a few coincidences can inflate lift.


In [None]:

# Q6: 
# 6.1 Create synthetic ratings
np.random.seed(0)

n_ratings = 5000
n_users = 1000  
n_items = 100    

df = pd.DataFrame({
    "itemID": np.random.randint(0, n_items, size=n_ratings),
    "userID": np.random.randint(0, n_users, size=n_ratings),
    "rating": np.random.randint(1, 6, size=n_ratings)  
})

print("6.1 First 10 rows of the synthetic dataset:")
display(df.head(10))
# 6.2 
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=0)

print("\n6.2 Dimensions:")
print("Trainset -> users:", trainset.n_users, "items:", trainset.n_items, "ratings:", trainset.n_ratings)
print("Testset  -> number of rows:", len(testset))

# 6.3 

# (A) 
sim_user = {"name": "cosine", "user_based": True}
user_model = KNNWithMeans(sim_options=sim_user)
user_model.fit(trainset)

# (B) 
sim_item = {"name": "cosine", "user_based": False}
item_model = KNNWithMeans(sim_options=sim_item)
item_model.fit(trainset)

print("\n6.3 Models fitted:")
print("- User-based cosine similarity model fitted (optional check).")
print("- Item-based cosine similarity model fitted (main model).")

# 6.4

anti_testset = trainset.build_anti_testset()
predictions = item_model.test(anti_testset)  
from collections import defaultdict

N = 5
top_n = defaultdict(list)

for uid, iid, true_r, est, details in predictions:
    top_n[uid].append((iid, est))

for uid in top_n:
    top_n[uid].sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = top_n[uid][:N]
print(f"\n6.4 Top-{N} recommended items for the first 10 users (userID 0..9):\n")
for uid in range(10):
    recs = top_n.get(str(uid), [])
    print(f"User {uid}: " + ", ".join([f"item{iid} (pred={est:.2f})" for iid, est in recs]))

6.1 First 10 rows of the synthetic dataset:


Unnamed: 0,itemID,userID,rating
0,44,187,3
1,47,507,3
2,64,493,2
3,67,183,1
4,67,893,3
5,9,673,4
6,83,267,3
7,21,639,1
8,36,987,2
9,87,802,1


NameError: name 'Reader' is not defined