<a href="https://colab.research.google.com/github/sanimmazhit/market-basket-analysis/blob/main/notebooks/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import os

os.environ['KAGGLE_USERNAME'] = "***"
os.environ['KAGGLE_KEY'] = "***"

!kaggle datasets download -d mohamedbakhet/amazon-books-reviews
!unzip -n amazon-books-reviews.zip

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
amazon-books-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  amazon-books-reviews.zip
  inflating: Books_rating.csv        
  inflating: books_data.csv          


In [25]:
import os

# Output folders: data/ holds the CSV files, images/ holds the saved figures.
for _folder in ("data", "images"):
    os.makedirs(_folder, exist_ok=True)


In [26]:
# Prepare the project folders used for saved outputs and for the GitHub repo
import os

# Every folder gets an empty .gitkeep placeholder so that the folder itself
# is tracked on GitHub.
for _folder in ("data", "images"):
    os.makedirs(_folder, exist_ok=True)
    with open(f"{_folder}/.gitkeep", "w"):
        pass


In [27]:
# Move CSV files into data/ folder
!mkdir -p data
!mv Books_rating.csv data/
!mv books_data.csv data/

In [28]:
import pandas as pd

# Read just the review text; it is the only column used for the word baskets
df = pd.read_csv("data/Books_rating.csv", usecols=["review/text"])

# Discard rows without text, then repeated review texts
df = df.dropna()
df = df.drop_duplicates(subset=["review/text"])

# Count whitespace-separated words and keep reviews of 10 to 100 words
df["review_length"] = df["review/text"].map(lambda text: len(str(text).split()))
df = df[df["review_length"].between(10, 100)]

# Continue with a reproducible 1% random sample (fixed seed)
df = df.sample(frac=0.01, random_state=42).reset_index(drop=True)


## Part 1: Word-based Market Basket Analysis

In [29]:
import re
import string

import nltk

# Fetch the NLTK stop-word corpus before importing the corpus reader
nltk.download("stopwords")
from nltk.corpus import stopwords

# Stored as a set so membership tests in clean_text are constant-time
stop_words = {*stopwords.words("english")}

def clean_text(text, stopword_set=None):
    """Turn one review into a list of lowercase content-word tokens.

    The text is lowercased and split on whitespace. A token is dropped when it
    is a stop word or when it has two characters or fewer once punctuation is
    removed. Surviving tokens are returned with all ASCII punctuation removed.

    Stop words are checked twice: on the token with only its surrounding
    punctuation trimmed, and on the token with all punctuation removed. The
    first check catches contractions such as "don't" when the stop-word list
    contains them (NLTK's English list does): deleting the apostrophe first
    would yield "dont", which no longer matches the stop-word entry.

    Args:
        text: The review text (a ``str``).
        stopword_set: Optional collection of stop words. When omitted, the
            notebook-level ``stop_words`` is used.

    Returns:
        list[str]: The kept tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words

    drop_punct = str.maketrans("", "", string.punctuation)
    tokens = []
    for raw in text.lower().split():
        trimmed = raw.strip(string.punctuation)  # keeps inner apostrophes/hyphens
        bare = raw.translate(drop_punct)         # no punctuation at all
        if trimmed in stopword_set or bare in stopword_set or len(bare) <= 2:
            continue
        tokens.append(bare)
    return tokens

# Tokenize every review; each row receives its own list of cleaned words
df["cleaned_tokens"] = df["review/text"].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
from collections import Counter

# Pool the tokens of all reviews and take the 1000 most frequent as vocabulary
all_words = df["cleaned_tokens"].explode().dropna().tolist()
word_counts = Counter(all_words)
common_words = {word for word, _ in word_counts.most_common(1000)}

# Restrict every basket to vocabulary words
df["cleaned_tokens"] = df["cleaned_tokens"].map(
    lambda tokens: [w for w in tokens if w in common_words]
)

# Discard reviews whose basket ended up empty
has_tokens = df["cleaned_tokens"].map(len) > 0
df = df[has_tokens].reset_index(drop=True)


In [31]:
# One basket (list of words) per review, collected in a plain Python list
word_baskets = list(df["cleaned_tokens"])


In [32]:
!pip install mlxtend
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(word_baskets).transform(word_baskets)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)



In [None]:
from mlxtend.frequent_patterns import apriori

# Mine word itemsets with a support of at least 0.01
frequent_itemsets = apriori(df_encoded, use_colnames=True, min_support=0.01)

# Preview the ten itemsets with the highest support
frequent_itemsets.sort_values("support", ascending=False).iloc[:10]


In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules, keeping those with confidence of at least 0.3
rules = association_rules(frequent_itemsets, min_threshold=0.3, metric="confidence")

# Preview the ten rules with the highest confidence
rules.sort_values("confidence", ascending=False).iloc[:10]

In [None]:
import os

# Ensure images/ exists before figures are saved (no error if already there)
os.makedirs(name="images", exist_ok=True)

In [None]:
import os

import matplotlib.pyplot as plt

# The figure is written to images/, so create the folder if needed
os.makedirs("images", exist_ok=True)

# Record each itemset's size, then take the ten best-supported pairs
frequent_itemsets["length"] = frequent_itemsets["itemsets"].map(len)
is_pair = frequent_itemsets["length"] == 2
top_itemsets = frequent_itemsets[is_pair].sort_values(by="support", ascending=False).head(10)

# Label for each pair: its words in alphabetical order, comma-separated
top_itemsets["itemset_str"] = [", ".join(sorted(items)) for items in top_itemsets["itemsets"]]

# Horizontal bar chart; the y-axis is inverted so the first row is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_itemsets["itemset_str"], top_itemsets["support"], color="skyblue")
ax.set_xlabel("Support")
ax.set_title("Top 10 Frequent Word Pairs")
ax.invert_yaxis()
fig.tight_layout()

# Write the chart to images/ and display it
fig.savefig("images/frequent_word_pairs.png")
plt.show()


In [None]:
# Persist the mined word itemsets and rules as CSV files (row index omitted)
for _table, _path in (
    (frequent_itemsets, "data/frequent_itemsets.csv"),
    (rules, "data/association_rules.csv"),
):
    _table.to_csv(_path, index=False)

## Part 2: Sets of books reviewed by the same user

In [None]:
# List the column names of the ratings file. nrows=0 makes pandas parse only
# the header row, so the review data itself is not loaded into memory just to
# inspect the schema.
rating_columns = pd.read_csv("data/Books_rating.csv", nrows=0).columns.tolist()
print(rating_columns)


In [None]:
# Build one basket per user: the titles that user reviewed
# Only the user and title columns are read
df_users = pd.read_csv("data/Books_rating.csv", usecols=["User_id", "Title"])

# Remove rows with a missing value, then repeated (user, title) rows
df_users = df_users.dropna()
df_users = df_users.drop_duplicates()

# Collect each user's titles into a list
titles_per_user = df_users.groupby("User_id")["Title"].agg(list)

# Keep baskets with at least two books; smaller ones are not useful for rules
user_baskets = [titles for titles in titles_per_user if len(titles) >= 2]


In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets as a SPARSE one-hot matrix. The encoded table has one row
# per user basket and one column per distinct title; a dense boolean array of
# that shape grows with (baskets x titles) and can exhaust memory, whereas the
# sparse form stores only the True entries.
te_user = TransactionEncoder()
te_user_ary = te_user.fit(user_baskets).transform(user_baskets, sparse=True)
df_user_encoded = pd.DataFrame.sparse.from_spmatrix(te_user_ary, columns=te_user.columns_)

# Find frequent book itemsets (apriori accepts sparse DataFrames) and order
# them by descending support
frequent_books = apriori(df_user_encoded, min_support=0.005, use_colnames=True)
frequent_books = frequent_books.sort_values(by="support", ascending=False)
frequent_books.head(10)


In [None]:
# Book rules with confidence of at least 0.3, ordered most confident first
rules_books = association_rules(
    frequent_books, min_threshold=0.3, metric="confidence"
).sort_values("confidence", ascending=False)

# Preview the ten most confident rules
rules_books.iloc[:10]


In [None]:
# Tag every itemset with its size and take the first ten two-book itemsets
frequent_books["length"] = frequent_books["itemsets"].map(len)
is_pair = frequent_books["length"] == 2
top_books = frequent_books[is_pair].head(10)

# Label for each pair: its titles in alphabetical order, comma-separated
top_books["itemset_str"] = [", ".join(sorted(items)) for items in top_books["itemsets"]]

# Horizontal bar chart; the y-axis is inverted so the first row is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_books["itemset_str"], top_books["support"], color="salmon")
ax.set_xlabel("Support")
ax.set_title("Top 10 Frequent Book Pairs Reviewed by Same Users")
ax.invert_yaxis()
fig.tight_layout()

# Write the chart to images/ and display it
fig.savefig("images/frequent_book_pairs.png")
plt.show()


In [None]:
# Persist the mined book itemsets and rules as CSV files (row index omitted)
for _table, _path in (
    (frequent_books, "data/frequent_books.csv"),
    (rules_books, "data/association_rules_books.csv"),
):
    _table.to_csv(_path, index=False)
