<a href="https://colab.research.google.com/github/sanimmazhit/market-basket-analysis/blob/main/notebooks/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import os
import pandas as pd

# Kaggle credentials (use your own for local testing)
os.environ['KAGGLE_USERNAME'] = "xxxxxx"
os.environ['KAGGLE_KEY'] = "xxxxxx"

# Download dataset from Kaggle
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

# Unzip into 'data' folder
!unzip -o amazon-books-reviews.zip -d data/


Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 98% 1.04G/1.06G [00:05<00:00, 147MB/s]
100% 1.06G/1.06G [00:05<00:00, 198MB/s]
Archive:  amazon-books-reviews.zip
  inflating: data/Books_rating.csv   
  inflating: data/books_data.csv     


In [2]:
# Read the full ratings table extracted from the Kaggle archive
df = pd.read_csv('data/Books_rating.csv')

# Switch: keep only a reproducible 5% random sample while testing,
# so the later steps do not crash Colab
USE_SUBSAMPLE = True

df = df.sample(frac=0.05, random_state=42) if USE_SUBSAMPLE else df

# Show the first rows as a sanity check
df.head()


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
2945667,B0006CR6U4,"A dictionary of the Targumim, the Talmud Babli...",,A303XPDO694V6X,Ariel,2/6,4.0,1122163200,Jastrow,Jastrow made a great workthis dictionary can h...
2352586,0897166159,Espresso Coffee: Professional Techniques,,A3780H4TM9RMB8,David barnes,0/1,2.0,1356912000,NOT the book,Extremely disappointed by the SHORT length and...
1531260,0736693408,The First King of Shannara (The Sword of Shann...,,A1AX6VPDQQZDPV,M Carlton,4/4,5.0,1105574400,Great (what do you expect?),"This, like all of Brook's Shannara series book..."
941910,0395051029,Wuthering Heights (Riverside editions),,A35RQKCCCQ62O0,LadyJ,0/0,4.0,1353888000,Satisfied,I enjoyed this classic. I didn't know the stor...
2582125,4770016050,"A Cat, a Man, and Two Women (Japans Modern Wri...",,A2IJQDE1I4SIJT,"David C. Arnold ""master D""",1/2,5.0,1167955200,"Ordered 09/02/2006, still on backorder",I would love to read this book. Have accepted ...


In [3]:
import nltk
import string

# Fetch the NLTK stopword corpus (only needed once)
nltk.download('stopwords')
from nltk.corpus import stopwords

# Lookup sets used by the cleaning step:
# English stopwords and the punctuation characters to trim from tokens
stop_words = {*stopwords.words('english')}
punct = {*string.punctuation}

# Basic text cleaning function
def clean_review(text):
    """Turn one raw review into a list of lowercase word tokens.

    The text is lowercased and split on whitespace. Each piece then has its
    leading and trailing punctuation stripped, and pieces that are stopwords
    or have two characters or fewer are dropped.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Anything ``pd.isnull`` reports as missing yields
        an empty list.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates are kept).

    Notes
    -----
    Punctuation is removed only from the ends of each whitespace-separated
    piece. Words glued together by punctuation without a space
    (e.g. "mind.king") and inner apostrophes or hyphens ("brook's",
    "well-known") therefore survive as single tokens.
    """
    if pd.isnull(text):
        return []
    # Build the strip set once per call instead of once per token.
    strip_chars = "".join(punct)
    stripped = (piece.strip(strip_chars) for piece in text.lower().split())
    # The stopword/length filter runs on the stripped form, in the same pass.
    return [word for word in stripped if word not in stop_words and len(word) > 2]

# Tokenize every review: run clean_review over 'review/text' element by element
# and store the resulting token lists in a new 'cleaned_tokens' column
df['cleaned_tokens'] = df['review/text'].map(clean_review)

# Show raw text next to its tokens for the first few rows
df[['review/text', 'cleaned_tokens']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,review/text,cleaned_tokens
2945667,Jastrow made a great workthis dictionary can h...,"[jastrow, made, great, workthis, dictionary, h..."
2352586,Extremely disappointed by the SHORT length and...,"[extremely, disappointed, short, length, curso..."
1531260,"This, like all of Brook's Shannara series book...","[like, brook's, shannara, series, books, wonde..."
941910,I enjoyed this classic. I didn't know the stor...,"[enjoyed, classic, know, story, prior, reading..."
2582125,I would love to read this book. Have accepted ...,"[would, love, read, book, accepted, slip, deli..."


In [4]:
# One basket per review: take each token list from 'cleaned_tokens',
# skipping reviews that ended up with no tokens at all
word_baskets = [tokens for tokens in df['cleaned_tokens'] if tokens]

# Show the first five baskets
word_baskets[:5]


[['jastrow',
  'made',
  'great',
  'workthis',
  'dictionary',
  'help',
  'learn',
  'aramaic',
  'textbut',
  'case',
  'help',
  'lot',
  'learn',
  'talmud.it',
  'difficult',
  'thread',
  "one's",
  'way',
  'massive',
  'pages',
  'unless',
  'assisted',
  'helpful',
  'experts',
  'contrive',
  'systematic',
  'directives',
  'make',
  'handles',
  'torah',
  'meet',
  'varied',
  'approaches',
  'seek',
  'know',
  'wisdom',
  'doctrine',
  'laws',
  'poetry',
  'folklore',
  'even',
  'apparent',
  'trivia',
  'well-known',
  'midrash',
  'comes',
  'mind.king',
  'solomon',
  'applied',
  'wisdom',
  'prudence',
  'help',
  'students',
  'find',
  'way',
  'intricacies',
  'torah.he',
  'like',
  'clever',
  'man',
  'parable',
  'large',
  'palace',
  'many',
  'doors',
  'man',
  'would',
  'enter',
  'become',
  'confused',
  'find',
  'door',
  'entered.the',
  'clever',
  'man',
  'took',
  'clew',
  'rope',
  'suspended',
  'door',
  'entry',
  'could',
  'serve',
  '

In [1]:
!pip install mlxtend

from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets
te = TransactionEncoder()
te_ary = te.fit(word_baskets).transform(word_baskets)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Preview encoded data
df_encoded.head()



NameError: name 'word_baskets' is not defined

In [None]:
from mlxtend.frequent_patterns import apriori

# Mine frequent word itemsets from the encoded baskets
# (min_support=0.01 is the minimum support threshold; itemsets are reported
# with column names instead of column indices)
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.01,
    use_colnames=True,
)

# Show the ten itemsets with the highest support
frequent_itemsets.sort_values("support", ascending=False).iloc[:10]
