In [1]:
!wget 'https://media.githubusercontent.com/media/pradeep-016/ML_Course/refs/heads/main/7)%20Recommendation%20Systems/Content-Based%20Filtering/amazon_product.csv'

--2024-12-07 08:14:01--  https://media.githubusercontent.com/media/pradeep-016/ML_Course/refs/heads/main/7)%20Recommendation%20Systems/Content-Based%20Filtering/amazon_product.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22264 (22K) [text/plain]
Saving to: ‘amazon_product.csv’


2024-12-07 08:14:01 (2.98 MB/s) - ‘amazon_product.csv’ saved [22264/22264]



In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [3]:
data = pd.read_csv('amazon_product.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,asin,product_title,product_price,product_original_price,currency,product_star_rating,product_num_ratings,product_url,product_photo,...,is_best_seller,is_amazon_choice,is_prime,climate_pledge_friendly,sales_volume,delivery,has_variations,product_availability,unit_price,unit_count
0,0,B0BQ118F2T,Moto G Play 2023 3-Day Battery Unlocked Made f...,$99.99,$169.99,USD,4.0,2929,https://www.amazon.com/dp/B0BQ118F2T,https://m.media-amazon.com/images/I/61K1Fz5Lxv...,...,False,False,True,False,6K+ bought in past month,"FREE delivery Tue, Aug 6",True,,,
1,1,B0CTD47P22,"SAMSUNG Galaxy A15 5G (SM-156M/DSN), 128GB 6GB...",$149.74,$158.00,USD,4.2,135,https://www.amazon.com/dp/B0CTD47P22,https://m.media-amazon.com/images/I/51QhB2CfqS...,...,False,False,True,False,3K+ bought in past month,"FREE delivery Wed, Aug 7 Only 7 left in stock ...",False,Only 7 left in stock - order soon.,,
2,2,B0CHH6X6H2,Total by Verizon | Samsung Galaxy A03s | Locke...,$49.88,,USD,3.9,205,https://www.amazon.com/dp/B0CHH6X6H2,https://m.media-amazon.com/images/I/812woqv69C...,...,False,False,True,False,2K+ bought in past month,"FREE delivery Tue, Aug 6",False,,,
3,3,B0BZ9XNBRB,Google Pixel 7a - Unlocked Android Cell Phone ...,$335.00,$499.00,USD,4.3,2248,https://www.amazon.com/dp/B0BZ9XNBRB,https://m.media-amazon.com/images/I/61r7cCpQPl...,...,False,False,False,False,10K+ bought in past month,FREE delivery Aug 6 - 8,True,,,
4,4,B0CN1QSH8Q,"SAMSUNG Galaxy A15 5G A Series Cell Phone, 128...",$199.99,,USD,4.1,423,https://www.amazon.com/dp/B0CN1QSH8Q,https://m.media-amazon.com/images/I/61s0ZzwzSC...,...,False,False,True,True,3K+ bought in past month,"FREE delivery Tue, Aug 6",True,,,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   64 non-null     int64  
 1   asin                         64 non-null     object 
 2   product_title                64 non-null     object 
 3   product_price                64 non-null     object 
 4   product_original_price       27 non-null     object 
 5   currency                     64 non-null     object 
 6   product_star_rating          54 non-null     float64
 7   product_num_ratings          64 non-null     int64  
 8   product_url                  64 non-null     object 
 9   product_photo                64 non-null     object 
 10  product_num_offers           64 non-null     int64  
 11  product_minimum_offer_price  64 non-null     object 
 12  is_best_seller               64 non-null     bool   
 13  is_amazon_choice      

In [5]:
column_todrop = ['Unnamed: 0','product_availability','unit_price','unit_count']
data = data.drop(columns = column_todrop)

In [6]:
category_mapping = {
    'samsung': 'electronics',
    'apple': 'electronics',
    'moto': 'electronics',
    'google': 'electronics',
    'amazon': 'books',
    'teacher': 'education',
    'xbox': 'gaming',
    'tracfone': 'electronics',
    'motorola': 'electronics',
    'hp': 'electronics',
    'vtech': 'electronics',
    'netflix': 'media',
    'disney': 'media',
    'thanksgiving': 'seasonal',
    'christmas': 'seasonal',
    'daybetter': 'lighting',
    'bounty': 'household',
}

In [7]:
def assign_category(title):
    for keyword, category in category_mapping.items():
        if keyword in title.lower():
            return category
    return 'others'

In [8]:
data['category'] = data['product_title'].apply(assign_category)

In [9]:
data.isna().sum()

Unnamed: 0,0
asin,0
product_title,0
product_price,0
product_original_price,37
currency,0
product_star_rating,10
product_num_ratings,0
product_url,0
product_photo,0
product_num_offers,0


In [10]:
def clean_price(price):
  if isinstance(price,str):
    return float( price.replace('$','').replace(',','').strip())
  return price

In [11]:
data['price'] = data['product_price'].copy()

In [12]:
data['product_price'] = data['product_price'].apply(clean_price)
data['product_original_price'] = data['product_original_price'].apply(clean_price)
data['product_minimum_offer_price'] = data['product_minimum_offer_price'].apply(clean_price)
data['price'] = data['product_price'].apply(clean_price)

In [13]:
median_rating = data['product_star_rating'].dropna().median() if data['product_star_rating'].notna().any() else 0

data['product_star_rating'] = data['product_star_rating'].fillna(median_rating)
data['sales_volume'] = data['sales_volume'].fillna("Unknown")
data['delivery'] = data['delivery'].fillna("Unknown")
data['product_original_price'] = data.groupby('category')['product_original_price'].transform(
    lambda x: x.fillna(x.median())
)

In [14]:
bool_cols = ['is_best_seller', 'is_amazon_choice', 'is_prime']

data[bool_cols] = data[bool_cols].astype(int)

In [15]:
scaler = MinMaxScaler()

numerical_features = ['product_price', 'product_original_price', 'product_minimum_offer_price', 'product_star_rating','product_num_ratings','product_num_offers']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [16]:
data['combined_features'] = data.apply(lambda x: f"{x['product_title']} {x['sales_volume']} {x['delivery']}"
                                       f"BestSeller:{x['is_best_seller']} Prime:{x['is_prime']}"
                                       f"StarRating:{x['product_star_rating']} Price:{x['product_price']}",
                                       axis = 1)

In [17]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

In [18]:
cosine = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
def recommendations(product_index, cosine_sim = cosine, top_n = 5):
  sim_scores = list(enumerate(cosine_sim[product_index]))
  sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
  top_indices = [i[0] for i in sim_scores[1:top_n+1]]
  recommendations = data.iloc[top_indices][['asin','product_title','price']]
  return recommendations

In [20]:
rec = recommendations(0)
print(rec)

          asin                                      product_title   price
9   B0CP6DDN1H  Moto G Play | 2024 | Unlocked | Made for US 4/...  129.99
12  B0D5T7318W  Bold K10 | 2024 | 3-Day Battery | Unlocked | 6...   99.99
14  B0BYGXFJ8K  Motorola Moto G Power 5G | 2023 | Unlocked | M...  199.99
7   B0CHH1N9VY  TracFone | Motorola Moto g Pure | Locked | 32G...   49.39
2   B0CHH6X6H2  Total by Verizon | Samsung Galaxy A03s | Locke...   49.88
