In [29]:
import pandas as pd
import numpy as np

# Load the data: read the supplements table from a CSV file. The path is
# relative, so the file must sit in the notebook's working directory.

df = pd.read_csv('Health_Supplements.csv')

## Pre-processing the Data

In [30]:
# (rows, columns) of the table as loaded, before any cleaning
df.shape

(18668, 17)

In [31]:
# Remove every row that has at least one missing value
df = df.dropna()

# Renumber the rows. Without drop=True the previous row labels are kept
# in a new 'index' column.
df = df.reset_index()

# Per-column count of missing values that remain
df.isna().sum()

index                 0
Unnamed: 0            0
Company               0
Product               0
Rating                0
Rating Counts         0
Price                 0
Product Link          0
WebSite               0
Type of supplement    0
Num Quantity          0
Units Values          0
Category              0
Consumer              0
Flavour               0
Amount                0
Per Unit Price        0
Vegetarian            0
dtype: int64

In [32]:
# (rows, columns) after the rows with missing values were removed
df.shape

(18307, 18)

In [33]:
# Drop the two leftover row-number columns: 'Unnamed: 0' (presumably a row
# index saved into the CSV) and 'index' (added by the earlier reset_index()).
#
# Reassigning the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for columns that are already gone.
df = df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')



In [34]:
# Keep only the first row seen for each product name, then renumber the
# remaining rows from 0 without keeping the old labels.
df = (
    df
    .drop_duplicates(subset='Product', keep='first')
    .reset_index(drop=True)
)


In [37]:
# Preview the first five rows of the cleaned, de-duplicated table
df.head()

Unnamed: 0,Company,Product,Rating,Rating Counts,Price,Product Link,WebSite,Type of supplement,Num Quantity,Units Values,Category,Consumer,Flavour,Amount,Per Unit Price,Vegetarian
0,Walgreens,Walgreens Women's Multivitamin Gummies Natural...,4.0,58,13.99,https://walgreens.com/store/c/walgreens-women'...,Walgreens,Multivitamins,150,Tablets,whole_food_multivitamins,general,berry,150,0.09,general
1,Walgreens,Walgreens Adult Multivitamin Tablets,4.5,49,14.99,https://walgreens.com/store/c/walgreens-adult-...,Walgreens,Multivitamins,200,Tablets,age_specific_multivitamins,general,general,200,0.07,general
2,Walgreens,"Walgreens Men 50+ Multivitamin Tablets, Gluten...",4.5,84,15.99,https://walgreens.com/store/c/walgreens-men-50...,Walgreens,Multivitamins,200,Tablets,general,Men,general,200,0.08,general
3,Nature Made,Nature Made Multivitamin For Her Tablets,4.6,143,13.99,https://walgreens.com/store/c/nature-made-mult...,Walgreens,Multivitamins,90,Tablets,general,general,general,90,0.16,general
4,Walgreens,Walgreens Women's 50+ Multivitamin Tablets,4.5,21,15.99,https://walgreens.com/store/c/walgreens-women'...,Walgreens,Multivitamins,200,Tablets,general,general,general,200,0.08,general


In [22]:
# Distinct values found in the 'Type of supplement' column
df["Type of supplement"].unique()

array(['Multivitamins', 'Omega 3', 'Probiotic', 'Whey Protein',
       'Creatine'], dtype=object)

## Feature Engineering

In [23]:
# Columns whose values describe a product for the recommendation engine
feature_vector = ["Type of supplement", "Consumer", "Flavour", "Category", "Amount", "Vegetarian"]

# Build one space-separated text "document" per product from those columns.
# The explicit cast to str matters: str.join raises TypeError on non-string
# items, so the cell would break if pandas parsed any feature column
# (e.g. "Amount") as numeric.
df['combined_features'] = df[feature_vector].astype(str).apply(' '.join, axis=1)


df["combined_features"]

0       Multivitamins general berry whole_food_multivi...
1       Multivitamins general general age_specific_mul...
2           Multivitamins Men general general 200 general
3        Multivitamins general general general 90 general
4       Multivitamins general general general 200 general
                              ...                        
4633    Whey Protein general   creamy vanilla general ...
4634    Whey Protein general general general 80 Vegeta...
4635    Whey Protein general   unsweetened general 10....
4636    Whey Protein general   simply natural original...
4637    Whey Protein general   vanilla creme general 3...
Name: combined_features, Length: 4638, dtype: object

## Vectorization

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Turn each product's combined feature text into a TF-IDF weighted vector
tfid_vec = TfidfVectorizer()
tfid_matrix = tfid_vec.fit_transform(df['combined_features'])

# Similarity calculation: pairwise dot products between all product vectors.
# Called with a single argument, linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(tfid_matrix)


## Recommendation Engine

In [26]:
# Recommendation generation
def get_recommendations(product_index, consumer_filter=None, top_n=5,
                        similarity=None, catalog=None):
    """Return the products most similar to the one at ``product_index``.

    Parameters
    ----------
    product_index : int
        Row position of the query product. Negative values count from the
        end, as with normal Python indexing.
    consumer_filter : str, optional
        When given (and truthy), only products whose 'Consumer' value equals
        it are eligible.
    top_n : int, default 5
        Maximum number of recommendations to return.
    similarity : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of product
        ``i`` against every product. Defaults to the notebook-level
        ``cosine_sim``.
    catalog : pandas.DataFrame, optional
        Product table whose row positions match ``similarity``. Defaults to
        the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``catalog``, most similar first. Equal scores
        keep their original row order.

    Raises
    ------
    IndexError
        If ``product_index`` is outside the similarity matrix.
    """
    # Fall back to the objects built earlier in the notebook.
    if similarity is None:
        similarity = cosine_sim
    if catalog is None:
        catalog = df

    n_products = len(similarity)
    if product_index < 0:
        product_index += n_products
    if not 0 <= product_index < n_products:
        raise IndexError(
            f"product_index out of range for {n_products} products"
        )

    # Exclude the query product by position instead of assuming it sorts
    # first: rows with identical features tie on score, and a tied product
    # at a lower row position would otherwise push the query product into
    # its own recommendations.
    ranked = sorted(
        (pair for pair in enumerate(similarity[product_index])
         if pair[0] != product_index),
        key=lambda pair: pair[1],
        reverse=True,
    )
    candidates = catalog.iloc[[position for position, _ in ranked]]

    # Filter before truncating so the filter cannot shrink the result
    # below top_n while further matching products exist.
    if consumer_filter:
        candidates = candidates[candidates['Consumer'] == consumer_filter]

    return candidates.head(top_n)

In [27]:

# (rows, columns) of the table the recommender works on
df.shape

(4638, 18)

In [38]:
# Example: recommendations for the product at row position 1 of df
get_recommendations(1)

Unnamed: 0,Company,Product,Rating,Rating Counts,Price,Product Link,WebSite,Type of supplement,Num Quantity,Units Values,Category,Consumer,Flavour,Amount,Per Unit Price,Vegetarian
77,Centrum,Centrum Adult Multivitamin & Multimineral Supp...,4.5,250,22.99,https://walgreens.com/store/c/centrum-adult-mu...,Walgreens,Multivitamins,200,Tablets,age_specific_multivitamins,general,general,200,0.11,general
4,Walgreens,Walgreens Women's 50+ Multivitamin Tablets,4.5,21,15.99,https://walgreens.com/store/c/walgreens-women'...,Walgreens,Multivitamins,200,Tablets,general,general,general,200,0.08,general
13,Walgreens,Walgreens Men's Multivitamin Tablets,4.0,54,15.99,https://walgreens.com/store/c/walgreens-men's-...,Walgreens,Multivitamins,200,Tablets,general,general,general,200,0.08,general
99,Walgreens,Walgreens Iron-Free Multivitamin Tablets,4.5,5,12.99,https://walgreens.com/store/c/walgreens-iron-f...,Walgreens,Multivitamins,200,Tablets,general,general,general,200,0.06,general
1129,One-A-Day,Men's Complete Multivitamin,4.7,2301,22.64,https://www.iherb.com/pr/one-a-day-men-s-compl...,iHerb,Multivitamins,200,Tablets,general,general,general,200,0.1132,general
