# Technical Assessment for a Full Stack AI/ML Engineer Role
## Task
*Develop a mini AI-based chatbot system that recommends products based on user queries.
The chatbot should be able to understand the user's text input, process it, and recommend a
list of products.*

# 1. Data Preparation:
*a. Given a mock dataset of products in a CSV format with the following fields:*
**i. Product ID
ii. Product Name
iii. Description
iv. Category
v. Price**
*b. Preprocess the dataset:*
**i. Handle missing values
ii. Tokenization of product descriptions**

In [301]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

In [302]:
df = pd.read_csv('mini-product-recommender-dataset.csv')
df.head()

Unnamed: 0,Product ID,Product Name,Description,Category,Price
0,1,Smartphone A,"Sleek design, 64GB storage, 12MP camera",Electronics,699.99
1,2,Laptop B,"15.6 inch, 8GB RAM, 256GB SSD",Electronics,999.99
2,3,Casual Shoes C,"Leather, size 10, brown",Fashion,79.99
3,4,Travel Mug D,"Stainless steel, 500ml",Home & Kitchen,14.99
4,5,Eau De Parfum E,"Floral scent, 100ml",Beauty,49.99


In [303]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Product ID    40 non-null     int64  
 1   Product Name  38 non-null     object 
 2   Description   36 non-null     object 
 3   Category      37 non-null     object 
 4   Price         36 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.7+ KB


In [304]:
df.isnull().sum()

Product ID      0
Product Name    2
Description     4
Category        3
Price           4
dtype: int64

In [305]:
df['Category'].value_counts()

Category
Electronics               8
Fashion                   7
Home & Kitchen            6
Beauty                    3
Books                     2
Grocery                   2
Sports                    2
Toys & Games              2
Movies & TV               1
Musical Instruments       1
Arts & Crafts             1
Health & Personal Care    1
Pets                      1
Name: count, dtype: int64

In [306]:
df['Category'].fillna('Others', inplace=True)

In [307]:
df.isnull().sum()

Product ID      0
Product Name    2
Description     4
Category        0
Price           4
dtype: int64

In [308]:
# drop rows with missing values
df.dropna(inplace=True)

In [309]:
df.isnull().sum()

Product ID      0
Product Name    0
Description     0
Category        0
Price           0
dtype: int64

In [310]:
df.isnull().sum()

Product ID      0
Product Name    0
Description     0
Category        0
Price           0
dtype: int64

In [311]:
# tokenize the product descriptions so that the model can understand the text based on user query
df['Tokenized_Description'] = df['Description'].apply(word_tokenize)
# convert all words to lowercase
df['Tokenized_Description'] = df['Tokenized_Description'].apply(lambda x: [word.lower() for word in x])
# remove , from the tokenized descriptions
df['Tokenized_Description'] = df['Tokenized_Description'].apply(lambda x: [word for word in x if word != ','])
df.head()


Unnamed: 0,Product ID,Product Name,Description,Category,Price,Tokenized_Description
0,1,Smartphone A,"Sleek design, 64GB storage, 12MP camera",Electronics,699.99,"[sleek, design, 64gb, storage, 12mp, camera]"
1,2,Laptop B,"15.6 inch, 8GB RAM, 256GB SSD",Electronics,999.99,"[15.6, inch, 8gb, ram, 256gb, ssd]"
2,3,Casual Shoes C,"Leather, size 10, brown",Fashion,79.99,"[leather, size, 10, brown]"
3,4,Travel Mug D,"Stainless steel, 500ml",Home & Kitchen,14.99,"[stainless, steel, 500ml]"
4,5,Eau De Parfum E,"Floral scent, 100ml",Beauty,49.99,"[floral, scent, 100ml]"


In [312]:
# save the processed data to a csv file and first row is header
df.to_csv('processed_data.csv', index=False, header=True)


In [313]:
# read the processed data
df_new = pd.read_csv('processed_data.csv')

# 2. Model Training:
*a. Use the processed data to train a simple recommendation model, based on
the candidate's preference.
b. The model should take a user's query and output the top 3 product
recommendations based on similarity to product descriptions.*

In [314]:
# import libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')


In [315]:
# read the processed data
df = pd.read_csv('processed_data.csv')
df.head()

Unnamed: 0,Product ID,Product Name,Description,Category,Price,Tokenized_Description
0,1,Smartphone A,"Sleek design, 64GB storage, 12MP camera",Electronics,699.99,"['sleek', 'design', '64gb', 'storage', '12mp',..."
1,2,Laptop B,"15.6 inch, 8GB RAM, 256GB SSD",Electronics,999.99,"['15.6', 'inch', '8gb', 'ram', '256gb', 'ssd']"
2,3,Casual Shoes C,"Leather, size 10, brown",Fashion,79.99,"['leather', 'size', '10', 'brown']"
3,4,Travel Mug D,"Stainless steel, 500ml",Home & Kitchen,14.99,"['stainless', 'steel', '500ml']"
4,5,Eau De Parfum E,"Floral scent, 100ml",Beauty,49.99,"['floral', 'scent', '100ml']"


In [316]:
# create a tfidf vectorizer object
tfidf = TfidfVectorizer()


In [317]:
# fit the vectorizer object on the tokenized product descriptions
tfidf.fit(df['Tokenized_Description'])

In [318]:
# transform the tokenized product descriptions
tfidf_matrix = tfidf.transform(df['Tokenized_Description'])


In [319]:
# compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [320]:
# save the cosine similarity matrix to a csv file
pd.DataFrame(cosine_sim).to_csv('cosine_sim.csv', index=False, header=False)

In [321]:
# read the cosine similarity matrix
cosine_sim = pd.read_csv('cosine_sim.csv', header=None)

In [322]:
# create a series of product names
product_names = pd.Series(df['Product Name'])

#create a series of product descriptions
product_descriptions = pd.Series(df['Description'])
# price
product_price = pd.Series(df['Price'])

In [323]:
# function to recommend products based on user query
def recommend_products(query):
    # if the query is not in the product names, return an error message
    # tokenize the user query
    query = word_tokenize(query)
    # transform the tokenized query
    query = tfidf.transform(query)
    # compute the cosine similarity between the user query and all product descriptions
    similarity_scores = cosine_similarity(query, tfidf_matrix)
    # get the indices of the top 3 most similar products
    indices = similarity_scores.argsort()[0][-3:]
    # get the product names corresponding to the indices
    product_names = df['Product Name'].iloc[indices]
    # get the product descriptions corresponding to the indices
    product_descriptions = df['Description'].iloc[indices]
    # get the product prices corresponding to the indices
    product_prices = df['Price'].iloc[indices]
    # return the product names
    return product_names, product_descriptions, product_prices

In [324]:
# test the function
recommend_products('leather')

(9      Wrist Watch J
 2     Casual Shoes C
 28        Sandals H1
 Name: Product Name, dtype: object,
 9     Analog, leather strap, water-resistant
 2                    Leather, size 10, brown
 28                         Leather, size 7, 
 Name: Description, dtype: object,
 9     199.99
 2      79.99
 28     49.99
 Name: Price, dtype: float64)

In [325]:
recommend_products('Breathable') 

(30            Pet Food K1
 31    Portable Charger L1
 10     Running Sneakers K
 Name: Product Name, dtype: object,
 30        For adult dogs, chicken flavor
 31                 10000mAh, 2 USB ports
 10    Breathable material, size 9, white
 Name: Description, dtype: object,
 30     54.99
 31     24.99
 10    109.99
 Name: Price, dtype: float64)