In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

## Dependencies

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pymongo
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from config import mongo_conn

## Load Perfume Data

In [2]:
# Connect to MongoDB and read every document of the `perfume_data2`
# collection (database `perfume_db`) into a DataFrame.
client = pymongo.MongoClient(mongo_conn)
db = client["perfume_db"]
perfume_df = pd.DataFrame(list(db["perfume_data2"].find()))
perfume_df.head()

Unnamed: 0,_id,name,company,image,for_gender,rating,main accords,description,top notes,middle notes,base notes,longevity,sillage,gender_vote,price value
0,603a88def6b1c8369675d848,Les Exclusifs de Chanel Bel Respiro,Chanel,https://fimgs.net/mdimg/perfume/375x500.5298.jpg,for women,4.0,"{'citrus': 100, 'white floral': 81.3333, 'wood...",Eau perlee by Jardin de France is a Floral Fru...,"[Mandarin Orange, Petitgrain, Violet, Amalfi L...","[Jasmine, Woodsy Notes, African Orange flower,...","[Musk, Patchouli]","{'very weak': 0, 'weak': 0, 'moderate': 0, 'lo...","{'intimate': 0, 'moderate': 0, 'strong': 0, 'e...","{'female': 0, 'more female': 0, 'unisex': 0, '...","{'way overpriced': 0, 'overpriced': 0, 'ok': 0..."
1,603a88def6b1c8369675d849,Les Exclusifs de Chanel Bel Respiro,Chanel,https://fimgs.net/mdimg/perfume/375x500.7913.jpg,for men,4.22,"{'leather': 100, 'woody': 81.6293, 'animalic':...",Knize Ten by Knize is a Leather fragrance for ...,"[Geranium, Rosemary, Bergamot, Orange, Petitgr...","[Carnation, Patchouli, Sandalwood, Orris, tinc...","[Leather, Castoreum, oak moss, Amber, Musk, Va...","{'very weak': 19, 'weak': 16, 'moderate': 53, ...","{'intimate': 37, 'moderate': 138, 'strong': 15...","{'female': 0, 'more female': 0, 'unisex': 6, '...","{'way overpriced': 0, 'overpriced': 2, 'ok': 1..."
2,603a88def6b1c8369675d84a,Les Exclusifs de Chanel Bel Respiro,Chanel,https://fimgs.net/mdimg/perfume/375x500.5521.jpg,for men,4.44,"{'aromatic': 100, 'warm spicy': 86.2226, 'lave...",La Nuit de l'Homme by Yves Saint Laurent is a ...,[Cardamom],"[Lavender, Virginia Cedar, Bergamot]","[Vetiver, Caraway]","{'very weak': 355, 'weak': 516, 'moderate': 20...","{'intimate': 952, 'moderate': 3000, 'strong': ...","{'female': 7, 'more female': 4, 'unisex': 63, ...","{'way overpriced': 45, 'overpriced': 112, 'ok'..."
3,603a88def6b1c8369675d84b,Les Exclusifs de Chanel Bel Respiro,Chanel,https://fimgs.net/mdimg/perfume/375x500.25324.jpg,for women,3.93,"{'vanilla': 100, 'coffee': 88.2484, 'sweet': 7...",Black Opium by Yves Saint Laurent is a Orienta...,"[Pear, Pink Pepper, Orange Blossom]","[Coffee, Jasmine, Bitter Almond, Licorice]","[Vanilla, Patchouli, Cedar, Cashmere Wood]","{'very weak': 270, 'weak': 371, 'moderate': 14...","{'intimate': 594, 'moderate': 1973, 'strong': ...","{'female': 554, 'more female': 100, 'unisex': ...","{'way overpriced': 49, 'overpriced': 199, 'ok'..."
4,603a88def6b1c8369675d84c,Les Exclusifs de Chanel Bel Respiro,Chanel,https://fimgs.net/mdimg/perfume/375x500.31861.jpg,for men,3.89,"{'fresh spicy': 100, 'amber': 75.7735, 'citrus...",Sauvage by Christian Dior is a Aromatic Fouge...,"[Calabrian bergamot, Pepper]","[Sichuan Pepper, Lavender, Pink Pepper, Vetive...","[Ambroxan, Cedar, Labdanum]","{'very weak': 232, 'weak': 213, 'moderate': 95...","{'intimate': 466, 'moderate': 2075, 'strong': ...","{'female': 7, 'more female': 1, 'unisex': 26, ...","{'way overpriced': 75, 'overpriced': 210, 'ok'..."


## Preprocess Data

### Weighted Rating

In [4]:
# Weighted rating: combines a perfume's own mean rating with the dataset-wide
# mean, weighted by how many votes the perfume received.
#
#   WR = (v / (v + m)) * R + (m / (v + m)) * C
#
#   v : number of votes for the perfume
#   m : minimum number of votes required (the prerequisite)
#   R : mean rating of the perfume
#   C : mean rating of all the perfumes in the dataset

# m is meant to be the vote count of the 80th-percentile perfume, i.e.
# perfume_df['votes'].quantile(0.80); that computation is disabled and the
# value is hard-coded instead.
m = 56

# Dataset-wide mean rating
C = perfume_df.loc[:, "rating"].mean()

# Function to compute the weighted rating for each perfume
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of a single perfume row.

    WR = (v / (v + m)) * R + (m / (v + m)) * C

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must contain 'rating'; may contain 'votes'.
    m : number
        Minimum number of votes required (defaults to the module-level ``m``).
    C : number
        Mean rating over all perfumes (defaults to the module-level ``C``).

    Returns
    -------
    number
        The weighted score for the row.
    """
    R = x['rating']
    # v is the number of votes. The previous code always substituted the rating
    # itself, which makes the weight v/(v+m) independent of popularity. Use the
    # real vote count whenever the row provides one.
    # NOTE(review): rows without a 'votes' entry still fall back to the rating so
    # existing data keeps working - the score is only a placeholder in that case.
    v = x['votes'] if 'votes' in x else R
    # Compute the weighted score
    return (v/(v+m) * R) + (m/(m+v) * C)

In [5]:
# Score every row with weighted_rating() and store the result in a new column
perfume_df["weighted_rating"] = perfume_df.apply(weighted_rating, axis="columns")
perfume_df.loc[:, "weighted_rating"].head()


0    4.079333
1    4.094460
2    4.111079
3    4.074836
4    4.072334
Name: weighted_rating, dtype: float64

In [7]:
# Perfumes ranked by weighted rating, highest first
# (a "votes" column is not part of this view)
perfume_df.loc[:, ["name", "rating", "weighted_rating"]].sort_values(
    by="weighted_rating", ascending=False
)

Unnamed: 0,name,rating,weighted_rating
8,Les Exclusifs de Chanel Bel Respiro,4.47,4.11346
2,Les Exclusifs de Chanel Bel Respiro,4.44,4.111079
1,Les Exclusifs de Chanel Bel Respiro,4.22,4.09446
9,Les Exclusifs de Chanel Bel Respiro,4.13,4.088091
5,Les Exclusifs de Chanel Bel Respiro,4.02,4.080646
0,Les Exclusifs de Chanel Bel Respiro,4.0,4.079333
7,Les Exclusifs de Chanel Bel Respiro,3.98,4.078033
3,Les Exclusifs de Chanel Bel Respiro,3.93,4.074836
4,Les Exclusifs de Chanel Bel Respiro,3.89,4.072334
6,Les Exclusifs de Chanel Bel Respiro,3.77,4.065131


### MultiLabelBinarizer

In [8]:
# Convert each list-valued notes column into one indicator column per note
mlb = MultiLabelBinarizer()


def _attach_indicators(frame, matrix, names):
    """Return `frame` with the indicator columns `names` (values from `matrix`) joined on.

    The indicator frame reuses `frame.index`, so the join stays row-aligned even
    when the index is not 0..n-1. Same-named columns left over from an earlier
    run of this cell are dropped first, so the cell can be re-executed without
    `DataFrame.join` raising a column-overlap error.
    """
    indicators = pd.DataFrame(matrix, columns=names, index=frame.index)
    return frame.drop(columns=names, errors="ignore").join(indicators)


# Top Notes
X_top_notes = mlb.fit_transform(perfume_df["top notes"])
column_names = ["top_note_" + note for note in mlb.classes_]
perfume_df = _attach_indicators(perfume_df, X_top_notes, column_names)

# Middle Notes
X_middle_notes = mlb.fit_transform(perfume_df["middle notes"])
column_names = ["middle_note_" + note for note in mlb.classes_]
perfume_df = _attach_indicators(perfume_df, X_middle_notes, column_names)

# Base notes
X_base_notes = mlb.fit_transform(perfume_df["base notes"])
column_names = ["base_note_" + note for note in mlb.classes_]
perfume_df = _attach_indicators(perfume_df, X_base_notes, column_names)

perfume_df.columns

['top_note_Amalfi Lemon', 'top_note_Apple', 'top_note_Bergamot', 'top_note_Black Currant', 'top_note_Calabrian bergamot', 'top_note_Cardamom', 'top_note_Geranium', 'top_note_Grapefruit', 'top_note_Green Apple', 'top_note_Lavender', 'top_note_Lemon', 'top_note_Mandarin Orange', 'top_note_Mint', 'top_note_Orange', 'top_note_Orange Blossom', 'top_note_Pear', 'top_note_Pepper', 'top_note_Petitgrain', 'top_note_Pink Pepper', 'top_note_Rosemary', 'top_note_Violet']
['Amalfi Lemon' 'Apple' 'Bergamot' 'Black Currant' 'Calabrian bergamot'
 'Cardamom' 'Geranium' 'Grapefruit' 'Green Apple' 'Lavender' 'Lemon'
 'Mandarin Orange' 'Mint' 'Orange' 'Orange Blossom' 'Pear' 'Pepper'
 'Petitgrain' 'Pink Pepper' 'Rosemary' 'Violet']


Index(['_id', 'name', 'company', 'image', 'for_gender', 'rating',
       'main accords', 'description', 'top notes', 'middle notes',
       'base notes', 'longevity', 'sillage', 'gender_vote', 'price value',
       'weighted_rating', 'top_note_Amalfi Lemon', 'top_note_Apple',
       'top_note_Bergamot', 'top_note_Black Currant',
       'top_note_Calabrian bergamot', 'top_note_Cardamom', 'top_note_Geranium',
       'top_note_Grapefruit', 'top_note_Green Apple', 'top_note_Lavender',
       'top_note_Lemon', 'top_note_Mandarin Orange', 'top_note_Mint',
       'top_note_Orange', 'top_note_Orange Blossom', 'top_note_Pear',
       'top_note_Pepper', 'top_note_Petitgrain', 'top_note_Pink Pepper',
       'top_note_Rosemary', 'top_note_Violet',
       'middle_note_African Orange flower', 'middle_note_Ambroxan',
       'middle_note_Bergamot', 'middle_note_Bitter Almond',
       'middle_note_Caramel', 'middle_note_Carnation', 'middle_note_Cinnamon',
       'middle_note_Coffee', 'middle_note_Geran

In [None]:
# Inspect the type of the matrix returned by mlb.fit_transform for the top notes
type(X_top_notes)

### DictVectorizer

In [None]:
# Convert the per-perfume accords dicts (accord name -> numeric value) into one
# numeric feature column per accord.
dv = DictVectorizer(sparse=False)
# The column is named "main accords" (with a space), so it must be read with
# bracket indexing; attribute access (`perfume_df.main_accords`) raises
# AttributeError. DictVectorizer needs one mapping per sample, so pass the
# dicts themselves rather than a list wrapping the whole Series. Rows whose
# value is not a dict (e.g. missing data) contribute no accord features.
X_accords = dv.fit_transform(
    [accords if isinstance(accords, dict) else {} for accords in perfume_df["main accords"]]
)
X_accords
# TODO: join the accord features onto perfume_df once they are needed, e.g.
# perfume_df = perfume_df.join(
#     pd.DataFrame(X_accords, columns=dv.get_feature_names_out(), index=perfume_df.index)
# )
# perfume_df.columns


### X (data) and y (target)

In [None]:
# Separate the feature frame (original_X) from the target (y).
#
# The column names follow the frame loaded from `perfume_data2`. The names used
# before ("brand", "title", "rating_score", "votes", "notes_N", ...) are not in
# that frame, so both the drop and the target lookup raised KeyError.
non_feature_columns = [
    # identifiers, free text and metadata
    # ("company" / "for_gender" mirror the former "brand" / "gender" exclusions - TODO confirm)
    "_id", "name", "company", "image", "description", "for_gender",
    # raw list / dict containers (the notes are already expanded into indicator columns)
    "main accords", "top notes", "middle notes", "base notes",
    "longevity", "sillage", "gender_vote", "price value",
    # the target itself and the score derived from it
    "rating", "weighted_rating",
]
original_X = perfume_df.drop(columns=non_feature_columns)
y = perfume_df["rating"].values.reshape(-1, 1)
# `X` is only created in the next cell, so report the frame built here instead
print(original_X.shape, y.shape)

### Dummy Encoding (Binary Encoded Data)

In [None]:
# One-hot encode the categorical columns of a copy of the feature frame,
# leaving original_X itself untouched.
data = original_X.copy(deep=True)
X = pd.get_dummies(data=data)
X.head(n=5)

## Train and Test data

In [None]:
# Hold out a test split with a fixed seed.
# The results are bound to the *_scaled names used by the model cells;
# this cell itself applies no scaling.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Create a StandardScaler model and fit it to the training data

# X_scaler = StandardScaler().fit(X_train)
# y_scaler = StandardScaler().fit(y_train)

In [None]:
# Transform the training and testing data using the X_scaler and y_scaler models

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)
# y_train_scaled = y_scaler.transform(y_train)
# y_test_scaled = y_scaler.transform(y_test)

## Linear Regression Model

In [None]:
# Fit an ordinary least-squares model on the training split
model = LinearRegression().fit(X_train_scaled, y_train_scaled)
model


### Mean Squared Error and R2

In [None]:
# Evaluate the linear model on the held-out split: R2 via model.score and
# MSE between the true targets and model.predict(X_test_scaled)
r2 = model.score(X_test_scaled, y_test_scaled)
predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Lasso Model

In [None]:
# Fit a Lasso model (alpha=0.01) and report its MSE and R2 on the test split
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train_scaled)

r2 = lasso.score(X_test_scaled, y_test_scaled)
predictions = lasso.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Ridge Model

In [None]:
# Fit a Ridge model (alpha=0.01) and report its MSE and R2 on the test split
ridge = Ridge(alpha=0.01)
ridge.fit(X_train_scaled, y_train_scaled)

r2 = ridge.score(X_test_scaled, y_test_scaled)
predictions = ridge.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### ElasticNet model

In [None]:
# Fit an ElasticNet model (alpha=0.01) and report its MSE and R2 on the test split
elasticnet = ElasticNet(alpha=0.01)
elasticnet.fit(X_train_scaled, y_train_scaled)

r2 = elasticnet.score(X_test_scaled, y_test_scaled)
predictions = elasticnet.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Save the Model