In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

## Dependencies

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pymongo
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from config import mongo_conn

## Load Perfume Data

In [2]:
# Connect to MongoDB using the connection string imported from config.
client = pymongo.MongoClient(mongo_conn)

# Select the perfume database and read every document of its
# `perfume_data` collection into a DataFrame (one row per document).
db = client["perfume_db"]
perfume_df = pd.DataFrame(list(db["perfume_data"].find()))

# Preview the first rows.
perfume_df.head()

Unnamed: 0,_id,brand,title,date,accords,rating_score,votes,longevity_poor,longevity_weak,longevity_moderate,...,notes_12,notes_13,notes_14,notes_15,notes_16,notes_17,notes_18,notes_19,gender,notes_20
0,603987b919085a1bd5bffcbe,The-Spirit-of-Dubai,Aamal The Spirit of Dubai for women and men,2017-01-01 00:00:00,"woody,earthy,animalic,amber,musky,balsamic",5.0,3,0,0,0,...,Base3Moss,Base4Agarwood (Oud),Base5Indian Oud,,,,,,women,
1,603987b919085a1bd5bffcbf,Ajmal,Aatifa Ajmal for women and men,2014-01-01 00:00:00,"fresh spicy,woody,musky,rose,amber",4.2,10,1,0,0,...,,,,,,,,,women,
2,603987b919085a1bd5bffcc0,Al-Jazeera-Perfumes,AA Al-Jazeera Perfumes for women and men,2000-01-01 05:00:00,"rose,woody,musky,oud,fruity",0.0,0,0,0,0,...,,,,,,,,,women,
3,603987b919085a1bd5bffcc1,Art-of-Scent-Swiss-Perfumes,aarewasser Art of Scent - Swiss Perfumes for w...,2010-01-01 00:00:00,"white floral,green,ozonic,fresh,animalic",0.0,1,0,0,0,...,,,,,,,,,women,
4,603987b919085a1bd5bffcc2,Hamidi-Oud-Perfumes,Aaliyah Hamidi Oud & Perfumes for women and men,2000-01-01 05:00:00,"woody,warm spicy,amber,balsamic,musky",0.0,2,0,0,0,...,,,,,,,,,women,


## Preprocess Data

### Weighted Rating

In [4]:
# The weighted rating (WR) takes both a perfume's mean rating and its number
# of reviews into consideration:
#
#   WR = (v / (v + m)) * R + (m / (v + m)) * C
#
# where
#   v = number of votes for the perfume
#   m = minimum number of votes required (the prerequisite)
#   R = mean rating of the perfume
#   C = mean rating of all the perfumes in the dataset

# m: number of votes garnered by the 80th-percentile perfume
m = perfume_df["votes"].quantile(0.80)

# C: mean of `rating_score` over the whole dataset
C = perfume_df["rating_score"].mean()

# Weighted-rating helper applied to each perfume
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating for `x`.

    Parameters
    ----------
    x : object indexable by 'votes' and 'rating_score'
        The perfume record to score (e.g. a DataFrame row).
    m : number, optional
        Minimum-votes prerequisite; defaults to the notebook-level `m`
        as it was when this function was defined.
    C : number, optional
        Dataset-wide mean rating; defaults to the notebook-level `C`
        as it was when this function was defined.

    Returns
    -------
    (v / (v + m)) * R + (m / (v + m)) * C, with v = x['votes'] and
    R = x['rating_score'].
    """
    votes = x['votes']
    rating = x['rating_score']
    # Both terms share the same denominator: votes received plus prerequisite.
    total = votes + m
    own_share = votes / total * rating
    prior_share = m / total * C
    return own_share + prior_share

In [5]:
# Compute the weighted rating for every perfume with the weighted_rating
# function defined above.
# weighted_rating only indexes x['votes'] / x['rating_score'] and does
# arithmetic on them, so passing the whole DataFrame evaluates the formula
# column-wise in one vectorized step instead of calling Python once per row
# through apply(axis=1). The resulting values are the same.
perfume_df["weighted_rating"] = weighted_rating(perfume_df)
perfume_df["weighted_rating"].head()


0    3.277110
1    3.334296
2    3.192377
3    3.140887
4    3.091032
Name: weighted_rating, dtype: float64

In [12]:
# Show brand, raw rating, vote count and weighted rating, best-weighted first.
perfume_df.loc[:, ["brand", "rating_score", "votes", "weighted_rating"]].sort_values(
    by="weighted_rating", ascending=False
)

Unnamed: 0,brand,rating_score,votes,weighted_rating
30288,Christian-Dior,4.52,4940,4.503806
31194,Dolce-Gabbana,4.52,2637,4.489983
30956,Guerlain,4.52,1432,4.465757
8145,Chanel,4.54,930,4.457048
15929,Chanel,4.48,2277,4.446405
...,...,...,...,...
39153,Fragrance-One,2.37,464,2.465552
13001,Exceptional-Parfums,2.19,327,2.347590
25507,Clean,1.94,130,2.339974
27649,Trump,1.98,418,2.134395


### MultiLabelBinarizer

In [None]:
# Expand the per-perfume notes into one indicator column per distinct note.
# NOTE(review): assumes perfume_df has a `notes` column holding an iterable of
# note labels for each row — confirm against the loaded data.
mlb = MultiLabelBinarizer()
X_notes = mlb.fit_transform(perfume_df["notes"])

# Append the indicator columns (named after the binarizer's classes) to the
# frame; the join aligns on the row index.
perfume_df = perfume_df.join(
    pd.DataFrame(X_notes, columns=mlb.classes_)
)
perfume_df.columns

### DictVectorizer

In [None]:
# Convert each perfume's accords into one feature column per accord.
def _accords_to_dict(accords):
    """Return a {accord_name: value} mapping for one row's `accords` entry.

    DictVectorizer requires a mapping per sample. The loaded data preview
    shows accords as comma-separated strings, so those are split into
    {name: 1} indicators; values that are already dicts pass through
    unchanged, and anything else (e.g. a missing value) yields an empty
    mapping, i.e. an all-zero feature row.
    """
    if isinstance(accords, dict):
        return accords
    if isinstance(accords, str):
        return {name.strip(): 1 for name in accords.split(",") if name.strip()}
    return {}


# sparse=False makes fit_transform return a dense array that pd.DataFrame can
# wrap directly (the default is a SciPy sparse matrix).
dv = DictVectorizer(sparse=False)
X_accords = dv.fit_transform([_accords_to_dict(a) for a in perfume_df["accords"]])

# DictVectorizer exposes its column names as `feature_names_` (it has no
# `classes_` attribute). Reusing perfume_df's index keeps the join row-aligned.
# NOTE: DataFrame.join raises if an accord name equals an existing column name.
perfume_df = perfume_df.join(
    pd.DataFrame(X_accords, columns=dv.feature_names_, index=perfume_df.index)
)
perfume_df.columns

### X (data) and y (target)

In [None]:
# Columns that must not be used as model features:
#   - identifiers / free text / raw source columns: _id, brand, title, date,
#     accords, gender
#   - the target and the vote count: rating_score, votes
#   - weighted_rating: it is computed from rating_score, so keeping it as a
#     feature would leak the target into the inputs
#   - the individual note slots notes_3 .. notes_20 (the original list repeated
#     "notes_10" and left out "notes_20")
non_feature_columns = [
    "_id", "brand", "title", "date", "accords", "rating_score", "votes",
    "gender", "weighted_rating",
] + [f"notes_{i}" for i in range(3, 21)]

original_X = perfume_df.drop(columns=non_feature_columns)

# Target as a 2-D column vector (n_samples, 1).
# NOTE(review): if weighted_rating was meant to be the prediction target,
# switch the column name here — confirm the intended target.
y = perfume_df["rating_score"].values.reshape(-1, 1)

# X is only created in the dummy-encoding cell below, so report the shape of
# the frame that exists at this point.
print(original_X.shape, y.shape)

### Dummy Encoding (Binary Encoded Data)

In [None]:
# Work on a copy so the un-encoded feature frame stays available.
data = original_X.copy(deep=True)

# Dummy-encode the feature frame with pandas' get_dummies.
X = pd.get_dummies(data)

# Preview the encoded features.
X.head()

## Train and Test data

In [None]:
# Split the data into training and testing sets with a fixed seed.
# The *_scaled names are what the model cells use, but the values are the
# raw split: the StandardScaler steps in the following cells are commented out.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Create a StandardScaler model and fit it to the training data

# X_scaler = StandardScaler().fit(X_train)
# y_scaler = StandardScaler().fit(y_train)

In [None]:
# Transform the training and testing data using the X_scaler and y_scaler models

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)
# y_train_scaled = y_scaler.transform(y_train)
# y_test_scaled = y_scaler.transform(y_test)

## Linear Regression Model

In [None]:
# Build a LinearRegression estimator and fit it on the training split
# (fit returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train_scaled, y_train_scaled)

# Display the fitted estimator as the cell output.
model


### Mean Squared Error and R2

In [None]:
# Evaluate the linear model on the held-out split:
# R2 comes from the estimator's own score(), MSE from its predictions.
r2 = model.score(X_test_scaled, y_test_scaled)

predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Lasso Model

In [None]:
# Fit a Lasso regression (alpha = 0.01) on the training split.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train_scaled);

# Score it on the held-out split: R2 via score(), MSE via its predictions.
r2 = lasso.score(X_test_scaled, y_test_scaled)
predictions = lasso.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Ridge Model

In [None]:
# Fit a Ridge regression (alpha = 0.01) on the training split.
ridge = Ridge(alpha=0.01)
ridge.fit(X_train_scaled, y_train_scaled);

# Score it on the held-out split: R2 via score(), MSE via its predictions.
r2 = ridge.score(X_test_scaled, y_test_scaled)
predictions = ridge.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### ElasticNet model

In [None]:
# Fit an ElasticNet regression (alpha = 0.01) on the training split.
elasticnet = ElasticNet(alpha=0.01)
elasticnet.fit(X_train_scaled, y_train_scaled);

# Score it on the held-out split: R2 via score(), MSE via its predictions.
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
predictions = elasticnet.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Save the Model