In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

## Dependencies

In [42]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pymongo
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from config import mongo_conn

## Load Perfume Data

In [32]:
# Initialize PyMongo to work with MongoDBs
client = pymongo.MongoClient(mongo_conn)
db = client.perfume_db
perfume_df = pd.DataFrame(list(db.perfume_data.find()))
perfume_df.head()

Unnamed: 0,_id,brand,title,date,accords,rating_score,votes,longevity_poor,longevity_weak,longevity_moderate,...,notes_12,notes_13,notes_14,notes_15,notes_16,notes_17,notes_18,notes_19,notes_20,gender
0,603987b919085a1bd5bffcbe,The-Spirit-of-Dubai,Aamal The Spirit of Dubai for women and men,2017-01-01 00:00:00,"woody,earthy,animalic,amber,musky,balsamic",5.0,3,0,0,0,...,Base3Moss,Base4Agarwood (Oud),Base5Indian Oud,,,,,,,women
1,603987b919085a1bd5bffcbf,Ajmal,Aatifa Ajmal for women and men,2014-01-01 00:00:00,"fresh spicy,woody,musky,rose,amber",4.2,10,1,0,0,...,,,,,,,,,,women
2,603987b919085a1bd5bffcc0,Al-Jazeera-Perfumes,AA Al-Jazeera Perfumes for women and men,2000-01-01 05:00:00,"rose,woody,musky,oud,fruity",0.0,0,0,0,0,...,,,,,,,,,,women
3,603987b919085a1bd5bffcc1,Art-of-Scent-Swiss-Perfumes,aarewasser Art of Scent - Swiss Perfumes for w...,2010-01-01 00:00:00,"white floral,green,ozonic,fresh,animalic",0.0,1,0,0,0,...,,,,,,,,,,women
4,603987b919085a1bd5bffcc2,Hamidi-Oud-Perfumes,Aaliyah Hamidi Oud & Perfumes for women and men,2000-01-01 05:00:00,"woody,warm spicy,amber,balsamic,musky",0.0,2,0,0,0,...,,,,,,,,,,women


### Preprocess Data

In [41]:
# perfume_df['accords'] = perfume_df['accords'].tolist()
# perfume_df.head()
# df2 = pd.DataFrame(perfume_df.accords.values.tolist())
# df2 = perfume_df.accords.apply(pd.Series)
# df2


In [52]:
# Assign X (data) and y (target)
original_X = perfume_df.drop(["_id", "brand", "title", "date", "accords", "rating_score", "votes", "gender"], axis=1)
y = perfume_df["rating_score"].values.reshape(-1, 1)
print(X.shape, y.shape)

(51212, 34644) (51212, 1)


### Dummy Encoding (Binary Encoded Data)

In [53]:
data = original_X.copy()
X = pd.get_dummies(data)
X.head()

Unnamed: 0,longevity_poor,longevity_weak,longevity_moderate,longevity_long,longevity_very_long,sillage_soft,sillage_moderate,sillage_heavy,sillage_enormous,notes_1_Middle0Acácia,...,notes_20_Top20Violet,notes_20_Top20White Musk,notes_20_Top20Yuzu,notes_20_Top20iris,notes_20_Top20resins,notes_20_Top20white honey,gender_,gender_man,gender_unisex,gender_women
0,0,0,0,0,3,0,1,0,3,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,5,0,0,3,3,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,2,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [55]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [56]:
# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [57]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
# Create a LinearRegression model and fit it to the scaled training data

model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)


In [None]:
# Used X_test_scaled, y_test_scaled, and model.predict(X_test_scaled) to calculate MSE and R2

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# LASSO model
lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = lasso.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = lasso.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# Ridge model
ridge = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = ridge.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# ElasticNet model
elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = elasticnet.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")