In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Load only the columns this analysis uses from the cleaned CSV.
file_path = Path("../Resources/beauty_habits_clean.csv")
columns_to_load = [
    "id", "brand", "name", "category", "rating", "number_of_reviews", "love",
    "price", "online_only", "exclusive", "limited_edition", "good_rating",
    "product_type",
]
df = pd.read_csv(file_path, usecols=columns_to_load)
df

Unnamed: 0,id,brand,category,name,rating,number_of_reviews,love,price,online_only,exclusive,limited_edition,product_type,good_rating
0,2218774,Acqua Di Parma,Fragrance,Blu Mediterraneo MINIATURE Set,4.0,4,3002,66.0,1,0,0,Fragrance,True
1,2044816,Acqua Di Parma,Cologne,Colonia,4.5,76,2700,66.0,1,0,0,Fragrance,True
2,1417567,Acqua Di Parma,Perfume,Arancia di Capri,4.5,26,2600,180.0,1,0,0,Fragrance,True
3,1417617,Acqua Di Parma,Perfume,Mirto di Panarea,4.5,23,2900,120.0,1,0,0,Fragrance,True
4,2218766,Acqua Di Parma,Fragrance,Colonia Miniature Set,3.5,2,943,72.0,1,0,0,Fragrance,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9163,2208502,SEPHORA COLLECTION,Face Masks,The Rose Gold Mask,2.0,15,6200,6.0,0,1,1,Treatment,False
9164,2298909,SEPHORA COLLECTION,Lip Sets,Give Me Some Sugar Colorful Gloss Balm Set,0.0,0,266,15.0,0,1,0,Colour,False
9165,2236750,SEPHORA COLLECTION,Tinted Moisturizer,Weekend Warrior Tone Up Cream,0.0,0,445,16.0,0,1,0,Treatment,False
9166,50,SEPHORA COLLECTION,no category,Gift Card,5.0,46,0,50.0,0,0,0,no category,True


In [3]:
# Confirm the dtype pandas assigned to each loaded column.
df.dtypes

id                     int64
brand                 object
category              object
name                  object
rating               float64
number_of_reviews      int64
love                   int64
price                float64
online_only            int64
exclusive              int64
limited_edition        int64
product_type          object
good_rating             bool
dtype: object

In [4]:
# Print the (rows, columns) shape, then show summary statistics as the cell output.
print(df.shape)
df.describe()

(9168, 13)


Unnamed: 0,id,rating,number_of_reviews,love,price,online_only,exclusive,limited_edition
count,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0
mean,1962952.0,3.99002,282.13918,16278.59,50.063237,0.234839,0.264725,0.091841
std,385971.4,1.007707,890.642028,42606.51,47.164989,0.423921,0.441211,0.288817
min,50.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,1819453.0,4.0,10.0,1600.0,24.0,0.0,0.0,0.0
50%,2072354.0,4.0,46.0,4800.0,35.0,0.0,0.0,0.0
75%,2230591.0,4.5,210.0,13800.0,59.0,0.0,1.0,0.0
max,2359685.0,5.0,19000.0,1300000.0,549.0,1.0,1.0,1.0


In [5]:
# Encode "good_rating" (False -> 0, True -> 1) and "product_type" as integers.
# Each column gets its own encoder: refitting a single shared LabelEncoder
# would overwrite its classes_, leaving the first column impossible to decode.
good_rating_encoder = LabelEncoder()
df['good_rating'] = good_rating_encoder.fit_transform(df['good_rating'])

# `le` ends up fitted on product_type, so le.classes_ / le.inverse_transform
# translate the integer codes in that column back to the original labels.
le = LabelEncoder()
df["product_type"] = le.fit_transform(df['product_type'])

In [6]:
# Create X as a copy of df, so the feature frame is a separate object from df.
X = df.copy()
X

Unnamed: 0,id,brand,category,name,rating,number_of_reviews,love,price,online_only,exclusive,limited_edition,product_type,good_rating
0,2218774,Acqua Di Parma,Fragrance,Blu Mediterraneo MINIATURE Set,4.0,4,3002,66.0,1,0,0,1,1
1,2044816,Acqua Di Parma,Cologne,Colonia,4.5,76,2700,66.0,1,0,0,1,1
2,1417567,Acqua Di Parma,Perfume,Arancia di Capri,4.5,26,2600,180.0,1,0,0,1,1
3,1417617,Acqua Di Parma,Perfume,Mirto di Panarea,4.5,23,2900,120.0,1,0,0,1,1
4,2218766,Acqua Di Parma,Fragrance,Colonia Miniature Set,3.5,2,943,72.0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9163,2208502,SEPHORA COLLECTION,Face Masks,The Rose Gold Mask,2.0,15,6200,6.0,0,1,1,3,0
9164,2298909,SEPHORA COLLECTION,Lip Sets,Give Me Some Sugar Colorful Gloss Balm Set,0.0,0,266,15.0,0,1,0,0,0
9165,2236750,SEPHORA COLLECTION,Tinted Moisturizer,Weekend Warrior Tone Up Cream,0.0,0,445,16.0,0,1,0,3,0
9166,50,SEPHORA COLLECTION,no category,Gift Card,5.0,46,0,50.0,0,0,0,4,1


In [7]:
# Define the features set.
# Built directly from df (not from the previous X) so re-running this cell is
# safe: dropping from an already-trimmed X would raise a KeyError.
# Identifier/text columns are excluded, as are the target good_rating and
# rating (presumably the source of good_rating — TODO confirm).
X = df.drop(columns=["id", "category", "brand", "name", "rating", "good_rating"])
X.head()

Unnamed: 0,number_of_reviews,love,price,online_only,exclusive,limited_edition,product_type
0,4,3002,66.0,1,0,0,1
1,76,2700,66.0,1,0,0,1
2,26,2600,180.0,1,0,0,1
3,23,2900,120.0,1,0,0,1
4,2,943,72.0,1,0,0,1


In [8]:
# Define the target set.
# to_numpy() yields the 1-D label array directly; Series.ravel() is deprecated
# in recent pandas releases and would emit a FutureWarning here.
y = df["good_rating"].to_numpy()
y[:5]

array([1, 1, 1, 1, 0])

In [9]:
# Hold out part of the data for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (462 vs 1830 test rows in the classification report) — consider stratify=y.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)


In [11]:
# Build the (still unfitted) random forest: 500 trees, seeded for repeatable results.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)


In [12]:
# Train the random forest on the scaled training features and the training labels.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [13]:
# Predict a class label for every row of the scaled test set;
# the bare name on the last line displays the resulting array.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 1, 1, ..., 1, 0, 1])

In [14]:
# Compare true test labels with the predictions.
cm = confusion_matrix(y_test, predictions)

# Wrap the raw matrix in a labelled DataFrame: rows are actual classes,
# columns are predicted classes.
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,124,338
Actual 1,51,1779


In [15]:
# Accuracy: the fraction of test rows whose predicted label matches the true label.
acc_score=accuracy_score(y_test, predictions)

In [16]:
# Show the evaluation summary: confusion matrix table, overall accuracy,
# then the per-class precision/recall/F1 report.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    report_text,
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,124,338
Actual 1,51,1779


Accuracy Score : 0.8302792321116929
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.27      0.39       462
           1       0.84      0.97      0.90      1830

    accuracy                           0.83      2292
   macro avg       0.77      0.62      0.65      2292
weighted avg       0.81      0.83      0.80      2292



In [17]:
# Read the fitted forest's importance score for each feature
# (one value per column of X, in column order).
importances = rf_model.feature_importances_
importances

array([0.37649842, 0.31040092, 0.22908968, 0.01421316, 0.01695169,
       0.0111554 , 0.04169072])

In [18]:
# Pair each importance score with its column name, then rank from most to
# least important (tuples compare on the score first, then on the name).
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.3764984191742538, 'number_of_reviews'),
 (0.31040092495047267, 'love'),
 (0.2290896794601057, 'price'),
 (0.041690721100659855, 'product_type'),
 (0.016951693146681688, 'exclusive'),
 (0.014213158188177769, 'online_only'),
 (0.011155403979648431, 'limited_edition')]