In [17]:
import pandas as pd
import ast

In [18]:
df = pd.read_csv("perfume_data_final.csv")

In [19]:
df.shape

(478, 13)

## Notes Preprocessing

In [20]:
def convert_to_list(string):
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError):
        return []

In [21]:
# Convert the note columns to lists
df['top notes'] = df['top notes'].apply(convert_to_list)
df['middle notes'] = df['middle notes'].apply(convert_to_list)
df['base notes'] = df['base notes'].apply(convert_to_list)


In [22]:
# Get a list of all unique notes
all_notes = set()
for column in ['top notes', 'middle notes', 'base notes']:
    for notes_list in df[column]:
        all_notes.update(notes_list)
        

In [23]:
# Create a column for each unique note and fill with 0
note_columns = pd.DataFrame(0, index=df.index, columns=list(all_notes))
print(note_columns.shape)

(478, 491)


In [25]:
# Fill the columns with 1 where the note is present
for column in ['top notes', 'middle notes', 'base notes']:
    for i, notes_list in enumerate(df[column]):
        for note in notes_list:
            note_columns.at[i, note] = 1

In [26]:
# Concatenate the original DataFrame with the new one-hot encoded columns
df_combined = pd.concat([df.drop(columns=['top notes', 'middle notes', 'base notes']), note_columns], axis=1)

In [27]:
df_combined.head()

Unnamed: 0,name,company,for_gender,rating,number_votes,main accords,longevity,sillage,gender_vote,price value,...,Narcissus,Cinnamon,French labdanum,Tea,Verbena,Java vetiver oil,Oakmoss,Lemongrass,Myrhh,Fig Nectar
0,Angels' Share,By Kilian,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'...",...,0,1,0,0,0,0,0,0,0,0
1,My Way,Giorgio Armani,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'...",...,0,0,0,0,0,0,0,0,0,0
2,Libre Intense,Yves Saint Laurent,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':...",...,0,0,0,0,0,0,0,0,0,0
3,Dior Homme 2020,Christian Dior,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1...","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':...",...,0,0,0,0,0,0,0,0,0,0
4,Acqua di Giò Profondo,Giorgio Armani,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':...",...,0,0,0,0,0,0,0,0,0,0
