<table align="center" width=100%>
    <tr>
        <td>
            <div align="left">
                <font color="#21619" size=5px>                   
                    <b>  Content
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

* ##### [1. Install Packages](#packages)
* ##### [2. Import Libraries](#libraries)
* ##### [3. Import Dataset](#data)
* ##### [4. Instantiate Tokenizer and Model](#tokenizer)
* ##### [5. Step by Step process of vectorizing product ingredients](#similarity_process)
  * ##### [5.1 Step 1: Find Compound Number](#step1)
  * ##### [5.2 Step 2: Convert Chemical into Molecular Structure](#step2)
  * ##### [5.3 Step 3: Convert Molecular Structure into Tokens](#step3)
  * ##### [5.4 Step 4: Convert Tokens into Vectors](#step4)
  * ##### [5.1 Step 1: Find Compound Number](#step5)  
* ##### [6. Create Embeddings for Products](#embeddings)
* ##### [7. Similarity Matrix](#matrix)

<a id="packages"></a>
<table align="center" width=100%>
    <tr>
        <td>
            <div align="left">
                <font color="#21619" size=5px>                   
                    <b>  Install Packages
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
!pip install transformers
!pip install pubchempy

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:0

<a id="libraries"></a>
<table align="center" width=100%>
    <tr>
        <td>
            <div align="left">
                <font color="#21619" size=5px>                   
                    <b>  Import Libraries
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Import ast for safely parsing the stringified ingredient lists stored in the dataset
import ast

# Import numpy for vectorized operations on the embedding vectors
import numpy as np

# Import pandas for importing , retrieving and manipulating data from datasets in the form of table
import pandas as pd

# Import Pubchempy for converting chemical names into its molecular structure
import pubchempy as pcp

# Import Autotokenizer for tokenizing the molecular structure of chemicals into integers
# Import Automodel for embedding tokenized chemicals into vectors
from transformers import AutoTokenizer, AutoModel

# Import torch to perform operations with tensor
import torch

# Import cosine to find similarity among products using the vectors
from scipy.spatial.distance import cosine

<a id="data"></a>
<table align="center" width=100%>
    <tr>
        <td>
            <div align="left">
                <font color="#21619" size=5px>                   
                    <b>  Import Dataset
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Load the product catalogue from "product_info.csv" (resolved relative to the
# notebook's working directory) into a DataFrame. Later cells read its
# `product_name` and `ingredients` columns.
products = pd.read_csv("product_info.csv")

<a id="tokenizer"></a>
<table align="center" width=100%>
    <tr>
        <td>
            <div align="left">
                <font color="#21619" size=5px>                   
                    <b>  Instantiate Tokenizer and Model
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model. Keeping it
# in one constant guarantees both are always loaded from the same checkpoint.
MODEL_CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"

# Create a tokenizer which converts chemicals into integers using a pretrained chemical tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Create a model which embeds tokenized chemicals into vectors using a pretrained chemical model
model = AutoModel.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

<a id="similarity_process"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=5px>                   
                    <b>  Step by Step process of vectorizing product ingredients
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

<a id="step1"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=4px>                   
                    <b>  Step 1: Find Compound Number
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Look the chemical up on PubChem by name and keep the first match returned
name_matches = pcp.get_compounds("Methyl paraben", 'name')
compound = name_matches[0]
compound

Compound(7456)

<a id="step2"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=4px>                   
                    <b>  Step 2: Convert Chemical into Molecular Structure
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Read the isomeric SMILES string of the compound looked up in the previous step.
# It is a text representation of the molecular structure; the bare name on the
# last line displays it.
molecular_structure = compound.isomeric_smiles
molecular_structure

'COC(=O)C1=CC=C(C=C1)O'

<a id="step3"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=4px>                   
                    <b>  Step 3: Convert Molecular Structure into Tokens
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Encode the SMILES string into a list of integer token ids. add_special_tokens=True
# asks the tokenizer to include its special tokens (the recorded output starts with
# id 0 and ends with id 2 — presumably the start/end markers; confirm against the
# tokenizer's vocabulary).
token = tokenizer.encode(molecular_structure, add_special_tokens=True)
token

[0, 307, 263, 51, 13, 39, 21, 33, 262, 33, 39, 12, 39, 33, 39, 21, 13, 51, 2]

<a id="step4"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=4px>                   
                    <b>  Step 4: Convert Tokens into Vectors
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Feed the token ids to the model as a batch of one, average the first output tensor
# over dim 1 (presumably the token axis), and convert the result to a numpy vector
batched_ids = torch.tensor(token).unsqueeze(0)
model_output = model(batched_ids)[0]
vector = model_output.mean(dim=1).squeeze().detach().numpy()
vector

array([-4.03191820e-02,  3.00257146e-01,  5.16170152e-02, -5.69022894e-01,
        5.42378247e-01, -4.39939708e-01, -3.37140471e-01, -2.16758102e-01,
       -4.74140882e-01, -2.34592438e-01,  7.51346469e-01, -7.21412718e-01,
        1.53637454e-01, -3.58899057e-01, -1.08973050e+00,  6.26484275e-01,
        5.45496106e-01,  6.62690520e-01,  6.62351489e-01,  1.38471246e-01,
       -2.58330911e-01, -1.59040451e-01,  8.99555385e-02, -1.79205224e-01,
       -7.14247227e-01,  8.85492742e-01,  3.39056314e-05,  9.09335837e-02,
        6.14295900e-02, -8.71350825e-01,  6.00385725e-01, -8.88444856e-02,
        5.35486937e-01, -1.20367542e-01, -3.09987277e-01, -1.51702881e-01,
       -6.88020647e-01, -6.21925592e-01, -6.12605929e-01,  1.48194432e-01,
       -4.62620497e-01, -2.73222208e-01,  5.80070436e-01,  3.61965895e-01,
       -4.68579859e-01,  5.86708546e-01,  6.63083255e-01, -5.31439602e-01,
        4.29931998e-01,  5.99781632e-01, -5.61444126e-02, -3.43613029e-01,
        8.94649208e-01, -

In [None]:
# Peek at the raw ingredients value of the product at positional index 1
products["ingredients"].iloc[1]

"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Ethylhexyl Methoxycinnamate, Ethylhexyl Salicylate, Butyl Methoxydibenzoylmethane, Benzyl Alcohol, Benzyl Benzoate, Benzyl Cinnamate, Cinnamal, Citral, Coumarin Limonene, Eugenol, Alpha-Isomethyl Ionone, Linalool.']"

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Product names and ingredients, restricted to rows whose ingredients value is not null.
# NOTE(review): the output stored under this cell shows per-column counts rather than
# a table, so it appears to come from an earlier version of the cell — re-run to refresh.
has_ingredients = products.ingredients.notnull()
products.loc[has_ingredients, ["product_name", "ingredients"]]

product_name    0
ingredients     0
dtype: int64

<a id="embeddings"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=5px>                   
                    <b>  Create Embeddings for Products
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Create a dataframe with product names and their respective ingredients
df = products[["product_name","ingredients"]][products.ingredients.notnull()]


def _parse_ingredient_names(raw_ingredients):
    """Split one raw `ingredients` cell into a list of clean chemical names.

    The dataset stores each cell as the string form of a Python list, e.g.
    "['Water, Glycerin', 'Parfum']". Splitting that text on commas directly leaves
    brackets, quotes and leading spaces attached to the names, so the list literal
    is parsed first and every comma-separated piece is whitespace-stripped.
    Empty pieces are dropped.
    """
    text = str(raw_ingredients)
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError):
        # Not a valid Python literal: treat the whole cell as plain text
        parsed = text
    pieces = parsed if isinstance(parsed, (list, tuple)) else [parsed]
    names = []
    for piece in pieces:
        names.extend(part.strip() for part in str(piece).split(","))
    return [name for name in names if name]


def _embed_smiles(smiles_string):
    """Tokenize one SMILES string and return the mean-pooled model output as a numpy vector."""
    token_ids = tokenizer.encode(smiles_string, add_special_tokens=True)
    # Inference only: no_grad avoids building an autograd graph for every molecule
    with torch.no_grad():
        pooled = model(torch.tensor(token_ids).unsqueeze(0))[0].mean(dim=1).squeeze()
    return pooled.detach().numpy()


# Ingredient name -> embedding vector (or None when the lookup gave nothing usable).
# The same ingredient occurs in many products, so each name is looked up and
# embedded only once.
embedding_cache = {}

# Ingredient name -> description of the error raised by its lookup. These are not
# cached, so a later occurrence of the same name is tried again.
lookup_failures = {}


def _embed_ingredient(name):
    """Return the embedding for one ingredient name, or None if it cannot be resolved."""
    if name in embedding_cache:
        return embedding_cache[name]
    try:
        # Get compound number of chemical and its molecular structure
        compound = pcp.get_compounds(name, 'name')[0]
        smiles_string = compound.isomeric_smiles
    except IndexError:
        # The lookup returned an empty result: there is no compound for this name
        smiles_string = None
    except Exception as error:
        # Stay best-effort (skip this ingredient) but keep the error visible instead of
        # silently discarding it; unlike a bare `except:`, this lets KeyboardInterrupt
        # stop the loop.
        lookup_failures[name] = repr(error)
        return None
    embedding_cache[name] = _embed_smiles(smiles_string) if smiles_string else None
    return embedding_cache[name]


# Create list to store vectors of all products (one list of ingredient vectors per product)
all_embeddings = []

# For loop to iterate over ingredients of each product
for ingredients in tqdm(df['ingredients']):
    ingredient_vectors = (_embed_ingredient(name) for name in _parse_ingredient_names(ingredients))
    # Keep only the ingredients that could be resolved and embedded
    all_embeddings.append([vec for vec in ingredient_vectors if vec is not None])

if lookup_failures:
    print(f"{len(lookup_failures)} ingredient lookups raised errors; inspect `lookup_failures` for details")

  0%|          | 0/7549 [00:00<?, ?it/s]

<a id="matrix"></a>
<table align="center" width=100%>
   <tr>
       <td>
           <div align="left">
               <font color="#21619" size=5px>                   
                    <b>  Similarity Matrix
                    </b>
                </font>
            </div>
        </td>
    </tr>
</table>

In [None]:
# Create similarity matrix for the products using cosine similarity
similarity_matrix = [[1 - cosine(embeddings1[0], embeddings2[0]) if len(embeddings1) > 0 and len(embeddings2) > 0 else 0 for embeddings2 in all_embeddings] for embeddings1 in tqdm(all_embeddings)]

# Convert the similarity matrix into a dataframe
similarity_df = pd.DataFrame(similarity_matrix, index=df['product_name'], columns=df['product_name'])

  0%|          | 0/7549 [00:00<?, ?it/s]

In [None]:
# Persist the product-to-product similarity table to "P-P similarity.csv" in the
# current working directory
similarity_df.to_csv("P-P similarity.csv")

In [None]:
# Dump the embeddings to a text file, one product per line
with open('product_embeddings.txt', 'w') as f:
    # Each line is the string form of one entry of all_embeddings
    f.writelines(f'{item}\n' for item in all_embeddings)

In [None]:
# Display the similarity table; the recorded output shows an abbreviated preview
# (elided rows and columns are rendered as "...")
similarity_df

product_name,Fragrance Discovery Set,La Habana Eau de Parfum,Rainbow Bar Eau de Parfum,Kasbah Eau de Parfum,Purple Haze Eau de Parfum,Kasbah Eau de Parfum Travel Spray,Purple Haze Eau de Parfum Travel Spray,Invisible Post Eau de Parfum,Capri Eau de Parfum,Invisible Post Eau de Parfum Travel Spray,...,L'Homme Le Parfum,Mascara Volume Effet Faux Cils Radical,Y Eau Fraiche,Black Opium Gift Set,Libre Eau de Parfum Gift Set,Couture Clutch Eyeshadow Palette,L'Homme Eau de Parfum,Mon Paris Eau de Parfum Gift Set,Y Eau de Parfum Gift Set,Candy Glaze Lip Gloss Stick Duo with Hyaluronic Acid
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Fragrance Discovery Set,1.000000,0.572612,0.586774,0.586774,0.779218,0.586774,0.779218,0.572612,1.000000,0.572612,...,0,0,0,0,0,0,0,0,0,0
La Habana Eau de Parfum,0.572612,1.000000,0.860464,0.860464,0.635673,0.860464,0.635673,1.000000,0.572612,1.000000,...,0,0,0,0,0,0,0,0,0,0
Rainbow Bar Eau de Parfum,0.586774,0.860464,1.000000,1.000000,0.682025,1.000000,0.682025,0.860464,0.586774,0.860464,...,0,0,0,0,0,0,0,0,0,0
Kasbah Eau de Parfum,0.586774,0.860464,1.000000,1.000000,0.682025,1.000000,0.682025,0.860464,0.586774,0.860464,...,0,0,0,0,0,0,0,0,0,0
Purple Haze Eau de Parfum,0.779218,0.635673,0.682025,0.682025,1.000000,0.682025,1.000000,0.635673,0.779218,0.635673,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Couture Clutch Eyeshadow Palette,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
L'Homme Eau de Parfum,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
Mon Paris Eau de Parfum Gift Set,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
Y Eau de Parfum Gift Set,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Show the name and ingredients of two specific products side by side
wanted_names = ["10 Day Results Kit", "GENIUS Liquid Collagen Serum"]
products.loc[products.product_name.isin(wanted_names), ["product_name", "ingredients"]]

Unnamed: 0,product_name,ingredients
90,GENIUS Liquid Collagen Serum,"['Collagen (Vegan)*, Water (Aqua, Eau), Propan..."
97,10 Day Results Kit,"['GENIUS Liquid Collagen:', 'Collagen (Vegan),..."


In [None]:
import numpy as np

# `all_embeddings` is a list with one inner list of vectors per product, and the inner
# lists can differ in length. Passing such a ragged nested list straight to np.save
# relies on implicit object-array creation, which is deprecated (see the warning in the
# recorded output) and raises ValueError on NumPy >= 1.24. Build the 1-D object array
# explicitly instead, with one slot per product.
embeddings_array = np.empty(len(all_embeddings), dtype=object)
for position, product_vectors in enumerate(all_embeddings):
    embeddings_array[position] = product_vectors

# Object arrays are stored via pickle; read the file back with
# np.load('my_file.npy', allow_pickle=True)
np.save('my_file.npy', embeddings_array, allow_pickle=True)

  arr = np.asanyarray(arr)
