In [None]:
!pip install clean-text



In [None]:
import pandas as pd
import pandas as pd
from cleantext import clean
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
from tqdm import tqdm  # For progress bar




When we received the data, we analyzed it and concluded that it needed to be cleaned, sorted, and otherwise prepared before we could work with it. The provided Python code aims to identify and handle duplicate entries in a dataset of chemical names. The primary steps of the code are as follows:

Data Loading and Preprocessing:
•	The code begins by loading a dataset from an Excel file (kungul_ingridients_new.xlsx). Before loading, the file had already been checked for duplicate values in Excel and reduced to the columns needed for the analysis.


In [None]:
# Load the ingredient table. Per the project notes, the workbook was already
# checked for duplicates in Excel and trimmed of unneeded columns beforehand.
dataset_path = 'kungul_ingridients_new.xlsx'
df = pd.read_excel(dataset_path)

# Last expression of the cell: renders the DataFrame in the notebook.
df

Unnamed: 0,IngredientIdentifier,Name,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
0,G00001,Saccharomyces/Leuconostoc/Apple Fruit/Carrot R...,False,False,False,False
1,G00002,Lactobacillus/Centella Asiatica/Gleditsia Sine...,False,False,False,False
2,G00003,Bacillus/Cordyceps Sinensis/Ganoderma Lucidum/...,False,False,False,False
3,G00004,Ziziphus Spina-Christi Leaf,False,False,False,False
4,G00005,Zingiber Officinale Water,False,False,False,False
...,...,...,...,...,...,...
14695,G09915,Lysine Carboxymethyl Cysteinate,False,False,False,False
14696,G09916,Lysine Thiazolidine Carboxylate,False,False,False,False
14697,G09917,Palmitoyl Myristyl Serinate,False,False,False,False
14698,G09918,Piperonyl Glucoside,False,False,False,False


Next on our list was the Ingredient Check:
•	The script identifies specific ingredients of interest by searching for them within the dataset. This is useful for tracking and analyzing the presence of particular substances.


In [None]:
# Ingredients to look up in the DataFrame. These names were collected from
# online "black lists" of cosmetic ingredients.
ingredients_to_check = [
    "Titanium Dioxide (inhalable form)",
    "Silica (inhalable form)",
    "Retinol (Vitamin A)",
    "Butylated Hydroxytoluene (BHT)",
    "Propylparaben",
    "Butyl Acetate",
    "Ethyl Acetate",
    "Methylparaben",
    "Benzophenone-1",
    "Carbon Black",
    "Butylphenyl Methylpropional",
    "Diazolidinyl Urea",
    "n-Butyl Alcohol",
    "Benzyl Salicylate",
    "DMDM Hydantoin",
    "Triethanolamine (TEA)",
    "Cyclotetrasiloxane",
    "Ethylhexyl Methoxycinnamate",
    "Butylparaben",
    "Dioctyl Adipate",
    "Sodium Hydroxymethylglycinate",
    "Benzophenone-3 or Oxybenzone",
    "Ethylparaben",
    "Cyclopentasiloxane",
    "Imidazolidinyl Urea",
    "Perfluorononyl Dimethicone",
    "Polyacrylamide",
    "Triphenyl Phosphate (TPP)",
    "p-Phenylenediamine",
    "Ethylhexyl Dimethyl PABA",
    "Homosalate",
    "FD&C Red n4",
    "Sodium Borate",
    "Resorcinol",
    "Polytetrafluoroethylene (PTFE)",
    "Naphtha-Petroleum",
    "Benzaldehyde",
    "Butylated Hydroxyanisole (BHA)",
    "Chromium Oxide",
    "Didecyldimonium Chloride",
    "Isoeugenol",
    "Benzophenone-2",
    "Methyl Methacrylate",
    "Propanol",
    "Perfluorooctyl Triethoxysilane",
    "Acetaldehyde",
    "Hydroquinone",
    "Triclosan",
    "Cocamide DEA",
    "Quaternium-15",
    "Tert-Butyl Alcohol (TBA)"
]

# Select the rows whose 'Name' exactly matches one of the listed ingredients
is_listed = df['Name'].isin(ingredients_to_check)
found_ingredients = df.loc[is_listed]

# Show the matching rows
print("Found Ingredients:")
print(found_ingredients)

Found Ingredients:
      IngredientIdentifier                            Name  Carcinogens  \
1654                G01659                   Ethyl Acetate        False   
2101                G02107                   Butyl Acetate        False   
3529                G05319      Perfluorononyl Dimethicone        False   
3530                G05320  Perfluorooctyl Triethoxysilane        False   
3574                G05365                  Polyacrylamide        False   
4307                G06311                    Carbon Black        False   
5077                G07257             Retinol (Vitamin A)        False   
6389                O03179             Methyl Methacrylate        False   
6702                R03230                    Butylparaben        False   
6703                R03231     Butylphenyl Methylpropional        False   
6717                R03245              Cyclopentasiloxane        False   
6718                R03246              Cyclotetrasiloxane        False   
6719  

Add New Ingredients:
•	After identifying ingredients already present in the dataset, the code adds new ingredients, if any, along with associated information such as 'Carcinogens', 'EndocrineDisruptors', 'Allergen', and 'SkinIrritant'


In [None]:
# Work out which listed ingredients are not yet present in the DataFrame.
# The set is built once so each membership test is a hash lookup instead of
# a fresh scan over the whole 'Name' column.
existing_names = set(df['Name'])
new_ingredients = [ingredient for ingredient in ingredients_to_check if ingredient not in existing_names]

# Append the missing ingredients to the DataFrame
if new_ingredients:
    new_data = {
        # NOTE(review): these are plain integers, unlike the 'G00001'-style
        # codes of the loaded rows -- confirm the intended identifier scheme.
        'IngredientIdentifier': range(len(df), len(df) + len(new_ingredients)),
        'Name': new_ingredients,
        # Real booleans rather than the strings 'True'/'False': string flags
        # never compare equal to True, so the added rows would be missed by
        # the `== True` counts further down and would turn the flag columns
        # into mixed-type object columns.
        # Every ingredient added here is flagged as a carcinogen only.
        'Carcinogens': True,
        'EndocrineDisruptors': False,
        'Allergen': False,
        'SkinIrritant': False
    }

    new_data_df = pd.DataFrame(new_data)

    # ignore_index=True renumbers the rows 0..n-1 after the append
    df = pd.concat([df, new_data_df], ignore_index=True)

# Display the updated DataFrame
print(df)

df.head()

      IngredientIdentifier                                               Name  \
0                   G00001  Saccharomyces/Leuconostoc/Apple Fruit/Carrot R...   
1                   G00002  Lactobacillus/Centella Asiatica/Gleditsia Sine...   
2                   G00003  Bacillus/Cordyceps Sinensis/Ganoderma Lucidum/...   
3                   G00004                        Ziziphus Spina-Christi Leaf   
4                   G00005                          Zingiber Officinale Water   
...                    ...                                                ...   
14716                14716                                     Chromium Oxide   
14717                14717                           Didecyldimonium Chloride   
14718                14718                                           Propanol   
14719                14719                                       Cocamide DEA   
14720                14720                           Tert-Butyl Alcohol (TBA)   

      Carcinogens Endocrine

Unnamed: 0,IngredientIdentifier,Name,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
0,G00001,Saccharomyces/Leuconostoc/Apple Fruit/Carrot R...,False,False,False,False
1,G00002,Lactobacillus/Centella Asiatica/Gleditsia Sine...,False,False,False,False
2,G00003,Bacillus/Cordyceps Sinensis/Ganoderma Lucidum/...,False,False,False,False
3,G00004,Ziziphus Spina-Christi Leaf,False,False,False,False
4,G00005,Zingiber Officinale Water,False,False,False,False


Text Cleaning:
•	The 'Name' column, containing ingredient names, undergoes a cleaning process. Unwanted characters, symbols, or strings are removed, and the text is converted to lowercase. This ensures uniformity and consistency in the representation of ingredient names.


In [None]:
# Drop rows without a name first. Casting NaN to str would turn it into the
# literal text 'nan', which no later missing-value check can detect.
df = df.dropna(subset=['Name']).copy()

# Make sure every name is a string before any .str operation; .str methods
# return NaN for non-string cells.
df['Name'] = df['Name'].astype(str)

# Placeholder for stripping a project-specific substring. regex=False forces a
# literal match regardless of the installed pandas version's default.
df['Name'] = df['Name'].str.replace('unwanted_string', '', regex=False)

# Normalise each name with clean-text: lower-case it and strip punctuation.
# NOTE(review): the notebook output shows punctuation being deleted without a
# separating space ('spina-christi' -> 'spinachristi', 'a/b' -> 'ab') --
# confirm that merging tokens like this is intended.
df['Name'] = df['Name'].apply(lambda x: clean(x, no_punct=True, lower=True))

# Remove rows whose name is empty after cleaning. .copy() detaches the result
# from the filtered selection so later column assignments are unambiguous.
df = df[df['Name'] != ''].copy()

Handling Missing Values:
•	Rows with missing values in the 'Name' column are dropped. This step ensures data integrity and removes instances where essential information is not available.


In [None]:
# Handle missing values: rows with no 'Name' are removed rather than imputed.
# NOTE(review): this only has an effect while 'Name' still holds real NaN
# values at this point in the notebook.
df = df.dropna(subset=['Name'])

Counting True Values:
•	The script counts the number of rows where specific columns ('Carcinogens', 'EndocrineDisruptors', 'Allergen', 'SkinIrritant') have the value 'True'. This provides insights into the prevalence of different characteristics among ingredients.


In [None]:
# Count the rows whose 'Carcinogens' cell equals True
is_carcinogen = df['Carcinogens'].eq(True)
count_true_carcinogens = int(is_carcinogen.sum())
print(count_true_carcinogens)

50


In [None]:
# Count the rows whose 'EndocrineDisruptors' cell equals True
is_endocrine_disruptor = df['EndocrineDisruptors'].eq(True)
count_true_endocrines = int(is_endocrine_disruptor.sum())
print(count_true_endocrines)

149


In [None]:
# Count the rows whose 'Allergen' cell equals True
is_allergen = df['Allergen'].eq(True)
count_true_allergen = int(is_allergen.sum())
print(count_true_allergen)

50


In [None]:
# Count the rows whose 'SkinIrritant' cell equals True
is_skin_irritant = df['SkinIrritant'].eq(True)
count_true_skinirritant = int(is_skin_irritant.sum())
print(count_true_skinirritant)

126


Word Embeddings with Word2Vec:
•	The code utilizes Word2Vec, a powerful technique for word embeddings, to find synonyms for each ingredient name. A pre-trained Word2Vec model from Google News is loaded, and synonyms are determined for each ingredient. The results are stored in a new column called 'Synonyms'.


Word2vec is not a single algorithm; rather, it is a family of model architectures and optimizations that can be used to learn word embeddings from large datasets. Embeddings learned through word2vec have proven successful on a variety of downstream natural language processing tasks.

In [None]:
# Ask gensim's downloader for the pre-trained Google News vectors.
# return_path=True requests the file path instead of a loaded model.
word2vec_model_name = "word2vec-google-news-300"
word2vec_path = api.load(word2vec_model_name, return_path=True)

In [None]:
# Ingredient names to look up, taken from the 'Name' column in row order
names = df['Name'].tolist()

# Read the pre-trained Google News Word2Vec vectors from the binary file
# located at word2vec_path
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Look up the nearest Word2Vec neighbours of a word and format them as text
def find_synonyms(word, model, topn=5):
    """Return the `topn` most similar terms to `word` as a ', '-joined string.

    `model` must provide `most_similar(word, topn=...)` returning
    (term, score) pairs. If the lookup raises KeyError (the word is not
    known to the model), an empty string is returned.
    """
    try:
        neighbours = model.most_similar(word, topn=topn)
    except KeyError:
        return ''
    # Only the terms are kept; the similarity scores are discarded.
    return ', '.join(term for term, _score in neighbours)

In [None]:
# Look up synonyms for every ingredient name (with a progress bar) and store
# them in a new 'Synonyms' column
name_progress = tqdm(names, desc="Finding synonyms")
synonyms = [find_synonyms(ingredient_name, word2vec_model) for ingredient_name in name_progress]
df['Synonyms'] = synonyms

Finding synonyms: 100%|██████████| 14721/14721 [04:49<00:00, 50.76it/s]


In [None]:
# Render the DataFrame, now including the 'Synonyms' column, in the notebook
df

Unnamed: 0,IngredientIdentifier,Name,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant,Synonyms
0,G00001,saccharomycesleuconostocapple fruitcarrot root...,False,False,False,False,
1,G00002,lactobacilluscentella asiaticagleditsia sinens...,False,False,False,False,
2,G00003,bacilluscordyceps sinensisganoderma lucidumino...,False,False,False,False,
3,G00004,ziziphus spinachristi leaf,False,False,False,False,
4,G00005,zingiber officinale water,False,False,False,False,
...,...,...,...,...,...,...,...
14716,14716,chromium oxide,True,False,False,False,
14717,14717,didecyldimonium chloride,True,False,False,False,
14718,14718,propanol,True,False,False,False,"hydroxyphenyl, isopropyl, propyl, methoxy, glu..."
14719,14719,cocamide dea,True,False,False,False,


Exporting Data:
•	The final processed DataFrame is saved to a CSV file (df_TIM1_FINAL.csv). This allows for easy storage, sharing, and future analysis of the enriched ingredient dataset.


In [None]:
# Write the processed DataFrame to CSV without the row index
output_csv_path = '/content/df_TIM1_FINAL.csv'
df.to_csv(output_csv_path, index=False)

In [None]:
# Count the rows (ingredients) for which at least one synonym was found,
# i.e. 'Synonyms' is neither missing nor an empty string
has_synonyms = df['Synonyms'].notna() & df['Synonyms'].ne('')
non_empty_synonyms_count = df.loc[has_synonyms, 'Synonyms'].count()
print(non_empty_synonyms_count)

502


So far, the script outputs the count of non-empty synonyms, a summary statistic indicating the extent to which synonyms were successfully identified.
In summary, this code not only ensures the cleanliness and integrity of the ingredient dataset but also enriches it with additional information such as synonyms using advanced natural language processing techniques like Word2Vec embeddings. This can enhance the dataset's utility for various downstream applications, such as similarity analysis or recommendation systems.
