## Libraries

In [None]:
import pandas as pd
import re

## Load the datasets

In [None]:
# Parse the workbook once and pull both sheets from the result: passing a list
# to sheet_name makes read_excel return a dict keyed by sheet name, which
# avoids opening and parsing the same file twice.
sheets = pd.read_excel('TestDataset.xlsx', sheet_name=['branded_foods', 'common_foods'])
branded_foods = sheets['branded_foods']
common_foods = sheets['common_foods']

In [None]:
# Preview the first rows of the branded foods sheet.
branded_foods.head()

In [None]:
# Show six randomly sampled rows of the common foods sheet.
common_foods.sample(6)

## Preprocessing both datasets

In [None]:
# Preprocessing: convert to lowercase and remove punctuation for better matching
def preprocess(text):
    """Normalize a cell value so it can be compared by substring matching.

    The value is lowercased and every character that is neither a word
    character nor whitespace is removed. Whitespace itself is left untouched,
    so removing punctuation between two spaces leaves a double space.

    Args:
        text: Raw cell value. Missing values (as reported by ``pd.isna``)
            become an empty string. Non-string scalars, such as numeric
            spreadsheet cells, are converted with ``str()`` first.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # Spreadsheet cells are not guaranteed to be strings; coerce rather than
    # fail with AttributeError on .lower().
    text = str(text).lower()
    return re.sub(r'[^\w\s]', '', text)  # remove punctuation

# Add a normalized "*_cleaned" companion for each text column used in matching.
for frame, source_col, cleaned_col in (
    (branded_foods, 'branded_name', 'branded_name_cleaned'),
    (common_foods, 'name', 'name_cleaned'),
    (common_foods, 'category', 'category_cleaned'),
):
    frame[cleaned_col] = frame[source_col].apply(preprocess)

In [None]:
# Show one randomly sampled row of branded_foods.
branded_foods.sample()

In [None]:
# Show one randomly sampled row of common_foods.
common_foods.sample()

## Matching branded foods to common foods

In [None]:

# Function to match branded food to common food
def match_common_food(branded_name, branded_category, foods=None):
    """Return the id of the first common food that matches a branded food.

    Matching is by substring and happens in two passes over ``foods``:

    1. by name: a row matches when its ``name_cleaned`` occurs inside
       ``branded_name``;
    2. by category, only if no name matched: a row matches when its
       ``category_cleaned`` occurs inside ``branded_category``.

    Rows whose cleaned value is empty are skipped, because the empty string
    is a substring of every string and would otherwise match everything.

    Args:
        branded_name: Cleaned (preprocessed) branded food name.
        branded_category: Cleaned (preprocessed) branded food category.
        foods: DataFrame with ``id``, ``name_cleaned`` and
            ``category_cleaned`` columns. Defaults to the module-level
            ``common_foods`` frame.

    Returns:
        The ``id`` of the first matching row, or ``None`` when nothing
        matches.
    """
    if foods is None:
        foods = common_foods

    # Name matches take priority over category matches, so run a complete
    # name pass before falling back to categories.
    passes = (
        ('name_cleaned', branded_name),
        ('category_cleaned', branded_category),
    )
    for column, haystack in passes:
        for common_id, needle in zip(foods['id'], foods[column]):
            # Guard against empty needles: "" in haystack is always True.
            if needle and needle in haystack:
                return common_id
    return None

# Look up a common food id for every branded food, row by row.
def _common_id_for_row(row):
    """Match one branded_foods row using its cleaned name and category."""
    cleaned_category = preprocess(row['branded_category'])
    return match_common_food(row['branded_name_cleaned'], cleaned_category)


branded_foods['common_id'] = branded_foods.apply(_common_id_for_row, axis=1)

In [None]:
# Display the branded_foods DataFrame.
branded_foods

## Output

In [None]:
# Write the branded foods table to CSV, leaving out the DataFrame index.
output_path = 'branded_foods_with_common_ids.csv'
branded_foods.to_csv(output_path, index=False)