In [9]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split


In [10]:
train_data = pd.read_csv(r"D:\amazon_dataset\student_resource\dataset\train.csv")

In [11]:
train_data.head()


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [12]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sample_id        75000 non-null  int64  
 1   catalog_content  75000 non-null  object 
 2   image_link       75000 non-null  object 
 3   price            75000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB


In [13]:
df = train_data


  ## Steps
  #### Extraction
  Extracts the valuable information from catalog_content: Item Name, Description, Bullet Points, Value and Quantity.
  
  #### Preprocessing steps:
  1. All lower case characters
  2. URL removal
  3. Multiple dots to single dot
  4. Extra spaces to single space
  5. Removes non-alphanumerical chars

  #### Build sentence
   from all the information from catalog content, build sentences for all the products that acts as sentence of the products to input into further steps. Helpful in fine-tuning CLIP type models.

  #### Remove repeated phrases


In [14]:
def preprocess_text(Sentence):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    Sentence = Sentence.lower()
    Sentence = re.sub(url_pattern, "", Sentence)
    Sentence = re.sub(r"\.{2,}", ".", Sentence)
    Sentence = re.sub(r"\s+", " ", Sentence).strip()
    Sentence = re.sub(r"[^a-zA-Z0-9\s]", "", Sentence)
    return Sentence

In [15]:
# seperate the bullet points, item id, value, quantity and descriptions,

In [16]:
def extract_key_value_pairs(text):
    pattern = r'([A-Za-z0-9 +*#\'"–”“\-]+?):\s*(.*?)(?=\n[A-Za-z0-9 +*#\'"–”“\-]+?:|$)'
    if not isinstance(text, str):
        return {}
    matches = re.findall(pattern, text, flags=re.DOTALL)
    return {k.strip(): v.strip() for k, v in matches}


df["extracted"] = df["catalog_content"].apply(extract_key_value_pairs)


In [17]:
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,extracted
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,{'Item Name': 'La Victoria Green Taco Sauce Mi...
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,"{'Item Name': 'Salerno Cookies, The Original B..."
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,"{'Item Name': 'Bear Creek Hearty Soup Bowl, Cr..."
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,{'Item Name': 'Judee’s Blue Cheese Powder 11.2...
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,"{'Item Name': 'kedem Sherry Cooking Wine, 12.7..."


In [None]:
# combine all the descriptions, bullet points and id.

In [18]:
df = df.join(pd.json_normalize(df["extracted"]))
df.drop(columns=["extracted"], inplace=True)
df.head()
bullet_cols = [col for col in df.columns if re.match(r"(?i)bullet point", col)]

df["Bullet_Points"] = df[bullet_cols].apply(
    lambda row: "\n".join(
        str(v).strip()
        for v in row
        if pd.notnull(v) and str(v).strip() != ""
    )
    if any(pd.notnull(v) and str(v).strip() != "" for v in row)
    else np.nan,
    axis=1
)
df["Bullet_Points"] = (
    df["Bullet_Points"]
    .str.replace(r"(?i)Bullet Point\s*\d*[:\-]*\s*", "", regex=True)
    .str.strip()
)

df.drop(columns=bullet_cols, inplace=True)
item_cols = [col for col in df.columns if re.match(r"(?i)item name", col)]

df["Item_Name"] = df[item_cols].apply(
    lambda row: "\n".join(
        str(v).strip()
        for v in row
        if pd.notnull(v) and str(v).strip() != ""
    )
    if any(pd.notnull(v) and str(v).strip() != "" for v in row)
    else np.nan,
    axis=1
)
df["Item_Name"] = (
    df["Item_Name"]
    .str.replace(r"(?i)Item Name\s*\d*[:\-]*\s*", "", regex=True)
    .str.strip()
)

df.drop(columns=item_cols, inplace=True)


desc_cols = [col for col in df.columns if re.match(r"(?i)product description", col)]

df["ProductDesc"] = df[desc_cols].apply(
    lambda row: "\n".join(
        str(v).strip()
        for v in row
        if pd.notnull(v) and str(v).strip() != ""
    )
    if any(pd.notnull(v) and str(v).strip() != "" for v in row)
    else np.nan,
    axis=1
)
df["ProductDesc"] = (
    df["ProductDesc"]
    .str.replace(r"(?i)product description\s*\d*[:\-]*\s*", "", regex=True)
    .str.strip()
)

df.drop(columns=desc_cols, inplace=True)

In [19]:
# remove the gibberishs and preprocess the texts.

In [20]:
df = df.map(
    lambda x: re.sub(r"[^\x00-\x7F]+", " ", x).strip() if isinstance(x, str) else x
)

df["Item_Name"] = df["Item_Name"].astype(str).apply(preprocess_text)
df["Bullet_Points"] = df["Bullet_Points"].astype(str).apply(preprocess_text)
df["ProductDesc"] = df["ProductDesc"].astype(str).apply(preprocess_text)


In [21]:
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,Value,Unit,Bullet_Points,Item_Name,ProductDesc
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,72.0,Fl Oz,,la victoria green taco sauce mild 12 ounce pac...,
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,32.0,Ounce,original butter cookies classic butter cookies...,salerno cookies the original butter cookies 8 ...,
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,11.4,Ounce,loaded with hearty long grain wild rice and ve...,bear creek hearty soup bowl creamy chicken wit...,
3,55858,Item Name: Judee s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,11.25,Ounce,add to your favorite appetizers dips spreads ...,judee s blue cheese powder 1125 oz glutenfree...,judees powdered blue cheese cheddar cheese pow...
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,12.0,Count,kedem sherry cooking wine 127 ounce 12 per case,kedem sherry cooking wine 127 ounce 12 per case,


In [22]:
# form a complete sentence to give input to embed it using siglip and also remove resundant sentences. 

In [23]:
def build_sentence(row):
    def clean(x):
        if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "null", ""]:
            return ""
        return str(x).strip()
    
    name = clean(row.get("Item_Name", ""))
    desc = clean(row.get("ProductDesc", ""))
    bullets = clean(row.get("Bullet_Points", ""))
    value = clean(row.get("Value", ""))
    unit = clean(row.get("Unit", ""))

    if bullets:
        bullets = re.sub(r'[•;|]+', '\n', bullets) 
    if desc:
        desc = re.sub(r'[•;|]+', '\n', desc)

    parts = []
    if name:
        parts.append(name)
    if desc:
        parts.append(desc)
    if bullets:
        parts.append(bullets)
    if value or unit:
        parts.append(f"{value} {unit}".strip())
    
    text = "\n".join([p for p in parts if p])
    
    text = re.sub(r'\n{2,}', '\n', text).strip()
    
    return text


In [24]:
df["text"] = df.apply(build_sentence, axis=1)


In [25]:
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,Value,Unit,Bullet_Points,Item_Name,ProductDesc,text
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,72.0,Fl Oz,,la victoria green taco sauce mild 12 ounce pac...,,la victoria green taco sauce mild 12 ounce pac...
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,32.0,Ounce,original butter cookies classic butter cookies...,salerno cookies the original butter cookies 8 ...,,salerno cookies the original butter cookies 8 ...
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,11.4,Ounce,loaded with hearty long grain wild rice and ve...,bear creek hearty soup bowl creamy chicken wit...,,bear creek hearty soup bowl creamy chicken wit...
3,55858,Item Name: Judee s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,11.25,Ounce,add to your favorite appetizers dips spreads ...,judee s blue cheese powder 1125 oz glutenfree...,judees powdered blue cheese cheddar cheese pow...,judee s blue cheese powder 1125 oz glutenfree...
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,12.0,Count,kedem sherry cooking wine 127 ounce 12 per case,kedem sherry cooking wine 127 ounce 12 per case,,kedem sherry cooking wine 127 ounce 12 per ca...


In [26]:
def remove_repeated_phrases(text):
    if pd.isna(text):
        return text

    parts = re.split(r',|\n| {2,}', text.strip())

    seen = set()
    unique_parts = []
    for part in parts:
        p = part.strip()
        if p and p not in seen:
            seen.add(p)
            unique_parts.append(p)
    cleaned_text = '. '.join(unique_parts)
    if cleaned_text and not cleaned_text.endswith('.'):
        cleaned_text += '.'

    return cleaned_text

In [27]:
df['cleaned_text'] = df['text'].apply(remove_repeated_phrases)


In [None]:
df.to_csv("cleaned_sample.csv", index=False)


In [29]:
# save to train and validation.

In [None]:
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)

train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)