In [43]:
import numpy as np
import pandas as pd
import re
from scipy.stats import skew

* Dataset Link -> https://www.kaggle.com/datasets/raghavdharwal/amazon-ml-challenge-2025

In [44]:
train = pd.read_csv("/kaggle/input/amazon-ml-challenge-2025/student_resource/dataset/train.csv")

In [45]:
train.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judeeâ€™s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [46]:
del train["sample_id"]

In [47]:
train.head()

Unnamed: 0,catalog_content,image_link,price
0,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,Item Name: Judeeâ€™s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [48]:
train.catalog_content[0]

'Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)\nValue: 72.0\nUnit: Fl Oz\n'

In [49]:
train.catalog_content[1]

'Item Name: Salerno Cookies, The Original Butter Cookies, 8 Ounce (Pack of 4)\nBullet Point 1: Original Butter Cookies: Classic butter cookies made with real butter\nBullet Point 2: Variety Pack: Includes 4 boxes with 32 cookies total\nBullet Point 3: Occasion Perfect: Delicious cookies for birthdays, weddings, anniversaries\nBullet Point 4: Shareable Treats: Fun to give and enjoy with friends and family\nBullet Point 5: Salerno Brand: Trusted brand of delicious butter cookies since 1925\nValue: 32.0\nUnit: Ounce\n'

In [50]:
train.shape

(75000, 3)

In [51]:
print("Item Name:-\n",train["catalog_content"].str.contains("Item Name").value_counts())
print("_"*45)
print("Bullet Point:-\n",train["catalog_content"].str.contains("Bullet Point 1").value_counts())
print("_"*45)
print("Value:-\n",train["catalog_content"].str.contains("Value").value_counts())
print("_"*45)
print("Unit:-\n",train["catalog_content"].str.contains("Unit").value_counts())

Item Name:-
 catalog_content
True    75000
Name: count, dtype: int64
_____________________________________________
Bullet Point:-
 catalog_content
True     54477
False    20523
Name: count, dtype: int64
_____________________________________________
Value:-
 catalog_content
True    75000
Name: count, dtype: int64
_____________________________________________
Unit:-
 catalog_content
True    75000
Name: count, dtype: int64


In [52]:
train["has_item_name"] = train["catalog_content"].str.contains("Item Name", na=False)
train["has_value"] = train["catalog_content"].str.contains("Value", na=False)
train["has_unit"] = train["catalog_content"].str.contains("Unit", na=False)


In [53]:
missing_any = train[
    (~train["has_item_name"]) |
    (~train["has_value"]) |
    (~train["has_unit"])
]
print("Rows with at least one missing field:", missing_any.shape[0])


Rows with at least one missing field: 0


In [54]:
def single_extraction(pattern, text):
    match = re.search(pattern, text)
    return match.group(1).strip() if match else np.nan

def extract_bullets(text):
    return re.findall(r"Bullet Point \d+:\s*(.*)", text)

def cleaning_text(train, text_col="catalog_content"):
    df = train.copy()

    df["Item Name"] = df[text_col].apply(
        lambda x: single_extraction(r"Item Name:\s*(.*)",x)
    )

    df["Value"] = df[text_col].apply(
        lambda x: single_extraction(r"Value:\s*([\d\.]+)", x)
    ).astype(float)

    df["unit"] = df[text_col].apply(
        lambda x: single_extraction(r"Unit:\s*(.*)", x)
    )

    df["bullets"] = df[text_col].apply(extract_bullets)

    df["num_bullets"] = df["bullets"].apply(len)

    for i in range(1, 6):
        df[f"bullet_{i}"] = df["bullets"].apply(
            lambda x: x[i-1] if len(x) >= i else np.nan
        )

    bullet_cols = [f"bullet_{i}" for i in range(1, 6)]

    df["combined_text"] = (
        df["Item Name"].fillna("") + " " +
        df[bullet_cols].fillna("").agg(" ".join, axis=1) + " " +
        df["unit"].fillna("")
    ).str.strip()
    df.drop(columns=["bullets"], inplace=True)

    return df
    

In [55]:
train = cleaning_text(train)


In [56]:
train["catalog_content"][0]

'Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)\nValue: 72.0\nUnit: Fl Oz\n'

In [57]:
train["combined_text"][0]

'La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)      Fl Oz'

In [58]:
print("Missing Values(Combined Text):-/n",train["combined_text"].isnull().sum())
print("Missing Values(Value):-/n",train["Value"].isnull().sum())
print("Missing Values(Price):-/n",train["price"].isnull().sum())

Missing Values(Combined Text):-/n 0
Missing Values(Value):-/n 940
Missing Values(Price):-/n 0


In [59]:
train["Value"] = train["Value"].fillna(train["Value"].median())

In [60]:
print("Missing Values(Combined Text):-/n",train["combined_text"].isnull().sum())
print("Missing Values(Value):-/n",train["Value"].isnull().sum())
print("Missing Values(Price):-/n",train["price"].isnull().sum())

Missing Values(Combined Text):-/n 0
Missing Values(Value):-/n 0
Missing Values(Price):-/n 0


In [61]:
print("Skewness of Price Column", train["price"].skew())
print("Kurtosis of price column", train["price"].kurtosis())

Skewness of Price Column 13.601388975432753
Kurtosis of price column 736.6545083222634


-> it is right skewed and having higher kurtosis

-> checking for which part is giving more skewness and kurtosis.

In [62]:
print("Price Distribution")
print("percentiles:")
for p in [5,10,25,50,75,90,95,99,99.5,99.9]:
    val = train["price"].quantile(p/100)
    print(f"  {p:5.1f}th: ${val:8.2f}")

Price Distribution
percentiles:
    5.0th: $    2.44
   10.0th: $    3.57
   25.0th: $    6.79
   50.0th: $   14.00
   75.0th: $   28.62
   90.0th: $   52.30
   95.0th: $   75.71
   99.0th: $  145.25
   99.5th: $  183.70
   99.9th: $  322.12


#### price Distribution by $200 bins

In [63]:
bin_size = 200
max_price = train['price'].max()
num_bins = int(np.ceil(max_price / bin_size))

bins = [i * bin_size for i in range(num_bins + 1)]
labels = [f"${i*bin_size}-${(i+1)*bin_size}" for i in range(num_bins)]
train['price_bin'] = pd.cut(train['price'], bins=bins, labels=labels, include_lowest=True, right=False)
bin_counts = train['price_bin'].value_counts().sort_index()

print(f"ðŸ“Š Price Distribution by ${bin_size} Bins:")
print(f"Price range: ${train['price'].min():.2f} - ${train['price'].max():.2f}\n")

print(f"{'Price Range':<25} {'Count':<15} {'Percentage':<15} {'Cumulative %':<15}")
print("=" * 70)
cumulative = 0
for bin_label, count in bin_counts.items():
    cumulative += count
    percentage = (count / len(train)) * 100
    cum_percentage = (cumulative / len(train)) * 100
    print(f"{str(bin_label):<25} {count:<15,} {percentage:>6.2f}%{'':<8} {cum_percentage:>6.2f}%")

print("\n" + "=" * 70)
print(f"{'TOTAL':<25} {len(train):<15,} {'100.00%':<15} {'100.00%':<15}")


ðŸ“Š Price Distribution by $200 Bins:
Price range: $0.13 - $2796.00

Price Range               Count           Percentage      Cumulative %   
$0-$200                   74,702           99.60%          99.60%
$200-$400                 258               0.34%          99.95%
$400-$600                 29                0.04%          99.99%
$600-$800                 6                 0.01%          99.99%
$800-$1000                1                 0.00%          99.99%
$1000-$1200               2                 0.00%         100.00%
$1200-$1400               1                 0.00%         100.00%
$1400-$1600               0                 0.00%         100.00%
$1600-$1800               0                 0.00%         100.00%
$1800-$2000               0                 0.00%         100.00%
$2000-$2200               0                 0.00%         100.00%
$2200-$2400               0                 0.00%         100.00%
$2400-$2600               0                 0.00%         100.00%

* Most of the price range is between $0 - $200 which is 99.60% of data.
* Only 0.4% data is above $200.

* So, we are going to clip the data rather than remove higher price data.
* 99.6% of prices are below $200, while only 0.4% exceed this range.Instead of removing these rare high-price samples, we apply clipping to cap extreme values, preserving data while reducing the effect of outliers.


In [64]:
upper_limit = 200
train["price_clipped"] = train["price"].clip(upper=upper_limit)

* After clipping the extreme price values, we perform a log transformation on the target variable. This reduces the impact of skewness, compresses large values, and makes the distribution more suitable for machine learning models. As a result, the model becomes more stable during training and can still produce reasonable predictions for higher-priced products in real-world scenarios.

In [65]:
train["price_log"] = np.log1p(train["price_clipped"])

In [66]:
print("Skewness and Kurtosis Before transformation:-")
print("skewness:",train["price"].skew())
print("Kurtosis:",train["price"].kurtosis())
print("_"*50)
print("Skewness and Kurtosis After transformation:-")
print("skewness:",train["price_log"].skew())
print("Kurtosis:",train["price_log"].kurtosis())

Skewness and Kurtosis Before transformation:-
skewness: 13.601388975432753
Kurtosis: 736.6545083222634
__________________________________________________
Skewness and Kurtosis After transformation:-
skewness: 0.1673750354577014
Kurtosis: -0.38384081084833


* After applying clipping and log transformation, the target distribution became nearly symmetric with a skewness of 0.167 and a kurtosis of âˆ’0.38, indicating a stable distribution with minimal outlier influence. This transformation makes the data more suitable for machine learning models and improves training stability.

In [67]:
train.head(1)

Unnamed: 0,catalog_content,image_link,price,has_item_name,has_value,has_unit,Item Name,Value,unit,num_bullets,bullet_1,bullet_2,bullet_3,bullet_4,bullet_5,combined_text,price_bin,price_clipped,price_log
0,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,True,True,True,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",72.0,Fl Oz,0,,,,,,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",$0-$200,4.89,1.773256


In [68]:
train.to_csv("train.csv",index=False)