In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import pickle

In [None]:
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, expressed in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``;
    the result is the mean of those ratios multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values with matching (or broadcastable) shapes.
        Lists and pandas Series are accepted as well as numpy arrays.

    Returns
    -------
    float
        SMAPE in percent. Pairs where both values are exactly zero count as
        zero error instead of producing NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # The denominator is zero only when both values are zero, in which case the
    # numerator is zero too; leave those entries at 0 rather than computing 0/0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [None]:
print("=" * 80)
print("LOADING DATA")
print("=" * 80)

# Read train.csv, falling back to progressively more tolerant parsers when the
# previous one fails. Every fallback reports why the previous attempt failed, and
# a missing file is raised immediately instead of being retried four times.
try:
    # First try: Standard read
    train_df = pd.read_csv('train.csv')
except FileNotFoundError:
    # No parser setting can recover from a missing file.
    raise
except Exception as standard_err:
    print(f"Standard read failed: {standard_err}")
    print("Trying alternative parsing methods...\n")

    try:
        # Second try: Python engine with quote handling
        train_df = pd.read_csv('train.csv',
                               engine='python',
                               on_bad_lines='skip',
                               quoting=3,  # QUOTE_NONE
                               sep='\t')
        # If tab-separated didn't work, try comma
        if train_df.shape[1] == 1:
            train_df = pd.read_csv('train.csv',
                                   engine='python',
                                   on_bad_lines='skip')
    except Exception as python_engine_err:
        print(f"Python-engine read failed: {python_engine_err}")
        try:
            # Third try: Most robust - skip bad lines and warn
            train_df = pd.read_csv('train.csv',
                                   on_bad_lines='warn',
                                   engine='python',
                                   quotechar='"',
                                   escapechar='\\')
        except Exception as quoted_err:
            print(f"Quote-aware read failed: {quoted_err}")
            # Final try: Read line by line
            print("Using line-by-line reading (slowest but most robust)...")
            import csv

            rows = []
            skipped_rows = 0
            # newline='' lets the csv module handle line endings itself, which
            # it needs in order to parse quoted fields containing newlines.
            with open('train.csv', 'r', encoding='utf-8', errors='ignore', newline='') as f:
                reader = csv.reader(f, quotechar='"', escapechar='\\')
                header = next(reader)

                for row in reader:
                    # Keep only rows whose field count matches the header.
                    if len(row) == len(header):
                        rows.append(row)
                    else:
                        skipped_rows += 1

            if skipped_rows:
                print(f"Skipped {skipped_rows} malformed rows")

            train_df = pd.DataFrame(rows, columns=header)
            # Convert price to numeric (csv.reader yields strings only)
            train_df['price'] = pd.to_numeric(train_df['price'], errors='coerce')

print(f"\n✓ Data loaded successfully!")
print(f"Shape: {train_df.shape}")
print(f"Price range: ${train_df['price'].min():.2f} - ${train_df['price'].max():.2f}")
print(f"Median price: ${train_df['price'].median():.2f}")

LOADING DATA

✓ Data loaded successfully!
Shape: (75000, 4)
Price range: $0.13 - $2796.00
Median price: $14.00


In [None]:
print("\n" + "=" * 80)
print("FEATURE ENGINEERING")
print("=" * 80)

def extract_features(df):
    """Derive numeric, text and keyword features from ``catalog_content``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``catalog_content`` column of strings; missing values are
        treated as empty text. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns appended, plus two helper
        columns (``unit`` and ``item_name``) that hold extracted strings.
    """

    df = df.copy()
    df['catalog_content'] = df['catalog_content'].fillna('')
    # Lower-cased once and reused by every keyword lookup below.
    content_lower = df['catalog_content'].str.lower()

    # ========== CORE PRICING FEATURES (MOST IMPORTANT) ==========

    # Extract Value
    def extract_value(text):
        """Number following 'Value:'; 1.0 when the field is absent."""
        match = re.search(r'Value:\s*(\d+\.?\d*)', str(text))
        return float(match.group(1)) if match else 1.0

    df['value'] = df['catalog_content'].apply(extract_value)

    # Extract Unit
    def extract_unit(text):
        """Map the text following 'Unit:' onto a small set of unit codes."""
        match = re.search(r'Unit:\s*(.+?)(?:\n|$)', str(text))
        if not match:
            return 'unknown'

        unit = match.group(1).strip().lower()
        # One-letter abbreviations are only accepted as whole words; a plain
        # substring test would turn "bottle" into liters and "bag" into grams.
        tokens = set(re.findall(r'[a-z]+', unit))

        # Standardize units
        if 'fl oz' in unit or 'fluid ounce' in unit:
            return 'fl_oz'
        if 'oz' in unit or 'ounce' in unit:
            return 'oz'
        if 'lb' in unit or 'pound' in unit:
            return 'lb'
        if 'count' in unit or 'pack' in unit:
            return 'count'
        if 'ml' in unit or 'milliliter' in unit:
            return 'ml'
        # Kilograms must be tested before grams/liters: "kg" and "kilogram"
        # contain both "g" and "l".
        if 'kg' in unit or 'kilo' in unit:
            return 'kg'
        # Milligrams have no conversion below; keep them out of the gram bucket.
        if 'mg' in tokens or 'milligram' in unit:
            return 'other'
        if 'l' in tokens or 'liter' in unit or 'litre' in unit:
            return 'liter'
        if 'g' in tokens or 'gram' in unit:
            return 'gram'
        return 'other'

    df['unit'] = df['catalog_content'].apply(extract_unit)

    # Extract Pack Quantity (CRITICAL)
    def extract_pack_qty(text):
        """First pack-size phrase with a quantity in 1..500; 1 when none is found."""
        text = str(text).lower()
        patterns = [
            r'pack of (\d+)', r'\(pack of (\d+)\)', r'(\d+)\s*pack',
            r'case of (\d+)', r'(\d+)\s*per case', r'box of (\d+)',
            # (?!\w) also accepts "12 count" at the very end of the text.
            r'set of (\d+)', r'(\d+)\s*count(?!\w)'
        ]
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                qty = int(match.group(1))
                if 1 <= qty <= 500:
                    return qty
        return 1

    df['pack_qty'] = df['catalog_content'].apply(extract_pack_qty)

    # ========== KEY CALCULATED FEATURES ==========

    # Value per pack and log-scaled variants (pack_qty is always >= 1)
    df['value_per_pack'] = df['value'] / df['pack_qty']
    df['log_value'] = np.log1p(df['value'])
    df['log_pack_qty'] = np.log1p(df['pack_qty'])
    df['log_value_per_pack'] = np.log1p(df['value_per_pack'])

    # Standardize value to ounces for comparison. Units without an entry
    # ('count', 'other', 'unknown') keep their raw value (factor 1).
    oz_per_unit = {
        'fl_oz': 1.0,
        'oz': 1.0,
        'lb': 16,
        'gram': 0.035274,
        'kg': 35.274,
        'ml': 0.033814,
        'liter': 33.814,
    }

    df['value_in_oz'] = df['value'] * df['unit'].map(oz_per_unit).fillna(1.0)
    df['log_value_in_oz'] = np.log1p(df['value_in_oz'])
    df['value_in_oz_per_pack'] = df['value_in_oz'] / df['pack_qty']
    df['log_value_in_oz_per_pack'] = np.log1p(df['value_in_oz_per_pack'])

    # ========== TEXT FEATURES ==========

    df['text_length'] = df['catalog_content'].str.len()
    df['word_count'] = df['catalog_content'].str.split().str.len()
    df['num_bullet_points'] = df['catalog_content'].str.count('Bullet Point')
    df['has_product_desc'] = df['catalog_content'].str.contains('Product Description').astype(int)

    # Extract item name
    def extract_item_name(text):
        """Text after 'Item Name:' up to the next field marker or line break."""
        match = re.search(r'Item Name:\s*(.+?)(?:\n|Bullet Point|Value:|$)', str(text), re.IGNORECASE)
        return match.group(1).strip() if match else ''

    df['item_name'] = df['catalog_content'].apply(extract_item_name)
    df['item_name_length'] = df['item_name'].str.len()
    df['item_name_words'] = df['item_name'].str.split().str.len()

    # ========== PREMIUM/QUALITY INDICATORS ==========

    premium_keywords = {
        'organic': 3.0, 'non-gmo': 2.5, 'natural': 2.0, 'premium': 2.5,
        'gourmet': 3.0, 'artisan': 3.5, 'gluten free': 1.5, 'kosher': 1.5,
        'vegan': 1.5, 'certified': 2.0, 'pure': 1.8, 'authentic': 1.5,
        'fresh': 1.3, 'finest': 2.0, 'deluxe': 2.5, 'professional': 2.8
    }

    def calculate_premium_score(text):
        """Sum of keyword weight times number of occurrences in the text."""
        text = str(text).lower()
        return sum(weight * text.count(keyword)
                   for keyword, weight in premium_keywords.items())

    df['premium_score'] = df['catalog_content'].apply(calculate_premium_score)
    df['log_premium_score'] = np.log1p(df['premium_score'])

    # Specific premium flags. The two-word labels accept a hyphen, a space or
    # nothing between the words so "gluten free" and "gluten-free" both count.
    flag_patterns = {
        'organic': r'organic',
        'non_gmo': r'non[\s-]?gmo',
        'gluten_free': r'gluten[\s-]?free',
        'kosher': r'kosher',
        'vegan': r'vegan',
    }
    for kw, pattern in flag_patterns.items():
        df[f'has_{kw}'] = content_lower.str.contains(pattern, regex=True).astype(int)

    # ========== CATEGORY DETECTION ==========

    # NOTE: plain substring matching, so short keywords also fire inside longer
    # words (e.g. 'can' inside 'candy').
    categories = {
        'beverage': ['drink', 'beverage', 'juice', 'water', 'soda', 'coffee', 'tea'],
        'supplement': ['vitamin', 'supplement', 'protein', 'nutrition', 'powder'],
        'condiment': ['sauce', 'seasoning', 'spice', 'dressing', 'marinade', 'oil', 'vinegar'],
        'snack': ['chip', 'cookie', 'candy', 'chocolate', 'bar', 'cracker', 'popcorn'],
        'canned': ['canned', 'can', 'jar', 'beans', 'soup'],
        'baking': ['flour', 'sugar', 'baking', 'mix'],
        'cereal': ['cereal', 'granola', 'oat', 'flake']
    }

    for cat, keywords in categories.items():
        df[f'cat_{cat}'] = content_lower.apply(
            lambda x: int(any(kw in x for kw in keywords))
        )

    # ========== NUMBER EXTRACTION ==========

    def extract_numbers(text):
        """All numeric literals in the text that fall strictly between 0 and 100000."""
        parsed = (float(tok) for tok in re.findall(r'\d+\.?\d*', str(text)))
        return [num for num in parsed if 0 < num < 100000]

    df['all_numbers'] = df['catalog_content'].apply(extract_numbers)
    df['num_count'] = df['all_numbers'].apply(len)
    df['max_number'] = df['all_numbers'].apply(lambda x: max(x) if x else 0)
    df['min_number'] = df['all_numbers'].apply(lambda x: min(x) if x else 0)
    df['mean_number'] = df['all_numbers'].apply(lambda x: np.mean(x) if x else 0)
    df['median_number'] = df['all_numbers'].apply(lambda x: np.median(x) if x else 0)

    # ========== BRAND/MANUFACTURER INDICATORS ==========

    # Well-known value brands
    value_brands = ['amazon', 'kirkland', 'great value', 'equate']
    df['is_value_brand'] = content_lower.apply(
        lambda x: int(any(brand in x for brand in value_brands))
    )

    # ========== INTERACTION FEATURES ==========

    df['value_x_premium'] = df['value'] * df['premium_score']
    df['pack_x_premium'] = df['pack_qty'] * df['premium_score']
    df['value_per_pack_x_premium'] = df['value_per_pack'] * df['premium_score']
    df['bullets_per_pack'] = df['num_bullet_points'] / (df['pack_qty'] + 1)
    df['text_per_value'] = df['text_length'] / (df['value'] + 1)

    # The per-row lists were only needed to compute the summary statistics.
    df = df.drop(columns='all_numbers')

    return df

train_df = extract_features(train_df)

# Select features: every column except identifiers, raw text, the target and the
# string helper columns. 'unit_encoded' is excluded here and appended below, so
# re-running this cell (when train_df already carries that column) does not list
# it twice and duplicate the column in the feature matrix.
feature_cols = [col for col in train_df.columns
                if col not in ['sample_id', 'catalog_content', 'image_link', 'price',
                               'item_name', 'unit', 'unit_encoded']]

# Encode unit as integer labels
le_unit = LabelEncoder()
train_df['unit_encoded'] = le_unit.fit_transform(train_df['unit'])
feature_cols.append('unit_encoded')

print(f"Total features: {len(feature_cols)}")


FEATURE ENGINEERING
Total features: 42


In [None]:
print("\n" + "=" * 80)
print("TEXT VECTORIZATION")
print("=" * 80)

# Combine item name and full text into one document per row.
# item_name was itself extracted from catalog_content, so when present it
# appears twice in the document.
text_data = train_df['item_name'].fillna('') + ' ' + train_df['catalog_content'].fillna('')

# TF-IDF over word 1- to 3-grams with a capped vocabulary.
# NOTE(review): the vectorizer (and the SVD below) are fitted on every row of
# train_df, i.e. before the KFold split in the training cell, so validation-fold
# text contributes to the vocabulary and components.
tfidf = TfidfVectorizer(
    max_features=2000,       # vocabulary size cap
    ngram_range=(1, 3),      # unigrams, bigrams and trigrams
    min_df=3,                # drop terms seen in fewer than 3 documents
    max_df=0.9,              # drop terms seen in more than 90% of documents
    sublinear_tf=True,       # use 1 + log(tf) instead of raw term frequency
    strip_accents='unicode',
    lowercase=True
)

tfidf_matrix = tfidf.fit_transform(text_data)
print(f"TF-IDF shape: {tfidf_matrix.shape}")

# SVD reduction: project the TF-IDF matrix onto n_svd dense components
n_svd = 100
svd = TruncatedSVD(n_components=n_svd, random_state=42)
tfidf_svd = svd.fit_transform(tfidf_matrix)
print(f"SVD variance explained: {svd.explained_variance_ratio_.sum():.3f}")

# Default RangeIndex; column names are tfidf_0 .. tfidf_{n_svd - 1}
tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'tfidf_{i}' for i in range(n_svd)])

# ============================================================================
# 6. PREPARE DATA
# ============================================================================

# Hand-crafted features, with any missing values replaced by 0
X_basic = train_df[feature_cols].fillna(0)
# reset_index aligns X_basic with tfidf_df's RangeIndex so the column-wise concat
# pairs rows by position
X_combined = pd.concat([X_basic.reset_index(drop=True), tfidf_df], axis=1)
y = train_df['price'].values

# Log transform target (log(1 + price)); the training cell inverts it with expm1
y_log = np.log1p(y)

print(f"\nFinal feature matrix: {X_combined.shape}")
print(f"Total features: {X_combined.shape[1]}")


TEXT VECTORIZATION
TF-IDF shape: (75000, 2000)
SVD variance explained: 0.397

Final feature matrix: (75000, 142)
Total features: 142


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
import lightgbm as lgb

# --- SMAPE Function ---
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-8)`` element-wise
    and returns 100 times the mean. The 1e-8 term keeps the division defined
    when both values are zero.

    Note: this re-definition replaces the ``smape`` defined earlier in the
    notebook once this cell has run.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred) + 1e-8
    per_item = 2 * abs_error / scale
    return 100 * np.mean(per_item)

print("\n" + "=" * 80)
print("TRAINING LIGHTGBM (Target SMAPE < 40)")
print("=" * 80)

# --- Tuned LightGBM Parameters ---
# The model is fitted on the log1p-transformed price (y_log), so the 'mae'
# metric reported during training is an absolute error in log space.
# NOTE(review): the recorded fold-1 log is still improving at iteration 7750, so
# the 8000-round cap may be reached before early stopping triggers — confirm.
lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 180,            # more leaves → higher model complexity
    'learning_rate': 0.008,       # smaller → more precise fitting
    'n_estimators': 8000,         # more boosting rounds with early stop
    'feature_fraction': 0.85,     # use more features per split
    'bagging_fraction': 0.85,     # larger sample usage
    'bagging_freq': 3,
    'min_child_samples': 10,
    'min_child_weight': 0.0005,
    'reg_alpha': 0.05,
    'reg_lambda': 0.05,
    'max_depth': -1,
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1
}

# --- KFold Setup ---
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_scores = []                        # per-fold SMAPE on the original price scale
oof_predictions = np.zeros(len(y))      # out-of-fold predictions, original price scale
models = []                             # one (model, scaler) pair per fold

print("\nTraining folds...")

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_combined), 1):
    print(f"\nFold {fold}/{n_splits}")

    X_train, X_val = X_combined.iloc[train_idx], X_combined.iloc[val_idx]
    y_train, y_val = y_log[train_idx], y_log[val_idx]
    # Untransformed prices of the validation rows, used for the SMAPE score
    y_val_original = y[val_idx]

    # --- Scaling ---
    # Fitted on the training fold only, then applied to the validation fold
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # --- Model ---
    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported SMAPE may be slightly optimistic.
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        X_train_scaled, y_train,
        eval_set=[(X_val_scaled, y_val)],
        eval_metric='mae',
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=250)
        ]
    )

    # --- Predictions ---
    # Predict in log space, invert the log1p transform, and floor at 0.01
    y_pred_log = model.predict(X_val_scaled, num_iteration=model.best_iteration_)
    y_pred = np.expm1(y_pred_log)
    y_pred = np.maximum(y_pred, 0.01)

    oof_predictions[val_idx] = y_pred

    fold_smape = smape(y_val_original, y_pred)
    fold_scores.append(fold_smape)

    print(f"  ✅ SMAPE: {fold_smape:.4f}%")
    print(f"  📉 Best iteration: {model.best_iteration_}")

    # Keep the scaler with its model: each model expects features scaled by the
    # scaler fitted on its own training fold
    models.append((model, scaler))

mean_smape = np.mean(fold_scores)
std_smape = np.std(fold_scores)

print("\n" + "=" * 80)
print("FINAL CROSS-VALIDATION RESULTS")
print("=" * 80)
print(f"\nFold scores: {[f'{s:.4f}%' for s in fold_scores]}")
print(f"\nMean SMAPE: {mean_smape:.4f}% (+/- {std_smape:.4f}%)")
# SMAPE over all rows at once, using each row's out-of-fold prediction
print(f"OOF SMAPE: {smape(y, oof_predictions):.4f}%")



TRAINING LIGHTGBM (Target SMAPE < 40)

Training folds...

Fold 1/5
[250]	valid_0's l1: 0.591009
[500]	valid_0's l1: 0.561013
[750]	valid_0's l1: 0.548947
[1000]	valid_0's l1: 0.542128
[1250]	valid_0's l1: 0.538016
[1500]	valid_0's l1: 0.534729
[1750]	valid_0's l1: 0.532286
[2000]	valid_0's l1: 0.530135
[2250]	valid_0's l1: 0.528214
[2500]	valid_0's l1: 0.526543
[2750]	valid_0's l1: 0.525055
[3000]	valid_0's l1: 0.523654
[3250]	valid_0's l1: 0.522555
[3500]	valid_0's l1: 0.521541
[3750]	valid_0's l1: 0.520595
[4000]	valid_0's l1: 0.519852
[4250]	valid_0's l1: 0.519141
[4500]	valid_0's l1: 0.518495
[4750]	valid_0's l1: 0.517922
[5000]	valid_0's l1: 0.517436
[5250]	valid_0's l1: 0.517024
[5500]	valid_0's l1: 0.516643
[5750]	valid_0's l1: 0.51625
[6000]	valid_0's l1: 0.515867
[6250]	valid_0's l1: 0.515507
[6500]	valid_0's l1: 0.515276
[6750]	valid_0's l1: 0.514968
[7000]	valid_0's l1: 0.514732
[7250]	valid_0's l1: 0.514497
[7500]	valid_0's l1: 0.514259
[7750]	valid_0's l1: 0.514099
[8000]