In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from scipy.sparse import hstack

In [2]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
def extract_quantity(text):
    """Estimate the total quantity described by a catalog entry.

    Two optional pieces of information are searched for in ``str(text)``:

    * a ``Value: <number>`` field (matched case-sensitively), and
    * a ``pack of <n>`` phrase (matched case-insensitively).

    Parameters
    ----------
    text : Any
        Catalog content; it is converted with ``str()`` so ``None``/NaN are
        accepted and simply yield no matches.

    Returns
    -------
    float
        ``value * pack``, where each factor defaults to 1 when it is not found.
    """
    content = str(text)

    # Accept only a well-formed decimal ("12", "12.5", ".5"). A looser character
    # class such as [\d.]+ would also capture "." or "1.2.3", which float()
    # rejects with ValueError and would abort the whole DataFrame.apply().
    value_match = re.search(r"Value:\s*(\d+(?:\.\d+)?|\.\d+)", content)
    value = float(value_match.group(1)) if value_match else 1.0

    pack_match = re.search(r"pack of (\d+)", content.lower())
    pack = int(pack_match.group(1)) if pack_match else 1

    return value * pack

# Derive the numeric quantity feature for both tables; any missing result falls back to 1.
for frame in (train_df, test_df):
    frame['quantity'] = frame['catalog_content'].apply(extract_quantity).fillna(1)

In [4]:
def clean_text(text):
    """Normalise catalog text for vectorisation.

    The input is converted with ``str()``, lower-cased, newlines are turned
    into spaces, and every character that is not a lowercase ASCII letter,
    a digit, or whitespace is removed.
    """
    lowered = str(text).lower()
    single_line = lowered.replace('\n', ' ')
    return re.sub(r'[^a-z0-9\s]', '', single_line)

# Fill missing catalog text BEFORE cleaning: clean_text() stringifies its input, so a
# NaN would otherwise become the literal token "nan" and a fillna() applied after the
# cleaning step would never find anything to replace.
train_df['clean_text'] = train_df['catalog_content'].fillna("unknown").apply(clean_text)
test_df['clean_text'] = test_df['catalog_content'].fillna("unknown").apply(clean_text)

In [5]:
# TF-IDF over unigrams and bigrams, keeping at most 5000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is projected onto that same fitted vocabulary.
train_corpus = train_df['clean_text']
test_corpus = test_df['clean_text']
X_text_train = vectorizer.fit_transform(train_corpus)
X_text_test = vectorizer.transform(test_corpus)

In [6]:
# Quantity as a single float64 column, ready to be appended to the sparse text features.
quantity_train = train_df[['quantity']].values.astype(np.float64)
quantity_test = test_df[['quantity']].values.astype(np.float64)

# Final design matrices: TF-IDF columns followed by the quantity column.
X_train = hstack([X_text_train, quantity_train])
X_test = hstack([X_text_test, quantity_test])

# Regression target.
y_train = train_df['price'].values


In [7]:
# Train on log1p(price); the prediction cells map model output back with np.expm1.
y_train_log = np.log1p(y_train)

In [None]:
# Wrap the complete training matrix (no rows held out) for LightGBM.
lgb_full = lgb.Dataset(data=X_train, label=y_train_log)

# Gradient-boosted regression on the log target, evaluated with MAE.
params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.03,
    num_leaves=64,
    max_depth=10,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
    verbosity=-1,
)

# Fixed number of boosting rounds: there is no validation set here, so early
# stopping is not used.
# NOTE(review): no valid_sets are passed, so the log_evaluation callback
# presumably has nothing to report - confirm.
# NOTE(review): the train/validation cell further down assigns `model` again,
# so the later prediction cells use whichever training cell ran last.
full_fit_callbacks = [lgb.log_evaluation(period=50)]
model = lgb.train(
    params=params,
    train_set=lgb_full,
    num_boost_round=1000,
    callbacks=full_fit_callbacks,
)


In [None]:
# Hold out 20% of the rows (fixed seed) to monitor generalisation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train_log,
    test_size=0.2,
    random_state=42,
)

# The validation Dataset references the training Dataset.
lgb_train = lgb.Dataset(X_tr, label=y_tr)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Gradient-boosted regression on the log target, evaluated with MAE.
params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.03,
    num_leaves=64,
    max_depth=10,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
)

# Report metrics every 50 rounds; stop after 50 rounds without improvement.
# NOTE(review): in the recorded run the validation l1 was still decreasing at
# round 1000 and early stopping never triggered - consider a larger
# num_boost_round.
split_fit_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=split_fit_callbacks,
)
# .

Training until validation scores don't improve for 50 rounds
[50]	train's l1: 0.607029	val's l1: 0.627081
[100]	train's l1: 0.563022	val's l1: 0.591703
[150]	train's l1: 0.542059	val's l1: 0.577201
[200]	train's l1: 0.527177	val's l1: 0.568141
[250]	train's l1: 0.515783	val's l1: 0.561689
[300]	train's l1: 0.506548	val's l1: 0.556942
[350]	train's l1: 0.498453	val's l1: 0.55328
[400]	train's l1: 0.491778	val's l1: 0.55041
[450]	train's l1: 0.485233	val's l1: 0.54763
[500]	train's l1: 0.478729	val's l1: 0.545354
[550]	train's l1: 0.473399	val's l1: 0.543576
[600]	train's l1: 0.468024	val's l1: 0.541622
[650]	train's l1: 0.463595	val's l1: 0.539885
[700]	train's l1: 0.459156	val's l1: 0.538501
[750]	train's l1: 0.454909	val's l1: 0.537144
[800]	train's l1: 0.451079	val's l1: 0.536337
[850]	train's l1: 0.447357	val's l1: 0.535301
[900]	train's l1: 0.444165	val's l1: 0.534448
[950]	train's l1: 0.44115	val's l1: 0.533807
[1000]	train's l1: 0.43789	val's l1: 0.53294
Did not meet early stoppi

In [None]:
# Predict on the held-out split and undo the log1p transform of the target.
val_preds_log = model.predict(X_val)
val_preds = np.expm1(val_preds_log)
y_val_orig = np.expm1(y_val)

# Post-process validation predictions with the same bounds that the submission
# cell applies to the test predictions (floor 0.01, cap 1e6), so the metrics
# reported here describe the values that actually get submitted.
val_preds = np.clip(val_preds, 0.01, 1e6)
# The true prices are only capped to guard against overflow.
y_val_orig = np.clip(y_val_orig, 0, 1e6)

# MAE in the original price units.
val_mae = mean_absolute_error(y_val_orig, val_preds)
print(f"Validation MAE: {val_mae:.4f}")

# SMAPE
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``; the result
    is 100 times the mean of those terms.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values with matching shapes.

    Returns
    -------
    float
        SMAPE in the range [0, 200]. Terms where both the true value and the
        prediction are 0 count as zero error instead of producing 0/0 = NaN,
        which would otherwise turn the whole mean into NaN.
    """
    actual = np.atleast_1d(np.asarray(y_true, dtype=np.float64))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=np.float64))

    numerator = 2.0 * np.abs(predicted - actual)
    denominator = np.abs(actual) + np.abs(predicted)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with (a perfect prediction of zero).
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(terms)

# Percentage error on the held-out split, in the original price units.
val_smape = smape(y_true=y_val_orig, y_pred=val_preds)
print(f"Validation SMAPE: {val_smape:.4f}%")

In [None]:
# Predict in log space, map back to prices with expm1, then bound the result
# to [0.01, 1e6] so every price is positive and finite.
preds_log = model.predict(X_test)
preds = np.clip(np.expm1(preds_log), 0.01, 1e6)

# Submission table: one row per test sample, price rounded to 2 decimals.
submission = test_df[['sample_id']].copy()
submission['price'] = np.round(preds, 2)

submission.to_csv('test_out5.csv', index=False)