In [None]:
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:



# ----------------------------
# 5. Combine Features (Text + Quantity)
# ----------------------------
# NOTE(review): X_text_train, X_text_test, train_df and test_df are only created
# in cells further down this notebook, so this cell fails under Restart & Run All.
# The quantity column is appended to the right of the TF-IDF matrix.
X_train = hstack((X_text_train, train_df[['quantity']].astype(np.float64).values))
X_test = hstack((X_text_test, test_df[['quantity']].astype(np.float64).values))
y_train = train_df['price'].values

# ----------------------------
# 6. Log-transform target
# ----------------------------
# The model is fitted on log1p(price); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)

# ----------------------------
# 7. Train-validation split
# ----------------------------
# Hold out 20% of the rows for validation (fixed seed for a reproducible split)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train_log,
    test_size=0.2,
    random_state=42,
)

lgb_train = lgb.Dataset(data=X_tr, label=y_tr)
lgb_val = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

# The labels are log1p(price), so the 'mae' metric is measured in log space
params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=64,
    max_depth=10,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
)

# `early_stopping_rounds` and `verbose_eval` are no longer accepted by lgb.train()
# (removed in LightGBM 4.0); early stopping and periodic logging are configured
# through callbacks instead. Stop after 50 rounds without validation improvement
# and log the metrics every 50 rounds.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

# ----------------------------
# 8. Validation Metrics
# ----------------------------
val_preds_log = model.predict(X_val)

# Undo the log1p transform so the metrics are in price units, then
# clip to [0, 1e6] to avoid overflow
val_preds = np.clip(np.expm1(val_preds_log), 0, 1e6)
y_val_orig = np.clip(np.expm1(y_val), 0, 1e6)

# MAE
val_mae = mean_absolute_error(y_true=y_val_orig, y_pred=val_preds)
print(f"Validation MAE: {val_mae:.4f}")

# SMAPE
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``, where a
        term whose denominator is 0 contributes 0 to the mean.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    abs_diff = 2.0 * np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    # A zero denominator means truth and prediction are both 0, i.e. a perfect
    # prediction. Count it as 0 error instead of letting 0/0 make the mean NaN.
    ratio = np.divide(abs_diff, scale, out=np.zeros_like(scale), where=scale != 0)
    return 100 * np.mean(ratio)

# Report SMAPE on the back-transformed (price-scale) validation values
val_smape = smape(y_true=y_val_orig, y_pred=val_preds)
print(f"Validation SMAPE: {val_smape:.4f}%")

# ----------------------------
# 9. Predict on Test Set
# ----------------------------
preds_log = model.predict(X_test)
# Back to price units; the 0.01 floor keeps every price positive and 1e6 caps overflow
preds = np.clip(np.expm1(preds_log), 0.01, 1e6)

# ----------------------------
# 10. Create Submission CSV
# ----------------------------
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': preds.round(2),
})

submission.to_csv('test_out2.csv', index=False)
print("✅ test_out2.csv generated successfully!")


In [4]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from scipy.sparse import hstack

In [5]:
# Load the training and test tables from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [6]:
def extract_quantity(text):
    """Estimate the total quantity described by a catalog entry.

    Multiplies the number following ``Value:`` (case-sensitive) by the count in
    a ``pack of N`` phrase (case-insensitive). Either factor defaults to 1 when
    it is not found.

    Args:
        text: Catalog text; any object is accepted and converted with ``str()``.

    Returns:
        float: ``value * pack``.
    """
    content = str(text)

    # Match one well-formed decimal number only. The looser ``[\d.]+`` also
    # captured trailing sentence punctuation ("Value: 12.5.") or a bare ".",
    # which made float() raise ValueError.
    value_match = re.search(r"Value:\s*(\d+(?:\.\d*)?|\.\d+)", content)
    value = float(value_match.group(1)) if value_match else 1.0

    pack_match = re.search(r"pack of (\d+)", content.lower())
    pack = int(pack_match.group(1)) if pack_match else 1

    return value * pack

# Derive the numeric quantity feature from the raw catalog text of each split
train_df['quantity'] = (
    train_df['catalog_content']
    .apply(func=extract_quantity)
    .fillna(value=1)
)
test_df['quantity'] = (
    test_df['catalog_content']
    .apply(func=extract_quantity)
    .fillna(value=1)
)

In [7]:
def clean_text(text):
    """Normalise catalog text for vectorisation.

    The input is converted with ``str()``, lower-cased, newlines are replaced by
    spaces, and every character other than a-z, 0-9 or whitespace is removed.

    Args:
        text: Raw text; any object is accepted.

    Returns:
        str: The cleaned text.
    """
    lowered = str(text).lower()
    single_line = lowered.replace('\n', ' ')
    return re.sub(r'[^a-z0-9\s]', '', single_line)

# Fill missing catalog text *before* cleaning: clean_text() stringifies its input,
# so a NaN would otherwise become the token "nan" and a trailing fillna never fires.
train_df['clean_text'] = train_df['catalog_content'].fillna("unknown").apply(clean_text)
test_df['clean_text'] = test_df['catalog_content'].fillna("unknown").apply(clean_text)

In [8]:
# Unigram + bigram TF-IDF limited to 5000 features; the vectorizer is fitted on
# the training text only and then reused to transform the test text.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_text_train = vectorizer.fit_transform(raw_documents=train_df['clean_text'])
X_text_test = vectorizer.transform(raw_documents=test_df['clean_text'])

In [9]:
# Append the quantity column to the right of the TF-IDF matrix
X_train = hstack((X_text_train, train_df[['quantity']].astype(np.float64).values))
X_test = hstack((X_text_test, test_df[['quantity']].astype(np.float64).values))
y_train = train_df['price'].values


In [10]:
# Train on log1p(price); predictions are mapped back with np.expm1 downstream.
y_train_log = np.log1p(y_train)

In [12]:
# 7. Train-validation split
# ----------------------------
# Hold out 20% of the rows for validation (fixed seed for a reproducible split)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train_log,
    test_size=0.2,
    random_state=42,
)

lgb_train = lgb.Dataset(data=X_tr, label=y_tr)
lgb_val = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

# The labels are log1p(price), so the 'mae' metric is measured in log space
params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=64,
    max_depth=10,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
)

# Boost for up to 1000 rounds, stopping once the validation metric has not
# improved for 50 rounds; metrics are logged every 50 rounds.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)
# .


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.539940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 893465
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 5001
[LightGBM] [Info] Start training from score 2.740904
Training until validation scores don't improve for 50 rounds
[50]	train's l1: 0.573407	val's l1: 0.599631
[100]	train's l1: 0.537627	val's l1: 0.574169
[150]	train's l1: 0.516186	val's l1: 0.562289
[200]	train's l1: 0.501354	val's l1: 0.555162
[250]	train's l1: 0.489069	val's l1: 0.549707
[300]	train's l1: 0.479196	val's l1: 0.546083
[350]	train's l1: 0.470975	val's l1: 0.543235
[400]	train's l1: 0.463536	val's l1: 0.540766
[450]	train's l1: 0.455982	val's l1: 0.538391
[500]	train's l1: 0.449585	val's l1: 0.536809
[550]	train's l1: 0.444076	val's l1: 0.535647
[600]	train's l1: 0.438502	val's l1: 0.534115
[650]	train's l1: 0.433239	val's l1: 0.532

In [13]:
val_preds_log = model.predict(X_val)

# Undo the log1p transform so the metrics are in price units, then
# clip to [0, 1e6] to avoid overflow
val_preds = np.clip(np.expm1(val_preds_log), 0, 1e6)
y_val_orig = np.clip(np.expm1(y_val), 0, 1e6)

# MAE
val_mae = mean_absolute_error(y_true=y_val_orig, y_pred=val_preds)
print(f"Validation MAE: {val_mae:.4f}")

# SMAPE
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``, where a
        term whose denominator is 0 contributes 0 to the mean.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    abs_diff = 2.0 * np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    # A zero denominator means truth and prediction are both 0, i.e. a perfect
    # prediction. Count it as 0 error instead of letting 0/0 make the mean NaN.
    ratio = np.divide(abs_diff, scale, out=np.zeros_like(scale), where=scale != 0)
    return 100 * np.mean(ratio)

# Report SMAPE on the back-transformed (price-scale) validation values
val_smape = smape(y_true=y_val_orig, y_pred=val_preds)
print(f"Validation SMAPE: {val_smape:.4f}%")

Validation MAE: 11.7966
Validation SMAPE: 53.0249%




Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.8.0-cp39-cp39-win_amd64.whl (241.2 MB)
     -------------------------------------- 241.2/241.2 MB 1.0 MB/s eta 0:00:00
Collecting sympy>=1.13.3
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ---------------------------------------- 6.3/6.3 MB 1.2 MB/s eta 0:00:00
Installing collected packages: sympy, torch
Successfully installed sympy-1.14.0 torch-2.8.0


In [14]:
preds_log = model.predict(X_test)
# Back to price units; the 0.01 floor keeps every price positive and 1e6 caps overflow
preds = np.clip(np.expm1(preds_log), 0.01, 1e6)

# ----------------------------
# 10. Create Submission CSV
# ----------------------------
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': preds.round(2),
})

submission.to_csv('test_out2.csv', index=False)
print("✅ test_out2.csv generated successfully!")




✅ test_out2.csv generated successfully!
