In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA
from datetime import datetime

# Load data
train_df = pd.read_csv("data/train.csv", sep=";")
test_df = pd.read_csv("data/test.csv", sep=";")

# Remove unnamed columns from test
test_df = test_df.loc[:, ~test_df.columns.str.contains("^Unnamed")]

# Feature engineering function
def engineer_features(df):
    df = df.copy()
    
    # Temporal features
    df['phase_in_dt'] = pd.to_datetime(df['phase_in'], format='%d/%m/%Y', errors='coerce')
    df['phase_out_dt'] = pd.to_datetime(df['phase_out'], format='%d/%m/%Y', errors='coerce')
    df['phase_in_month'] = df['phase_in_dt'].dt.month
    df['phase_in_dayofyear'] = df['phase_in_dt'].dt.dayofyear
    df['phase_out_month'] = df['phase_out_dt'].dt.month
    
    # Seasons
    df['launch_winter'] = df['phase_in_month'].isin([12, 1, 2]).astype(int)
    df['launch_spring'] = df['phase_in_month'].isin([3, 4, 5]).astype(int)
    df['launch_summer'] = df['phase_in_month'].isin([6, 7, 8]).astype(int)
    df['launch_fall'] = df['phase_in_month'].isin([9, 10, 11]).astype(int)
    
    # Color features
    def parse_rgb(rgb_str):
        if pd.isna(rgb_str) or rgb_str == '':
            return [128, 128, 128]
        try:
            return [int(x) for x in str(rgb_str).split(',')]
        except:
            return [128, 128, 128]
    
    rgb_values = df['color_rgb'].apply(parse_rgb)
    df['color_r'] = rgb_values.apply(lambda x: x[0])
    df['color_g'] = rgb_values.apply(lambda x: x[1])
    df['color_b'] = rgb_values.apply(lambda x: x[2])
    df['color_brightness'] = (df['color_r'] + df['color_g'] + df['color_b']) / 3
    df['color_saturation'] = df[['color_r', 'color_g', 'color_b']].std(axis=1)
    df['is_dark_color'] = (df['color_brightness'] < 100).astype(int)
    
    # Drop original date and color columns
    df = df.drop(columns=['phase_in', 'phase_out', 'color_rgb', 
                          'phase_in_dt', 'phase_out_dt'], errors='ignore')
    
    return df

# Apply feature engineering
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Handle image embeddings with PCA
def parse_embeddings(emb_str):
    if pd.isna(emb_str) or emb_str == '':
        return np.zeros(512)
    try:
        return np.array([float(x) for x in str(emb_str).split(',')])
    except:
        return np.zeros(512)

# Extract embeddings
train_embeddings = np.vstack(train_df['image_embedding'].apply(parse_embeddings))
test_embeddings = np.vstack(test_df['image_embedding'].apply(parse_embeddings))

# Apply PCA
pca = PCA(n_components=30)
train_pca = pca.fit_transform(train_embeddings)
test_pca = pca.transform(test_embeddings)

# Add PCA features
for i in range(30):
    train_df[f'img_pca_{i}'] = train_pca[:, i]
    test_df[f'img_pca_{i}'] = test_pca[:, i]

# Remove unwanted columns
cols_to_drop = ["image_embedding", "num_stores", "num_sizes", "weekly_demand", "ID"]
train_df = train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns])

# Identify categorical features
categorical_cols = train_df.select_dtypes(include=["object"]).columns.tolist()
categorical_cols = [c for c in categorical_cols if c != "Production"]

# Prepare training data
X_train = train_df.drop(columns=["Production"])
y_train = train_df["Production"]
X_train = X_train.fillna(0)

# Train CatBoost model with optimized parameters
model = CatBoostRegressor(
    iterations=800,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=5,
    loss_function="RMSE",
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, cat_features=categorical_cols)

# Prepare test data
test_ids = test_df["ID"]
test_df = test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns])

# Align columns
for col in X_train.columns:
    if col not in test_df.columns:
        test_df[col] = 0

X_test = test_df[X_train.columns].fillna(0)

# Predict with slight adjustment
preds = model.predict(X_test)

# Apply 1.08x multiplier to reduce underprediction penalty
preds = preds * 1.08

# Create submission
output_df = pd.DataFrame({
    "ID": test_ids,
    "Production": preds.astype(int)
})

output_df.to_csv("submissions/submission_catboost_v1.csv", index=False)
print("✅ Submission created: submissions/submission_catboost_v1.csv")
print(f"Predictions range: {preds.min():.0f} to {preds.max():.0f}")
print(f"Mean prediction: {preds.mean():.0f}")

0:	learn: 34034.4170314	total: 236ms	remaining: 3m 8s
100:	learn: 13617.7428909	total: 10.4s	remaining: 1m 12s
200:	learn: 11980.1771744	total: 19.9s	remaining: 59.4s
300:	learn: 11040.9799847	total: 29.2s	remaining: 48.5s
400:	learn: 10293.6869826	total: 38.8s	remaining: 38.7s
500:	learn: 9702.3405543	total: 48.4s	remaining: 28.9s
600:	learn: 9160.0305464	total: 58.1s	remaining: 19.2s
700:	learn: 8691.3953763	total: 1m 7s	remaining: 9.54s
799:	learn: 8316.6899172	total: 1m 17s	remaining: 0us
✅ Submission created: submissions/submission_catboost_v1.csv
Predictions range: -2814 to 173024
Mean prediction: 15698
