# Initial Setup

## Loading Packages

In [1]:
%load_ext cudf.pandas
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from concurrent.futures import ThreadPoolExecutor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# This notebook assumes that train.csv and test.csv have already been downloaded into the working directory

# Train Model & Create Submission

In [6]:
%%time
def process_data_improved(train_df, test_df, smoothing=100, min_count=2):
    """
    Process data with high-cardinality categorical features.

    The numeric column 'magical' is median-imputed and clipped to the
    1.5 * IQR fences; the categorical columns 'trickortreat' and
    'kingofhalloween' are replaced by smoothed target-mean encodings of 'y'.
    Every statistic (median, quartiles, global mean, per-category means) is
    computed from ``train_df`` only and then applied to both frames.

    Note: the encodings are computed from all rows of ``train_df`` and then
    mapped back onto those same rows, so each training row's encoded value
    includes that row's own target.

    Args:
        train_df: DataFrame with columns 'magical', 'trickortreat',
            'kingofhalloween' and the target 'y'.
        test_df: DataFrame with columns 'magical', 'trickortreat' and
            'kingofhalloween'.
        smoothing: Weight given to the global target mean when shrinking a
            category mean; larger values pull encodings harder towards the
            global mean.
        min_count: Minimum number of training rows a category needs in order
            to get its own encoding. Categories below this threshold, and
            categories never seen in training, receive the global mean.

    Returns:
        Tuple ``(train_processed, test_processed)``. Both contain the columns
        'magical', 'trickortreat_encoded' and 'kingofhalloween_encoded';
        ``train_processed`` additionally carries 'y'.
    """
    # Base feature for values
    base_feature = 'magical'
    categorical_cols = ['trickortreat', 'kingofhalloween']

    # Calculate statistics from training data
    base_median = train_df[base_feature].median()
    Q1 = train_df[base_feature].quantile(0.25)
    Q3 = train_df[base_feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Calculate robust target encodings for high-cardinality categorical variables
    cat_encodings = {}
    global_mean = train_df['y'].mean()

    for col in categorical_cols:
        # Group by category and calculate stats
        cat_stats = (train_df.groupby(col)['y']
                    .agg(['mean', 'count'])
                    .reset_index())

        # Only keep categories seen often enough. The explicit copy makes the
        # filtered frame independent of cat_stats, so adding a column below is
        # not a write onto a slice (no SettingWithCopyWarning).
        frequent_cats = cat_stats[cat_stats['count'] >= min_count].copy()

        # Smoothed mean: weighted average of the category mean (weight = count)
        # and the global mean (weight = smoothing).
        frequent_cats['encoded'] = (
            (frequent_cats['count'] * frequent_cats['mean'] + smoothing * global_mean) /
            (frequent_cats['count'] + smoothing)
        )

        # Create dictionary only for frequent categories
        cat_encodings[col] = dict(zip(frequent_cats[col], frequent_cats['encoded']))

    def process_single_df(df, is_train=True):
        """Apply the training-set statistics to one dataframe."""
        # Start from an empty frame on the same index and add columns one by
        # one; this avoids pre-allocating object-dtype placeholder columns.
        df_processed = pd.DataFrame(index=df.index)

        # Process base feature: impute with the training median, then clip
        # to the training IQR fences.
        df_processed[base_feature] = (
            df[base_feature].fillna(base_median).clip(lower_bound, upper_bound)
        )

        # Process categorical features
        for col in categorical_cols:
            # Map categories to encodings; anything without an entry
            # (rare or unseen) falls back to the global mean.
            df_processed[f'{col}_encoded'] = (
                df[col].map(cat_encodings[col])
                .fillna(global_mean)
            )

        # Add target if available
        if 'y' in df.columns and is_train:
            df_processed['y'] = df['y']

        return df_processed

    print("\nProcessing training data...")
    train_processed = process_single_df(train_df, is_train=True)

    print("\nProcessing test data...")
    test_processed = process_single_df(test_df, is_train=False)

    return train_processed, test_processed

def create_improved_cat_model(train_df, test_df):
    """Fit a LightGBM regressor on the encoded features and predict the test set.

    Steps: encode both frames with ``process_data_improved``, standardize the
    three feature columns with a scaler fitted on the full processed training
    frame, hold out 20% of the rows for early stopping, fit the model, and
    predict on the processed test frame.

    Note: the encoding and the scaler are both fitted before the
    train/validation split, so the validation rows contribute to them.

    Args:
        train_df: Raw training DataFrame (must contain the target 'y').
        test_df: Raw test DataFrame.

    Returns:
        dict with keys 'model' (fitted LGBMRegressor), 'predictions'
        (test predictions), 'processed_train' (scaled training features),
        'processed_test' (scaled test features) and 'scaler'
        (fitted StandardScaler).
    """
    print("\nProcessing data...")
    train_processed, test_processed = process_data_improved(train_df, test_df)

    # Pull the target out, leaving a features-only frame.
    target = train_processed['y'].values
    features = train_processed.drop(columns='y')

    print("\nStandardizing numeric features...")
    numeric_features = ['magical', 'trickortreat_encoded', 'kingofhalloween_encoded']
    scaler = StandardScaler()
    # Fit on the training features, then reuse the same scaling for test.
    features[numeric_features] = scaler.fit_transform(features[numeric_features])
    test_processed[numeric_features] = scaler.transform(test_processed[numeric_features])

    print(f"\nFeatures being used: {features.columns.tolist()}")
    print(f"Number of training samples: {len(features)}")

    # 80/20 hold-out; the validation part drives early stopping below.
    fit_X, val_X, fit_y, val_y = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Hyper-parameters, tuned conservatively for high-cardinality inputs.
    lgbm_params = dict(
        objective='rmse',
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=15,
        random_state=42,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=5,
        min_child_samples=150,
        reg_alpha=0.2,
        reg_lambda=0.2,
        early_stopping_rounds=100,
    )
    model = lgb.LGBMRegressor(**lgbm_params)

    print("\nTraining model...")

    model.fit(fit_X, fit_y, eval_set=[(val_X, val_y)], eval_metric='rmse')

    print("\nFinished Training model")
    print("\nCreating Test Predictions")

    test_pred = model.predict(test_processed)

    print("\nFinished Creating Test Predictions")

    return {
        'model': model,
        'predictions': test_pred,
        'processed_train': features,
        'processed_test': test_processed,
        'scaler': scaler,
    }

# Load the raw training and test tables from the working directory.
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Encode, scale, train and predict in one call.
results = create_improved_cat_model(df, test_df)

# Write the submission: the test ids alongside their predicted 'y'.
(
    test_df[['id']]
    .assign(y=results['predictions'])
    .to_csv('submission.csv', index=False)
)



Processing data...

Processing training data...

Processing test data...

Standardizing numeric features...

Features being used: ['magical', 'trickortreat_encoded', 'kingofhalloween_encoded']
Number of training samples: 11000000

Training model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 8800000, number of used features: 3
[LightGBM] [Info] Start training from score 42348.078711
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[512]	valid_0's rmse: 620.809

Finished Training model

Creating Test Predictions

Finished Creating Test Predictions
CPU times: user 7min 57s, sys: 6.83 s, total: 8min 4s
Wall time: 1min 47s
