In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings

warnings.filterwarnings('ignore')

print("Loading datasets...")
# --- 1. Load Datasets (Using Dask for Large Files) ---
# Load smaller files with pandas
stratified_train_df = pd.read_parquet('/content/drive/MyDrive/stratified_train_data.parquet')
test_df = pd.read_parquet('/content/drive/MyDrive/test_data.parquet')
offer_df = pd.read_parquet('/content/drive/MyDrive/offer_metadata.parquet')

# Load large event and transaction logs with Dask to prevent memory crashes
try:
    event_df_dask = dd.read_parquet('/content/drive/MyDrive/add_event.parquet')
    trans_df_dask = dd.read_parquet('/content/drive/MyDrive/add_trans.parquet')
    print("All datasets loaded successfully (using Dask for large files).")
except Exception as e:
    print(f"Error loading supplementary data: {e}")
    # Terminate if supplementary data isn't available
    exit()

Loading datasets...
All datasets loaded successfully (using Dask for large files).


In [5]:
# --- 2. Feature Engineering from add_event_df (Offer-Level Only) ---
print("\nStarting feature engineering on event data for OFFERS only...")

# Create a 'clicked' column based on whether id7 is null
event_df_dask['clicked'] = (~event_df_dask['id7'].isnull()).astype(int)

# --- Offer-level event features ---
# Group by offer (id3) and aggregate its historical performance
offer_event_features_dask = event_df_dask.groupby('id3').agg(
    offer_total_impressions=('id4', 'count'),
    offer_total_clicks=('clicked', 'sum')
)
# Calculate offer's historical CTR
offer_event_features_dask['offer_historical_ctr'] = (
    offer_event_features_dask['offer_total_clicks'] / offer_event_features_dask['offer_total_impressions']
).fillna(0)

# --- Execute Dask computation ---
print("Computing aggregated offer features...")
offer_event_features = offer_event_features_dask.compute().reset_index()
print("Offer event features created.")


Starting feature engineering on event data for OFFERS only...
Computing aggregated offer features...
Offer event features created.


In [7]:
# --- 3. NEW: Feature Engineering from add_trans_df (Industry-Level) ---
print("\nStarting feature engineering on transaction data for INDUSTRIES...")
trans_df_dask['f367'] = dd.to_numeric(trans_df_dask['f367'], errors='coerce')

# Group by industry (id8) and aggregate transaction behavior
industry_trans_features_dask = trans_df_dask.groupby('id8').agg(
    industry_avg_spend=('f367', 'mean'),
    industry_total_transactions=('f367', 'count'),
    industry_unique_products=('f368', dd.Aggregation('nunique', chunk=lambda s: s.nunique(), agg=lambda s: s.nunique()))
)
print("Computing aggregated industry features...")
industry_trans_features = industry_trans_features_dask.compute().reset_index()
print("Industry transaction features created.")


Starting feature engineering on transaction data for INDUSTRIES...
Computing aggregated industry features...
Industry transaction features created.


In [8]:
# --- 4. Feature Engineering from offer_metadata_df (Pandas) ---
print("\nStarting feature engineering on offer metadata...")
offer_df['id12'] = pd.to_datetime(offer_df['id12'], errors='coerce')
offer_df['id13'] = pd.to_datetime(offer_df['id13'], errors='coerce')
offer_df['offer_duration_days'] = (offer_df['id13'] - offer_df['id12']).dt.days

offer_meta_features = offer_df[['id3', 'f375', 'f376', 'id10', 'id8', 'offer_duration_days']].rename(columns={
    'f375': 'offer_redemption_freq',
    'f376': 'offer_discount_rate',
    'id10': 'offer_type_code' # Renamed to avoid confusion with industry
})
# Ensure the merge keys are the correct type
offer_meta_features['id3'] = offer_meta_features['id3'].astype(str)
offer_meta_features['id8'] = offer_meta_features['id8'].astype(str)
industry_trans_features['id8'] = industry_trans_features['id8'].astype(str)

# --- NEW: Merge industry features into offer metadata ---
offer_meta_features = pd.merge(offer_meta_features, industry_trans_features, on='id8', how='left')
print("Offer metadata enriched with industry transaction data.")


Starting feature engineering on offer metadata...
Offer metadata enriched with industry transaction data.


In [9]:
# --- 5. Merge All New Features into Main DataFrames ---
print("\nMerging all new features into training and test sets...")
def enrich_dataframe(df):
    """Merges all engineered features into a given dataframe."""
    df['id3'] = df['id3'].astype(str)

    # Now merging small, pre-computed pandas DataFrames
    df = pd.merge(df, offer_event_features, on='id3', how='left')
    df = pd.merge(df, offer_meta_features, on='id3', how='left')

    return df

train_enriched = enrich_dataframe(stratified_train_df)
test_enriched = enrich_dataframe(test_df)

print(f"Enriched training data shape: {train_enriched.shape}")
print(f"Enriched test data shape: {test_enriched.shape}")


# --- 6. Save the Enriched Datasets ---
print("\nSaving enriched datasets to Parquet files...")
try:
    train_enriched.to_parquet('/content/drive/MyDrive/train_enriched.parquet')
    test_enriched.to_parquet('/content/drive/MyDrive/test_enriched.parquet')
    print("Successfully saved enriched data.")
except Exception as e:
    print(f"Error saving enriched data: {e}")


Merging all new features into training and test sets...
Enriched training data shape: (110362, 383)
Enriched test data shape: (369301, 382)

Saving enriched datasets to Parquet files...
Successfully saved enriched data.


In [10]:
import pandas as pd
import numpy as np
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load the Enriched Training Data ---
try:
    df_train = pd.read_parquet('/content/drive/MyDrive/train_enriched.parquet')
    print("Successfully loaded train_enriched.parquet.")
    print(f"Original training data shape: {df_train.shape}")
except FileNotFoundError:
    print("Error: train_enriched.parquet not found.")
    print("Please ensure you have run the feature engineering script successfully.")
    exit()

# --- 2. Define Initial Feature Set ---
# Identify all potential feature columns
feature_cols = [col for col in df_train.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]

# Convert all feature columns to numeric to allow for calculations
for col in feature_cols:
    if col in df_train.columns:
        df_train[col] = pd.to_numeric(df_train[col], errors='coerce')

# --- 3. Preprocessing on Training Data ---

# --- Step 3a: Drop Columns with High Missing Values ---
print("\n--- Preprocessing Step 1: Dropping Sparse Columns from Training Data ---")
missing_threshold = 0.95 # Drop columns with > 95% missing values
missing_fractions = df_train[feature_cols].isnull().mean()
cols_to_drop_missing = missing_fractions[missing_fractions > missing_threshold].index.tolist()
df_train.drop(columns=cols_to_drop_missing, inplace=True)
# Update our list of feature columns
feature_cols = [col for col in feature_cols if col not in cols_to_drop_missing]
print(f"Dropped {len(cols_to_drop_missing)} columns with more than {missing_threshold*100}% missing values.")

# --- Step 3b: Drop Highly Correlated Features ---
print("\n--- Preprocessing Step 2: Dropping Correlated Features from Training Data ---")
# First, fill NaNs to calculate correlation. We'll use the median.
df_for_corr = df_train[feature_cols].fillna(df_train[feature_cols].median())
corr_matrix = df_for_corr.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
corr_threshold = 0.95 # Drop one feature from any pair with correlation > 0.95
cols_to_drop_corr = [column for column in upper.columns if any(upper[column] > corr_threshold)]
df_train.drop(columns=cols_to_drop_corr, inplace=True)
# Update our list of feature columns one last time
feature_cols = [col for col in feature_cols if col not in cols_to_drop_corr]
print(f"Dropped {len(cols_to_drop_corr)} highly correlated features.")
del df_for_corr, corr_matrix, upper # Clean up memory
gc.collect()

# --- 4. Save the Refined Training Data ---
# The list of columns to keep is implicitly defined by the columns remaining in df_train
final_columns_to_keep = df_train.columns.tolist()
print(f"\nTotal features remaining after preprocessing: {len(feature_cols)}")

try:
    df_train.to_parquet('/content/drive/MyDrive/train_enriched_refined.parquet')
    print("Successfully saved train_enriched_refined.parquet.")
except Exception as e:
    print(f"Error saving refined training data: {e}")


# --- 5. Apply the SAME Preprocessing to the Test Data ---
print("\n--- Applying Preprocessing to Test Data ---")
try:
    df_test = pd.read_parquet('/content/drive/MyDrive/test_enriched.parquet')
    print("Successfully loaded test_enriched.parquet.")
    print(f"Original test data shape: {df_test.shape}")
except FileNotFoundError:
    print("Error: test_enriched.parquet not found.")
    exit()

# The list of columns to keep is defined by the refined training set.
# This ensures perfect consistency between train and test.
# We need to handle the case where a dummy column might not appear in the test set.
final_columns_in_test = [col for col in final_columns_to_keep if col in df_test.columns]
df_test_refined = df_test[final_columns_in_test]

# Add any columns that were in the refined training set but not the test set
# (e.g., a rare category that only appeared in training) and fill with 0.
for col in final_columns_to_keep:
    if col not in df_test_refined.columns:
        df_test_refined[col] = 0

# Ensure the column order is identical
df_test_refined = df_test_refined[final_columns_to_keep]

print(f"Refined test data shape: {df_test_refined.shape}")


# --- 6. Save the Refined Test Data ---
try:
    df_test_refined.to_parquet('/content/drive/MyDrive/test_enriched_refined.parquet')
    print("Successfully saved test_enriched_refined.parquet.")
except Exception as e:
    print(f"Error saving refined test data: {e}")

Successfully loaded train_enriched.parquet.
Original training data shape: (110362, 383)

--- Preprocessing Step 1: Dropping Sparse Columns from Training Data ---
Dropped 41 columns with more than 95.0% missing values.

--- Preprocessing Step 2: Dropping Correlated Features from Training Data ---
Dropped 23 highly correlated features.

Total features remaining after preprocessing: 312
Successfully saved train_enriched_refined.parquet.

--- Applying Preprocessing to Test Data ---
Successfully loaded test_enriched.parquet.
Original test data shape: (369301, 382)
Refined test data shape: (369301, 319)
Successfully saved test_enriched_refined.parquet.


In [4]:
# prompt: import this and list the columns present in this /content/drive/MyDrive/test_enriched_refined.parquet

test_refined_df = pd.read_parquet('/content/drive/MyDrive/test_enriched_refined.parquet')
print("Columns in /content/drive/MyDrive/test_enriched_refined.parquet:")
print(test_refined_df.columns.tolist())

Columns in /content/drive/MyDrive/test_enriched_refined.parquet:
['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f35', 'f38', 'f39', 'f40', 'f41', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f65', 'f68', 'f69', 'f71', 'f72', 'f73', 'f74', 'f75', 'f77', 'f78', 'f79', 'f81', 'f82', 'f83', 'f85', 'f86', 'f87', 'f89', 'f90', 'f91', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f121', 'f123', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f150', 'f151', 'f152', 'f153', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f16

In [5]:
# prompt: print nunique values in id8 column in test_enriched_refined

print(f"Number of unique values in 'id8' in test_enriched_refined: {test_refined_df['id8'].nunique()}")

Number of unique values in 'id8' in test_enriched_refined: 183


In [3]:
# prompt: print all the columns inside train_enriched_refined

print(df_test_refined.columns.tolist())


NameError: name 'df_test_refined' is not defined

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load the Refined Training Data ---
try:
    df = pd.read_parquet('/content/drive/MyDrive/train_enriched_refined.parquet')
    print("Successfully loaded train_enriched_refined.parquet.")
    print(f"Refined training data shape: {df.shape}")
except FileNotFoundError:
    print("Error: train_enriched_refined.parquet not found.")
    print("Please ensure you have run the preprocessing script successfully.")
    exit()


# --- 2. Final Data Preparation ---

# Identify the target column
target_col = 'y'

# Identify all feature columns (all columns except identifiers and the target)
feature_cols = [col for col in df.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]

# Handle the 'offer_type_code' categorical feature using one-hot encoding
# This is a safe way to handle it, even though LightGBM can use categoricals directly.
if 'offer_type_code' in df.columns:
    df = pd.get_dummies(df, columns=['offer_type_code'], dummy_na=True, prefix='offer_type')
    # Update feature_cols to include the new dummy columns
    if 'offer_type_code' in feature_cols:
        feature_cols.remove('offer_type_code')
    dummy_cols = [col for col in df.columns if col.startswith('offer_type_')]
    feature_cols.extend(dummy_cols)

# Final check and imputation for any remaining missing values
# Filling with a value like -999 allows the model to treat "missing" as a special category.
df[feature_cols] = df[feature_cols].fillna(-999)

# Define features (X), target (y), and groups for cross-validation
X = df[feature_cols]
y = df[target_col].astype(int)
groups = df['id2']

print(f"\nPrepared data for training with {len(feature_cols)} features.")


# --- 3. Training with GroupKFold Cross-Validation ---

# Define LightGBM parameters
# These are robust starting parameters that balance speed and accuracy.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'num_leaves': 40,
    'max_depth': -1,
    'seed': 42,
    'n_jobs': -1,
    'verbose': -1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
}

# Set up GroupKFold
N_SPLITS = 5
gkf = GroupKFold(n_splits=N_SPLITS)

# Initialize arrays to store predictions and feature importances
oof_predictions = np.zeros(len(df))
feature_importances = pd.DataFrame(index=feature_cols)

print(f"\nStarting training with {N_SPLITS}-Fold GroupKFold Cross-Validation...")

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Split data into training and validation sets for this fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Define and train the model
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    # Make predictions on the validation set
    val_preds = model.predict_proba(X_val)[:, 1]

    # Store the out-of-fold (OOF) predictions
    oof_predictions[val_idx] = val_preds

    # Store feature importances and save the model for this fold
    feature_importances[f'fold_{fold+1}'] = model.feature_importances_
    model_path = f'/content/drive/MyDrive/lgbm_model_fold_{fold+1}.txt'
    model.booster_.save_model(model_path)
    print(f"Model for fold {fold+1} saved to {model_path}")

    # Clean up memory
    del X_train, y_train, X_val, y_val, model
    gc.collect()

# --- 4. Evaluate Overall Performance ---
overall_auc = roc_auc_score(y, oof_predictions)

# --- NEW: Generate Classification Metrics ---
# Convert OOF probabilities to binary predictions using a 0.5 threshold
oof_binary_preds = (oof_predictions > 0.5).astype(int)
overall_accuracy = accuracy_score(y, oof_binary_preds)

print(f"\n--- Overall Cross-Validation Results ---")
print(f"Overall Out-of-Fold (OOF) AUC Score: {overall_auc:.5f}")
print(f"Overall Out-of-Fold (OOF) Accuracy: {overall_accuracy:.5f}")
print("\nOverall Out-of-Fold (OOF) Classification Report:")
print(classification_report(y, oof_binary_preds))


# Display top 20 most important features
feature_importances['mean'] = feature_importances.mean(axis=1)
print("\nTop 20 Most Important Features (averaged across folds):")
print(feature_importances.sort_values('mean', ascending=False).head(20))

Successfully loaded train_enriched_refined.parquet.
Refined training data shape: (110362, 319)

Prepared data for training with 314 features.

Starting training with 5-Fold GroupKFold Cross-Validation...
--- Fold 1/5 ---
Model for fold 1 saved to /content/drive/MyDrive/lgbm_model_fold_1.txt
--- Fold 2/5 ---
Model for fold 2 saved to /content/drive/MyDrive/lgbm_model_fold_2.txt
--- Fold 3/5 ---
Model for fold 3 saved to /content/drive/MyDrive/lgbm_model_fold_3.txt
--- Fold 4/5 ---
Model for fold 4 saved to /content/drive/MyDrive/lgbm_model_fold_4.txt
--- Fold 5/5 ---
Model for fold 5 saved to /content/drive/MyDrive/lgbm_model_fold_5.txt

--- Overall Cross-Validation Results ---
Overall Out-of-Fold (OOF) AUC Score: 0.91820
Overall Out-of-Fold (OOF) Accuracy: 0.86947

Overall Out-of-Fold (OOF) Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     73311
           1       0.87      0.71      0.79     37051

    accurac

In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load the Refined Test Data ---
try:
    df_test = pd.read_parquet('/content/drive/MyDrive/test_enriched_refined.parquet')
    print("Successfully loaded test_enriched_refined.parquet.")
    print(f"Refined test data shape: {df_test.shape}")
except FileNotFoundError:
    print("Error: test_enriched_refined.parquet not found.")
    print("Please ensure you have run the preprocessing script successfully.")
    exit()

# --- 2. Final Data Preparation for Test Set ---

# Keep a copy of the identifier columns for the final submission file
submission_ids = df_test[['id1', 'id2', 'id3', 'id5']].copy()

# Identify all feature columns (all columns except identifiers and the target)
# Note: 'y' is the target, and 'id8' is an identifier we've already used for feature engineering.
feature_cols = [col for col in df_test.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]

# Handle the 'offer_type_code' categorical feature using one-hot encoding
if 'offer_type_code' in df_test.columns:
    df_test = pd.get_dummies(df_test, columns=['offer_type_code'], dummy_na=True, prefix='offer_type')
    # Update feature_cols to include the new dummy columns
    if 'offer_type_code' in feature_cols:
        feature_cols.remove('offer_type_code')
    dummy_cols = [col for col in df_test.columns if col.startswith('offer_type_')]
    feature_cols.extend(dummy_cols)

# MODIFIED: Explicitly convert all feature columns to numeric types.
# This is the key fix to prevent the ValueError.
print("\nConverting all feature columns to numeric types...")
for col in feature_cols:
    if col in df_test.columns:
        df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# Final check and imputation for any remaining missing values
df_test[feature_cols] = df_test[feature_cols].fillna(-999)

# Define the final feature set for prediction
X_test = df_test[feature_cols]

print(f"\nPrepared test data for prediction with {len(feature_cols)} features.")


# --- 3. Load Models and Generate Predictions ---

N_SPLITS = 5
test_predictions = np.zeros(len(df_test))

print(f"\nLoading {N_SPLITS} models and generating predictions...")

for fold in range(1, N_SPLITS + 1):
    print(f"--- Predicting with Fold {fold}/{N_SPLITS} ---")
    try:
        # Load the model saved from the training script
        model_path = f'/content/drive/MyDrive/lgbm_model_fold_{fold}.txt'
        model = lgb.Booster(model_file=model_path)

        # Predict probabilities on the test set
        fold_preds = model.predict(X_test)

        # Add the predictions for this fold to our total
        # We divide by N_SPLITS here to average them as we go
        test_predictions += fold_preds / N_SPLITS

    except lgb.basic.LightGBMError as e:
        print(f"Error loading model for fold {fold}: {e}")
        print("Please ensure all model files were saved correctly during training.")
        continue

print("All predictions generated and averaged.")


# --- 4. Create and Save the Submission File ---
print("\nCreating the final submission file...")

# Create the submission DataFrame
submission_df = submission_ids.copy()

# Format the 'id5' date column to mm-dd-yyyy as requested
submission_df['id5'] = pd.to_datetime(submission_df['id5'], errors='coerce').dt.strftime('%m-%d-%Y')

# Add the final averaged predictions
# Clean predictions to ensure no NaNs and format to prevent scientific notation
cleaned_predictions = np.nan_to_num(test_predictions, nan=0.0)
submission_df['pred'] = [f"{p:.10f}" for p in cleaned_predictions]


print("\nFirst 5 rows of the final submission file:")
print(submission_df.head())

# Define the output path
submission_path = '/content/drive/MyDrive/r2_submission_best_lightgbm.csv'

try:
    submission_df.to_csv(submission_path, index=False)
    print(f"\nSubmission file successfully saved to: {submission_path}")
except Exception as e:
    print(f"\nError saving submission file: {e}")

Successfully loaded test_enriched_refined.parquet.
Refined test data shape: (369301, 319)

Converting all feature columns to numeric types...

Prepared test data for prediction with 314 features.

Loading 5 models and generating predictions...
--- Predicting with Fold 1/5 ---
--- Predicting with Fold 2/5 ---
--- Predicting with Fold 3/5 ---
--- Predicting with Fold 4/5 ---
--- Predicting with Fold 5/5 ---
All predictions generated and averaged.

Creating the final submission file...

First 5 rows of the final submission file:
                                               id1      id2     id3  \
0   1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907   91950   
1      1082599_88356_16-23_2023-11-04 06:08:53.373  1082599   88356   
2  1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466  958700   
3     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971  795739   
4      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369   82296   

          id5          pred  
0  11-04-20

In [None]:
# prompt: now i want to remove all the features/columns from train_enriched and test_enriched whose missing value is greater than 80% in the training dataset. also print their names. don't remove these columns
# - customer_total_impressions
# - customer_total_clicks
# - customer_unique_offers_seen
# - customer_historical_ctr
# - customer_total_spend
# - customer_avg_spend
# - customer_transaction_count
# - customer_unique_products_purchased
# again reload the dataset from drive and do this operation

# Reload enriched data
train_enriched = pd.read_parquet('/content/drive/MyDrive/train_enriched.parquet')
test_enriched = pd.read_parquet('/content/drive/MyDrive/test_enriched.parquet')

# Define columns to keep
columns_to_keep = [
    'customer_total_impressions',
    'customer_total_clicks',
    'customer_unique_offers_seen',
    'customer_historical_ctr',
    'customer_total_spend',
    'customer_avg_spend',
    'customer_transaction_count',
    'customer_unique_products_purchased',
    'id2', # Assuming 'id2' is the customer ID and should be kept
    'id3', # Assuming 'id3' is the offer ID and should be kept
    'f377', # Assuming this is the target variable based on previous context
]

# Calculate missing value percentage for each column in the training data
missing_percentage = train_enriched.isnull().sum() / len(train_enriched) * 100

# Identify columns with missing percentage > 80%
cols_to_drop = missing_percentage[missing_percentage > 85].index.tolist()

# Filter out columns from the drop list that are in columns_to_keep
cols_to_drop = [col for col in cols_to_drop if col not in columns_to_keep]

# Print the names of columns to be removed
print("Columns to be removed due to >80% missing values in training data:")
for col in cols_to_drop:
    print(col)

# Remove the identified columns from both training and test datasets
train_enriched = train_enriched.drop(columns=cols_to_drop)
test_enriched = test_enriched.drop(columns=cols_to_drop)

print(f"\nTrain enriched shape after removing columns: {train_enriched.shape}")
print(f"Test enriched shape after removing columns: {test_enriched.shape}")

Columns to be removed due to >80% missing values in training data:
f3
f4
f13
f14
f15
f16
f17
f18
f19
f20
f21
f33
f34
f36
f37
f64
f66
f70
f79
f80
f81
f84
f88
f92
f112
f114
f117
f118
f120
f121
f122
f135
f136
f154
f176
f189
f205
f220
f221
f360

Train enriched shape after removing columns: (110362, 347)
Test enriched shape after removing columns: (369301, 346)


In [None]:
train_enriched.shape

(110362, 347)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import warnings

warnings.filterwarnings('ignore')

df = train_enriched

# --- 2. Data Preparation and Memory Reduction ---

# Identify the target column
target_col = 'y'

# Identify all feature columns: original 'f' columns + new engineered columns
# Exclude identifier columns and the target variable
original_f_cols = [col for col in df.columns if col.startswith('f')]
new_feature_cols = [
    'customer_total_impressions', 'customer_total_clicks', 'customer_unique_offers_seen',
    'customer_historical_ctr', 'offer_total_impressions', 'offer_total_clicks',
    'offer_historical_ctr', 'customer_total_spend', 'customer_avg_spend',
    'customer_transaction_count', 'customer_unique_products_purchased',
    'offer_redemption_freq', 'offer_discount_rate', 'offer_industry_code',
    'offer_duration_days'
]
feature_cols = original_f_cols + new_feature_cols

# Handle categorical features (e.g., offer_industry_code) using one-hot encoding
# This converts categorical columns into separate binary (0/1) columns
df = pd.get_dummies(df, columns=['offer_industry_code'], dummy_na=True)

# Update feature_cols to include the new dummy columns and remove the original
feature_cols.remove('offer_industry_code')
dummy_cols = [col for col in df.columns if col.startswith('offer_industry_code_')]
feature_cols.extend(dummy_cols)

# Convert all feature columns to numeric, coercing errors
for col in feature_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Simple imputation: fill any remaining missing values with 0
# This is a safe baseline. More advanced imputation could be explored later.
df[feature_cols] = df[feature_cols].fillna(0)

# Memory Reduction Step
print(f"\nOriginal memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype(np.float32)
for col in df.select_dtypes(include=['int64']).columns:
    df[col] = df[col].astype(np.int32)
print(f"Reduced memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Define features (X) and target (y)
X = df[feature_cols]
y = df[target_col].astype(int)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")


# --- 3. Hyperparameter Tuning with RandomizedSearchCV ---

# Calculate scale_pos_weight for XGBoost
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [200, 300, 500],
    'max_depth': [5, 7, 9, 11],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

# Instantiate the XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    random_state=42
)

# Set up RandomizedSearchCV with memory-safe parameters
# MODIFIED: cv is now 3 as requested.
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=15,
    scoring='roc_auc',
    n_jobs=1,  # Use 1 core to be memory-safe
    cv=3,      # 3-fold cross-validation
    verbose=2,
    random_state=42
)

print("\nStarting RandomizedSearch on enriched data...")
random_search.fit(X_train, y_train)

print("\nSearch complete.")
print(f"Best ROC AUC score from search: {random_search.best_score_:.4f}")
print("Best parameters found:")
print(random_search.best_params_)


# --- 4. Evaluate the Best Model on the Validation Set ---

best_clf = random_search.best_estimator_
y_pred_proba = best_clf.predict_proba(X_val)[:, 1]
y_pred = best_clf.predict(X_val)

print("\n--- Validation Set Performance on Enriched Data ---")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))


Original memory usage: 360.75 MB
Reduced memory usage: 182.88 MB

Training set size: (88289, 343)
Validation set size: (22073, 343)

Calculated scale_pos_weight: 1.98

Starting RandomizedSearch on enriched data...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 2.0min
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 2.0min
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 2.0min
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 2.2min
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.7; total ti

In [None]:
model_path = '/content/drive/MyDrive/xgb_best_model.json'
print(f"\nSaving the best model to: {model_path}")
try:
    best_clf.save_model(model_path)
    print("Model saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")


Saving the best model to: /content/drive/MyDrive/xgb_best_model.json
Model saved successfully.


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask.dataframe as dd
import warnings

warnings.filterwarnings('ignore')

# --- 1. Load the Saved Model ---
print("Loading the saved XGBoost model...")
model_path = '/content/drive/MyDrive/xgb_best_model.json'
try:
    # Instantiate a new classifier and load the saved model into it
    best_clf = xgb.XGBClassifier()
    best_clf.load_model(model_path)
    print("Model loaded successfully.")
except Exception as e:
    print(f"\nError loading model: {e}")
    print("Please ensure the model was saved correctly from the training script.")
    exit()


# --- 2. Load and Prepare Test Data with Dask ---
print("\nLoading and preparing test data using Dask...")
try:
    # Use Dask to read the large enriched test file lazily
    test_dd = dd.read_parquet('/content/drive/MyDrive/test_enriched.parquet')
    print("Successfully loaded test_enriched.parquet as a Dask DataFrame.")
except FileNotFoundError:
    print("Error: test_enriched.parquet not found.")
    exit()

# Keep a copy of the identifier columns for the final submission file
# Reading just a few columns with pandas is memory-safe
submission_ids = pd.read_parquet('/content/drive/MyDrive/test_enriched.parquet', columns=['id1', 'id2', 'id3', 'id5'])

# --- Apply the same column dropping logic as training data ---
cols_to_drop = [
    'f3', 'f4', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21',
    'f33', 'f34', 'f36', 'f37', 'f64', 'f66', 'f70', 'f79', 'f80', 'f81', 'f84',
    'f88', 'f92', 'f112', 'f114', 'f117', 'f118', 'f120', 'f121', 'f122',
    'f135', 'f136', 'f154', 'f176', 'f189', 'f205', 'f220', 'f221', 'f360'
]

# Drop the identified columns from the test Dask DataFrame
print(f"Dropping {len(cols_to_drop)} columns from test data...")
# Ensure columns exist before dropping
cols_to_drop_exist = [col for col in cols_to_drop if col in test_dd.columns]
test_dd = test_dd.drop(columns=cols_to_drop_exist)
print(f"Test data shape after dropping columns: ({test_dd.shape[0].compute()}, {test_dd.shape[1]})")


# --- Replicate Preprocessing Steps using Dask ---
# MODIFIED: Explicitly convert the column to 'category' dtype before one-hot encoding.
test_dd['offer_industry_code'] = test_dd['offer_industry_code'].astype('category')

# Add this step to make categories known to Dask and align with training categories
# We need the categories from the training data to ensure consistency
try:
    train_enriched_for_col_check = pd.read_parquet('/content/drive/MyDrive/train_enriched.parquet', columns=['offer_industry_code'])
    train_categories = train_enriched_for_col_check['offer_industry_code'].astype('category').cat.categories
    # Use set_categories with rename=False to ensure all training categories are present
    test_dd['offer_industry_code'] = test_dd['offer_industry_code'].cat.set_categories(train_categories, rename=False).cat.as_known()
    print("Offer industry code categories aligned.")
except FileNotFoundError:
     print("Error: train_enriched.parquet not found. Cannot align offer industry code categories.")
     # If training data is not found, proceed without aligning categories, but this might lead to errors if test data has unseen categories.
     test_dd['offer_industry_code'] = test_dd['offer_industry_code'].cat.as_known() # Still make known whatever categories are present in test data.


# Handle categorical features using Dask's get_dummies
test_dd = dd.get_dummies(test_dd, columns=['offer_industry_code'], dummy_na=True)


# Align columns with the training set
# Get training columns from the loaded model's feature names
try:
    training_columns = best_clf.get_booster().feature_names
    print(f"Expected training columns from model: {len(training_columns)}")
except Exception as e:
    print(f"Error getting feature names from model: {e}")
    # Fallback: try to load training data and process to get column names
    try:
        train_processed_for_cols = pd.read_parquet('/content/drive/MyDrive/train_enriched.parquet')
        train_processed_for_cols = train_processed_for_cols.drop(columns=cols_to_drop_exist)
        train_processed_for_cols['offer_industry_code'] = train_processed_for_cols['offer_industry_code'].astype('category').cat.as_known()
        train_processed_for_cols = pd.get_dummies(train_processed_for_cols, columns=['offer_industry_code'], dummy_na=True)
        training_columns = train_processed_for_cols.columns.tolist()
        if 'f377' in training_columns:
            training_columns.remove('f377')
        print(f"Expected training columns from processing training data: {len(training_columns)}")
    except FileNotFoundError:
        print("Error: train_enriched.parquet not found. Cannot determine training columns.")
        exit()


# Align columns: Add missing training columns to test_dd with fill_value=0
# and then select only the training columns in the correct order.
test_columns = test_dd.columns
missing_cols = set(training_columns) - set(test_columns)

# Create dummy columns for missing ones in test_dd
for col in missing_cols:
    test_dd[col] = 0

# Select columns in the training order
test_dd = test_dd[training_columns]

# Simple imputation: fill any remaining missing values with 0
test_dd = test_dd.fillna(0)

print("Test data preprocessing plan created.")


# --- 3. Generate Predictions in Chunks ---
print("\nGenerating predictions on the test data (in chunks)...")
# Dask will automatically handle predicting on the data in chunks
# to avoid loading it all into memory.
test_predictions_proba_dask = test_dd.map_partitions(
    best_clf.predict_proba,
    meta=pd.DataFrame(columns=[0, 1], dtype=np.float32)
)

# We only need the probability of the positive class (1)
positive_class_proba_dask = test_predictions_proba_dask[1]

# Execute the computation to get the predictions as a pandas Series
print("Computing predictions...")
test_predictions_proba = positive_class_proba_dask.compute()
print("Predictions generated successfully.")


# --- 4. Create and Save the Submission File ---
print("\nCreating the final submission file...")

# Create the submission DataFrame
submission_df = submission_ids.copy()

# Format the 'id5' date column to mmddyy
submission_df['id5'] = pd.to_datetime(submission_df['id5'], errors='coerce').dt.strftime('%m%d%y')

# Add the predictions
submission_df['pred'] = test_predictions_proba

print("\nFirst 5 rows of the final submission file:")
print(submission_df.head())

# Define the output path
submission_path = '/content/drive/MyDrive/r2_submission_file_myteam111.csv'

try:
    submission_df.to_csv(submission_path, index=False)
    print(f"\nSubmission file successfully saved to: {submission_path}")
except Exception as e:
    print(f"\nError saving submission file: {e}")

Loading the saved XGBoost model...
Model loaded successfully.

Loading and preparing test data using Dask...
Successfully loaded test_enriched.parquet as a Dask DataFrame.
Dropping 40 columns from test data...
Test data shape after dropping columns: (369301, 346)
Offer industry code categories aligned.
Expected training columns from model: 343
Test data preprocessing plan created.

Generating predictions on the test data (in chunks)...
Computing predictions...


TypeError: Invalid value '0' for dtype string

In [None]:
# prompt: find number of missing values column wise in /content/drive/MyDrive/r2_submission_file_myteam11.csv after imprting it from drive
import pandas as pd
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/r2_submission_file Team Pioneers.csv')

# Calculate the number of missing values per column
# missing_values_count = df.isnull().sum()

# Print the column-wise missing value count
# print("Number of missing values per column:")
# missing_values_count

  df = pd.read_csv('/content/drive/MyDrive/r2_submission_file Team Pioneers.csv')


In [None]:
# prompt: print the data type of first 5 columns in df

print("Data types of the first 5 columns:")
print(df.iloc[:, :5].dtypes)

Data types of the first 5 columns:
id1     object
id2     object
id3      int64
id5     object
pred    object
dtype: object


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369301 entries, 0 to 369300
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id1     369301 non-null  object
 1   id2     369301 non-null  object
 2   id3     369301 non-null  int64 
 3   id5     369301 non-null  object
 4   pred    369301 non-null  object
dtypes: int64(1), object(4)
memory usage: 14.1+ MB


In [None]:
import pandas as pd

# Path to the file you want to check
submission_path = '/content/drive/MyDrive/r2_submission_file Team Pioneers.csv'

print(f"Checking file: {submission_path}")

try:
    # Load the submission file
    df = pd.read_csv(submission_path)

    # Check the data type of the 'pred' column as pandas reads it
    print(f"\nPandas inferred dtype for 'pred' column: {df['pred'].dtype}")

    problem_found = False
    # Iterate through each value and try to convert it to a float
    for index, value in df['pred'].items():
        try:
            # This is the core check
            float(value)
        except (ValueError, TypeError):
            print(f"--> Problem found at row index {index}: value='{value}'")
            problem_found = True

    if not problem_found:
        print("\nSuccess! All values in the 'pred' column can be read as numbers.")
        print("The issue is likely with the submission platform's parser.")

except FileNotFoundError:
    print(f"\nError: Could not find the file at {submission_path}")

Checking file: /content/drive/MyDrive/r2_submission_file Team Pioneers.csv


  df = pd.read_csv(submission_path)



Pandas inferred dtype for 'pred' column: object
--> Problem found at row index 41806: value='0.011-04-202335'
--> Problem found at row index 70886: value='0.11-04-202304'
--> Problem found at row index 90939: value='0.011-04-202371'
--> Problem found at row index 105756: value='0.00811-04-2023'
--> Problem found at row index 187554: value='0.11-05-202305'


In [None]:
print(df['pred'].min())

0.00022452489


In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load the Refined Training Data ---
try:
    df = pd.read_parquet('/content/drive/MyDrive/train_enriched_refined.parquet')
    print("Successfully loaded train_enriched_refined.parquet.")
    print(f"Refined training data shape: {df.shape}")
except FileNotFoundError:
    print("Error: train_enriched_refined.parquet not found.")
    print("Please ensure you have run the preprocessing script successfully.")
    exit()


# --- 2. Final Data Preparation ---

# Identify the target column
target_col = 'y'

# Identify all feature columns (all columns except identifiers and the target)
feature_cols = [col for col in df.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]

# Handle the 'offer_type_code' categorical feature using one-hot encoding
if 'offer_type_code' in df.columns:
    df = pd.get_dummies(df, columns=['offer_type_code'], dummy_na=True, prefix='offer_type')
    # Update feature_cols to include the new dummy columns
    if 'offer_type_code' in feature_cols:
        feature_cols.remove('offer_type_code')
    dummy_cols = [col for col in df.columns if col.startswith('offer_type_')]
    feature_cols.extend(dummy_cols)

# Final check and imputation for any remaining missing values
df[feature_cols] = df[feature_cols].fillna(-999)

# Define features (X), target (y), and groups
X = df[feature_cols]
y = df[target_col].astype(int)
groups = df['id2']

print(f"\nPrepared data for training with {len(feature_cols)} features.")


# --- 3. Hyperparameter Tuning with RandomizedSearchCV and GroupKFold ---

# Define XGBoost parameters
# We don't need n_estimators here as it will be part of the search grid.
# We also don't need scale_pos_weight yet, as it's better to calculate it on the specific training fold.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'seed': 42,
    'n_jobs': -1,
}

# Define the parameter grid for the search
param_grid = {
    'learning_rate': [0.02, 0.05, 0.1],
    'n_estimators': [500, 1000, 2000],
    'max_depth': [5, 7, 9],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'min_child_weight': [1, 3, 5]
}

# Instantiate the XGBoost classifier
xgb_clf = xgb.XGBClassifier(**params)

# Set up GroupKFold as the cross-validation strategy
N_SPLITS = 5
gkf = GroupKFold(n_splits=N_SPLITS)

# Set up RandomizedSearchCV
# We pass the GroupKFold object to the 'cv' parameter.
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=10, # Number of parameter settings that are sampled.
    scoring='roc_auc',
    n_jobs=1,  # Use 1 core to be memory-safe
    cv=gkf,    # CRITICAL: Use GroupKFold for cross-validation
    verbose=2,
    random_state=42
)

print(f"\nStarting RandomizedSearch with {N_SPLITS}-Fold GroupKFold...")
# The .fit() method now requires the 'groups' parameter to pass to GroupKFold
random_search.fit(X, y, groups=groups)

print("\nSearch complete.")
print(f"Best cross-validated ROC AUC score from search: {random_search.best_score_:.5f}")
print("Best parameters found:")
print(random_search.best_params_)

# --- 4. Save the Single Best Model ---
best_clf = random_search.best_estimator_
model_path = '/content/drive/MyDrive/xgb_best_model_from_search.json'
print(f"\nSaving the best model to: {model_path}")
best_clf.save_model(model_path)
print("Model saved successfully.")


# --- 5. Final Evaluation on a Hold-Out Set ---
# It's good practice to have a final, unseen validation set to confirm performance.
# We'll do a simple train/val split here for a final check.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
best_clf.fit(X_train, y_train) # Refit on this training portion
y_pred_proba = best_clf.predict_proba(X_val)[:, 1]
y_pred = best_clf.predict(X_val)

print("\n--- Final Performance on a Hold-Out Validation Set ---")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Successfully loaded train_enriched_refined.parquet.
Refined training data shape: (110362, 319)

Prepared data for training with 314 features.

Starting RandomizedSearch with 5-Fold GroupKFold...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 1.8min
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=500, subsample=0.7; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, min_ch

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask.dataframe as dd
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load the Saved Best Model ---
print("Loading the saved best XGBoost model...")
model_path = '/content/drive/MyDrive/xgb_best_model_from_search.json'
try:
    # Instantiate a new classifier and load the saved model into it
    best_clf = xgb.XGBClassifier()
    best_clf.load_model(model_path)
    print("Model loaded successfully.")
except Exception as e:
    print(f"\nError loading model: {e}")
    print("Please ensure the model was saved correctly from the tuning script.")
    exit()


# --- 2. Load Test Data Identifiers and Set Up for Prediction ---
print("\nLoading test data identifiers...")
try:
    # We only need the ID columns for the final submission file.
    # The features will be loaded partition by partition.
    submission_ids = pd.read_parquet(
        '/content/drive/MyDrive/test_enriched_refined.parquet',
        columns=['id1', 'id2', 'id3', 'id5']
    )
    # Use Dask to create a reference to the test data for iteration
    test_dd = dd.read_parquet('/content/drive/MyDrive/test_enriched_refined.parquet')
    print("Successfully loaded test data identifiers and created Dask DataFrame.")
except FileNotFoundError:
    print("Error: test_enriched_refined.parquet not found.")
    print("Please ensure you have run the preprocessing script successfully.")
    exit()

# Get the list of feature columns the model was trained on
training_columns = best_clf.get_booster().feature_names


# --- 3. Generate Predictions by Iterating Through Partitions (Memory-Safe) ---
print("\nGenerating predictions by processing data in chunks...")
all_predictions = []

# Iterate over each partition of the Dask DataFrame
for i, partition in enumerate(test_dd.partitions):
    print(f"Processing partition {i+1}/{test_dd.npartitions}...")
    # Compute the current partition to get a small pandas DataFrame
    partition_pd = partition.compute()

    # --- Apply the EXACT SAME preprocessing steps as the training script ---
    # a. Handle categorical features
    if 'offer_type_code' in partition_pd.columns:
        partition_pd = pd.get_dummies(partition_pd, columns=['offer_type_code'], dummy_na=True, prefix='offer_type')

    # b. Align columns with the training set
    partition_pd = partition_pd.reindex(columns=training_columns, fill_value=0)

    # c. Explicitly convert all feature columns to numeric types
    for col in partition_pd.columns:
        if col in training_columns:
            partition_pd[col] = pd.to_numeric(partition_pd[col], errors='coerce')

    # d. Simple imputation: fill any remaining missing values
    partition_pd = partition_pd.fillna(-999)

    # e. Generate predictions for this chunk
    chunk_predictions = best_clf.predict_proba(partition_pd)[:, 1]
    all_predictions.append(chunk_predictions)

# Concatenate the results from all chunks into a single numpy array
test_predictions_proba = np.concatenate(all_predictions)
print("Predictions generated successfully.")


# --- 4. Create and Save the Submission File ---
print("\nCreating the final submission file...")

# Create the submission DataFrame
submission_df = submission_ids.copy()

# Format the 'id5' date column to mm-dd-yyyy as requested
submission_df['id5'] = pd.to_datetime(submission_df['id5'], errors='coerce').dt.strftime('%m-%d-%Y')

# Add the final predictions
# Clean predictions to ensure no NaNs and format to prevent scientific notation
cleaned_predictions = np.nan_to_num(test_predictions_proba, nan=0.0)
submission_df['pred'] = [f"{p:.10f}" for p in cleaned_predictions]

print("\nFirst 5 rows of the final submission file:")
print(submission_df.head())

# Define the output path
submission_path = '/content/drive/MyDrive/r2_submission_file_xgb_gkf_ran.csv'

try:
    submission_df.to_csv(submission_path, index=False)
    print(f"\nSubmission file successfully saved to: {submission_path}")
except Exception as e:
    print(f"\nError saving submission file: {e}")

Loading the saved best XGBoost model...
Model loaded successfully.

Loading test data identifiers...
Successfully loaded test data identifiers and created Dask DataFrame.

Generating predictions by processing data in chunks...
Processing partition 1/1...
Predictions generated successfully.

Creating the final submission file...

First 5 rows of the final submission file:
                                               id1      id2     id3  \
0   1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907   91950   
1      1082599_88356_16-23_2023-11-04 06:08:53.373  1082599   88356   
2  1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466  958700   
3     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971  795739   
4      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369   82296   

          id5          pred  
0  11-04-2023  0.0553781241  
1  11-04-2023  0.1124634817  
2  11-05-2023  0.9906606078  
3  11-04-2023  0.0513244905  
4  11-05-2023  0.0308717992  

Submission file su