In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amexkkgp/test_data.parquet
/kaggle/input/amexkkgp/add_event.parquet
/kaggle/input/amexkkgp/data_dictionary.csv
/kaggle/input/amexkkgp/offer_metadata.parquet
/kaggle/input/amexkkgp/add_trans.parquet
/kaggle/input/amexkkgp/test_enriched.parquet
/kaggle/input/amexkkgp/train_enriched.parquet
/kaggle/input/amexkkgp/train_data.parquet


In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings

warnings.filterwarnings('ignore')

print("Loading datasets...")
# --- 1. Load Datasets (Using Dask for Large Files) ---
# Load smaller files with pandas
stratified_train_df = pd.read_parquet('/kaggle/input/amexkkgp/train_data.parquet')
test_df = pd.read_parquet('/kaggle/input/amexkkgp/test_data.parquet')
offer_df = pd.read_parquet('/kaggle/input/amexkkgp/offer_metadata.parquet')

# Load large event and transaction logs with Dask to prevent memory crashes
try:
    event_df_dask = dd.read_parquet('/kaggle/input/amexkkgp/add_event.parquet')
    trans_df_dask = dd.read_parquet('/kaggle/input/amexkkgp/add_trans.parquet')
    print("All datasets loaded successfully (using Dask for large files).")
except Exception as e:
    print(f"Error loading supplementary data: {e}")
    # Terminate if supplementary data isn't available
    exit()

Loading datasets...
All datasets loaded successfully (using Dask for large files).


In [3]:
# --- 2. Feature Engineering from add_event_df (Offer-Level Only) ---
print("\nStarting feature engineering on event data for OFFERS only...")

# Create a 'clicked' column based on whether id7 is null
event_df_dask['clicked'] = (~event_df_dask['id7'].isnull()).astype(int)

# --- Offer-level event features ---
# Group by offer (id3) and aggregate its historical performance
offer_event_features_dask = event_df_dask.groupby('id3').agg(
    offer_total_impressions=('id4', 'count'),
    offer_total_clicks=('clicked', 'sum')
)
# Calculate offer's historical CTR
offer_event_features_dask['offer_historical_ctr'] = (
    offer_event_features_dask['offer_total_clicks'] / offer_event_features_dask['offer_total_impressions']
).fillna(0)

# --- Execute Dask computation ---
print("Computing aggregated offer features...")
offer_event_features = offer_event_features_dask.compute().reset_index()
print("Offer event features created.")


Starting feature engineering on event data for OFFERS only...
Computing aggregated offer features...
Offer event features created.


In [4]:
# --- 3. NEW: Feature Engineering from add_trans_df (Industry-Level) ---
print("\nStarting feature engineering on transaction data for INDUSTRIES...")
trans_df_dask['f367'] = dd.to_numeric(trans_df_dask['f367'], errors='coerce')

# Group by industry (id8) and aggregate transaction behavior
industry_trans_features_dask = trans_df_dask.groupby('id8').agg(
    industry_avg_spend=('f367', 'mean'),
    industry_total_transactions=('f367', 'count'),
    industry_unique_products=('f368', dd.Aggregation('nunique', chunk=lambda s: s.nunique(), agg=lambda s: s.nunique()))
)
print("Computing aggregated industry features...")
industry_trans_features = industry_trans_features_dask.compute().reset_index()
print("Industry transaction features created.")


Starting feature engineering on transaction data for INDUSTRIES...
Computing aggregated industry features...
Industry transaction features created.


In [5]:
# --- 4. Feature Engineering from offer_metadata_df (Pandas) ---
print("\nStarting feature engineering on offer metadata...")
offer_df['id12'] = pd.to_datetime(offer_df['id12'], errors='coerce')
offer_df['id13'] = pd.to_datetime(offer_df['id13'], errors='coerce')
offer_df['offer_duration_days'] = (offer_df['id13'] - offer_df['id12']).dt.days

offer_meta_features = offer_df[['id3', 'f375', 'f376', 'id10', 'id8', 'offer_duration_days']].rename(columns={
    'f375': 'offer_redemption_freq',
    'f376': 'offer_discount_rate',
    'id10': 'offer_type_code' # Renamed to avoid confusion with industry
})
# Ensure the merge keys are the correct type
offer_meta_features['id3'] = offer_meta_features['id3'].astype(str)
offer_meta_features['id8'] = offer_meta_features['id8'].astype(str)
industry_trans_features['id8'] = industry_trans_features['id8'].astype(str)

# --- NEW: Merge industry features into offer metadata ---
offer_meta_features = pd.merge(offer_meta_features, industry_trans_features, on='id8', how='left')
print("Offer metadata enriched with industry transaction data.")


Starting feature engineering on offer metadata...
Offer metadata enriched with industry transaction data.


In [6]:
# --- 5. Merge All New Features into Main DataFrames ---
print("\nMerging all new features into training and test sets...")
def enrich_dataframe(df):
    """Merges all engineered features into a given dataframe."""
    df['id3'] = df['id3'].astype(str)

    # Now merging small, pre-computed pandas DataFrames
    df = pd.merge(df, offer_event_features, on='id3', how='left')
    df = pd.merge(df, offer_meta_features, on='id3', how='left')

    return df

train_enriched = enrich_dataframe(stratified_train_df)
test_enriched = enrich_dataframe(test_df)

print(f"Enriched training data shape: {train_enriched.shape}")
print(f"Enriched test data shape: {test_enriched.shape}")


# --- 6. Save the Enriched Datasets ---
print("\nSaving enriched datasets to Parquet files...")
try:
    train_enriched.to_parquet('/kaggle/working/train_enriched_v10.parquet')
    test_enriched.to_parquet('/kaggle/working/test_enriched_v10.parquet')
    print("Successfully saved enriched data.")
except Exception as e:
    print(f"Error saving enriched data: {e}")


Merging all new features into training and test sets...
Enriched training data shape: (770164, 383)
Enriched test data shape: (369301, 382)

Saving enriched datasets to Parquet files...
Successfully saved enriched data.


In [7]:
train_enriched.shape

(770164, 383)

In [8]:
train_enriched.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,offer_total_clicks,offer_historical_ctr,offer_redemption_freq,offer_discount_rate,offer_type_code,id8,offer_duration_days,industry_avg_spend,industry_total_transactions,industry_unique_products
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,1092,0.059875,2.0,2.0,1,57310000,29.0,433.638017,37472.0,1.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,966,0.046487,2.0,,1,59210000,181.0,142.77151,24597.0,1.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,759,0.041484,2.0,10.0,1,72310000,29.0,137.033818,19409.0,1.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,771,0.042805,2.0,10.0,1,56510500,29.0,249.131054,57064.0,1.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,784,0.042544,2.0,8.0,1,59991300,29.0,167.946222,14005.0,1.0


In [1]:
import pandas as pd
import numpy as np
import warnings
import gc
import os

# Modeling libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA AND DATA DICTIONARY
# =============================================================================
print("--- STAGE 1: Loading Data ---")
try:
    # Load the intelligent sample for fitting the cluster models
    df_train = pd.read_parquet(f'/kaggle/working/train_enriched_v10.parquet')
    print("Successfully loaded all required data.")
    print(f"Training sample shape: {df_train.shape}")
    
    # --- NEW: Print the last 15 column names ---
    print("\nLast 15 columns of the training data:")
    print(df_train.columns[-15:].tolist())
    
    # Load the full, enriched test set to apply the transformations to
    df_test = pd.read_parquet(f'/kaggle/working/test_enriched_v10.parquet')
    print(f"Test data shape: {df_test.shape}")
    # Load the data dictionary
    data_dict = pd.read_csv(f'{INPUT_DIR}data_dictionary.csv')
    
    print("Successfully loaded all required data.")
    print(f"Training sample shape: {df_train.shape}")
    print(f"Test data shape: {df_test.shape}")
except FileNotFoundError:
    print("Error: Required data not found. Please run the sampling and feature engineering scripts first.")
    exit()

# =============================================================================
# STAGE 2: DEFINE THE REUSABLE CLUSTERING FUNCTION
# =============================================================================
print("\n--- STAGE 2: Defining the Clustering Pipeline Function ---")

def create_cluster_features(df_train, df_test, data_dict, keywords, cluster_col_name, n_clusters=5):
    """
    Performs a full PCA + K-Means clustering pipeline on a specified group of features
    and adds the resulting cluster labels to both the training and test dataframes.
    """
    print(f"\n--- Creating cluster feature: '{cluster_col_name}' ---")
    
    # --- a. Select Features ---
    print(f"Selecting features based on keywords: {keywords}...")
    try:
        if keywords == 'all':
            group_features_in_df = [col for col in df_train.columns if col.startswith('f')]
        else:
            feature_name_col = 'masked_column'
            description_col = 'Description'
            group_mask = data_dict[description_col].str.contains('|'.join(keywords), case=False, na=False)
            group_features = data_dict[group_mask][feature_name_col].tolist()
            group_features_in_df = [col for col in group_features if col in df_train.columns]
        
        if not group_features_in_df:
            print(f"Warning: No features found for group '{cluster_col_name}'. Skipping.")
            return df_train, df_test
            
        print(f"Found {len(group_features_in_df)} features for this group.")
        X_train_cluster = df_train[group_features_in_df].copy()
        X_test_cluster = df_test[group_features_in_df].copy()
    except Exception as e:
        print(f"Error selecting features: {e}. Skipping this cluster.")
        return df_train, df_test

    # --- b. Preprocess Data ---
    print("Preprocessing data (impute, scale)...")
    for col in X_train_cluster.columns:
        X_train_cluster[col] = pd.to_numeric(X_train_cluster[col], errors='coerce')
        X_test_cluster[col] = pd.to_numeric(X_test_cluster[col], errors='coerce')

    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    
    # Fit on training data, transform both train and test
    X_train_imputed = imputer.fit_transform(X_train_cluster)
    X_test_imputed = imputer.transform(X_test_cluster)
    
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # --- c. PCA ---
    print("Applying PCA...")
    pca = PCA(n_components=0.90) # Keep components explaining 90% of variance
    pca.fit(X_train_scaled)
    
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    print(f"PCA resulted in {pca.n_components_} components.")

    # --- d. K-Means Clustering ---
    print("Applying K-Means...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    
    # Fit on training data, predict for both train and test
    df_train[cluster_col_name] = kmeans.fit_predict(X_train_pca)
    df_test[cluster_col_name] = kmeans.predict(X_test_pca)
    
    print(f"Successfully created and added '{cluster_col_name}' feature.")
    
    return df_train, df_test


# =============================================================================
# STAGE 3: DEFINE FEATURE GROUPS AND RUN THE PIPELINE
# =============================================================================
print("\n--- STAGE 3: Running the Clustering Pipeline for Each Feature Group ---")

# --- Define Keyword Groups ---
# Group 1: Customer Spending Behavior
spending_keywords = ['debit amount', 'debit transaction', 'spend in']
df_train, df_test = create_cluster_features(df_train, df_test, data_dict, spending_keywords, 'spending_cluster')

# Group 2: Customer Click/Impression History
history_keywords = ['ctr in last', 'clicks in last', 'impressions in last']
df_train, df_test = create_cluster_features(df_train, df_test, data_dict, history_keywords, 'history_cluster')

# Group 3: Customer Profile & Loyalty
profile_keywords = ['membership level', 'account age', 'interest score', 'miles', 'segments']
df_train, df_test = create_cluster_features(df_train, df_test, data_dict, profile_keywords, 'profile_cluster')

# Group 4: Customer Engagement on Amex Platforms
engagement_keywords = ['time spent', 'pages viewed', 'visited', 'logon']
df_train, df_test = create_cluster_features(df_train, df_test, data_dict, engagement_keywords, 'engagement_cluster')

# Group 5: All 'f' Features (Holistic)
df_train, df_test = create_cluster_features(df_train, df_test, data_dict, 'all', 'holistic_cluster')


# =============================================================================
# STAGE 4: SAVE THE FINAL ENRICHED DATASETS
# =============================================================================
print("\n--- STAGE 4: Saving Final Datasets with New Cluster Features ---")

# --- Verify the new columns were added ---
new_cols = ['spending_cluster', 'history_cluster', 'profile_cluster', 'engagement_cluster', 'holistic_cluster']
print("\nColumns added to the training data:")
print([col for col in new_cols if col in df_train.columns])
print("\nColumns added to the test data:")
print([col for col in new_cols if col in df_test.columns])

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v11.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v11.parquet'
    
    df_train.to_parquet(train_output_path)
    df_test.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- CLUSTER FEATURE ENGINEERING COMPLETE ---")

--- STAGE 1: Loading Data ---
Successfully loaded all required data.
Training sample shape: (770164, 383)

Last 15 columns of the training data:
['f363', 'f364', 'f365', 'f366', 'offer_total_impressions', 'offer_total_clicks', 'offer_historical_ctr', 'offer_redemption_freq', 'offer_discount_rate', 'offer_type_code', 'id8', 'offer_duration_days', 'industry_avg_spend', 'industry_total_transactions', 'industry_unique_products']
Test data shape: (369301, 382)
Successfully loaded all required data.
Training sample shape: (770164, 383)
Test data shape: (369301, 382)

--- STAGE 2: Defining the Clustering Pipeline Function ---

--- STAGE 3: Running the Clustering Pipeline for Each Feature Group ---

--- Creating cluster feature: 'spending_cluster' ---
Selecting features based on keywords: ['debit amount', 'debit transaction', 'spend in']...
Found 47 features for this group.
Preprocessing data (impute, scale)...
Applying PCA...
PCA resulted in 33 components.
Applying K-Means...
Successfully cre

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD LATEST DATASETS
# =============================================================================
print("--- STAGE 1: Loading Latest Datasets ---")
try:
    # Load your latest training and test sets with cluster features
    df_train = pd.read_parquet(f'{WORKING_DIR}train_data_v11.parquet')
    df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v11.parquet')
    
    print("Successfully loaded all required data.")
    print(f"Input training data shape: {df_train.shape}")
    print(f"Input test data shape: {df_test.shape}")

except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# STAGE 2: CREATE TIME-BASED AND INTERACTION FEATURES
# =============================================================================
print("\n--- STAGE 2: Creating Time-Based and Interaction Features ---")

def create_advanced_features(df):
    """A reusable function to create time-based and interaction features."""
    
    # --- Time-Based Features ---
    df['impression_time'] = pd.to_datetime(df['id4'], errors='coerce')
    df['hour_of_day'] = df['impression_time'].dt.hour
    df['day_of_week'] = df['impression_time'].dt.dayofweek # Monday=0, Sunday=6
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    
    # --- NEW: Cyclical Time Features ---
    # This helps the model understand the cyclical nature of time (e.g., hour 23 is close to hour 0)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day']/24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day']/24.0)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week']/7.0)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week']/7.0)

    # --- NEW: Interaction Features ---
    # We create these based on hypotheses from our EDA.
    # For example, is a certain offer type more effective on the weekend?
    if 'offer_type_code' in df.columns:
        df['offer_type_x_weekend'] = df['offer_type_code'].astype(str) + "_" + df['is_weekend'].astype(str)
    
    # How does an offer's historical popularity compare to the average for its type?
    if 'offer_historical_ctr' in df.columns and 'offer_type_code' in df.columns:
        avg_ctr_by_type = df.groupby('offer_type_code')['offer_historical_ctr'].transform('mean')
        df['offer_ctr_vs_type_avg'] = df['offer_historical_ctr'] / (avg_ctr_by_type + 1e-6) # Add epsilon to avoid division by zero

    # This column is no longer needed after feature extraction
    df = df.drop(columns=['impression_time'])
    
    return df

# --- Add features to both the training and test dataframes ---
print("Enriching training data with new features...")
df_train_final = create_advanced_features(df_train)
print("Enriching test data with new features...")
df_test_final = create_advanced_features(df_test)


# =============================================================================
# STAGE 3: SAVE THE FINAL ENRICHED DATASETS
# =============================================================================
print("\n--- STAGE 3: Saving Final Datasets with All New Features ---")

# --- Verify the new columns were added ---
new_cols = [
    'hour_of_day', 'day_of_week', 'is_weekend',
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
    'offer_type_x_weekend', 'offer_ctr_vs_type_avg'
]
print("\nColumns added to the training data:")
print([col for col in new_cols if col in df_train_final.columns])
print("\nColumns added to the test data:")
print([col for col in new_cols if col in df_test_final.columns])

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v12.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v12.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- FINAL FEATURE ENGINEERING COMPLETE ---")

--- STAGE 1: Loading Latest Datasets ---
Successfully loaded all required data.
Input training data shape: (770164, 388)
Input test data shape: (369301, 387)

--- STAGE 2: Creating Time-Based and Interaction Features ---
Enriching training data with new features...
Enriching test data with new features...

--- STAGE 3: Saving Final Datasets with All New Features ---

Columns added to the training data:
['hour_of_day', 'day_of_week', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'offer_type_x_weekend', 'offer_ctr_vs_type_avg']

Columns added to the test data:
['hour_of_day', 'day_of_week', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'offer_type_x_weekend', 'offer_ctr_vs_type_avg']

Successfully saved final training data to: /kaggle/working/train_data_v12.parquet
Successfully saved final test data to: /kaggle/working/test_data_v12.parquet

--- FINAL FEATURE ENGINEERING COMPLETE ---


In [3]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
# INPUT_DIR1 = '/kaggle/input/amex-v7/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD ALL NECESSARY DATA
# =============================================================================
print("--- STAGE 1: Loading All Necessary Data ---")
try:
    # Load your v7 datasets as the starting point
    df_train = pd.read_parquet(f'{WORKING_DIR}train_data_v12.parquet')
    df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v12.parquet')
    
    # Load supplementary data for feature creation
    offer_meta_df = pd.read_parquet(f'{INPUT_DIR}offer_metadata.parquet')
    event_dd = dd.read_parquet(f'{INPUT_DIR}add_event.parquet')
    trans_dd = dd.read_parquet(f'{INPUT_DIR}add_trans.parquet')
    
    print("Successfully loaded all required data.")
    print(f"Input training data shape: {df_train.shape}")
    print(f"Input test data shape: {df_test.shape}")

except FileNotFoundError:
    print("Error: Required data not found. Please ensure v7 files exist and supplementary data is in the input directory.")
    exit()

# =============================================================================
# STAGE 2: CREATE NEW ADVANCED FEATURES
# =============================================================================
print("\n--- STAGE 2: Creating New Advanced Features ---")

# --- Feature 1: Time-to-Click ---
print("Creating 'Time-to-Click' feature...")
event_dd['impression_time'] = dd.to_datetime(event_dd['id4'], errors='coerce')
event_dd['click_time'] = dd.to_datetime(event_dd['id7'], errors='coerce')
# event_dd['time_to_click_seconds'] = (event_dd['click_time'] - event_dd['impression_time']).dt.total_seconds()
# avg_time_to_click = event_dd.groupby('id3')['time_to_click_seconds'].mean().compute().reset_index()
print("'Time-to-Click' feature created.")

# --- Feature 2 & 3: Debit/Credit and Transaction Time Patterns ---
print("Creating industry-level transaction pattern features...")
trans_dd['f371_hour'] = dd.to_datetime(trans_dd['f371'], format='%H:%M:%S', errors='coerce').dt.hour
debit_credit_counts = trans_dd.groupby(['id8', 'f369']).size().compute().unstack(fill_value=0)
if 'C' not in debit_credit_counts.columns: debit_credit_counts['C'] = 0
if 'D' not in debit_credit_counts.columns: debit_credit_counts['D'] = 0
debit_credit_counts.columns = ['credit_trans_count', 'debit_trans_count']
avg_trans_hour = trans_dd.groupby('id8')['f371_hour'].mean().compute().reset_index(name='avg_trans_hour')

# MODIFIED: Added .reset_index() to ensure 'id8' is a column for a correct merge
industry_pattern_features = pd.merge(debit_credit_counts.reset_index(), avg_trans_hour, on='id8', how='outer')
print("Industry pattern features created.")

# --- Feature 4: Offer Body Length ---
print("Creating 'Offer Body Length' feature...")
offer_meta_df['offer_body_length'] = offer_meta_df['f378'].str.len()
offer_body_feature = offer_meta_df[['id3', 'offer_body_length']]
print("'Offer Body Length' feature created.")

# Clean up memory
del event_dd, trans_dd
gc.collect()

# =============================================================================
# STAGE 3: MERGE ALL NEW FEATURES INTO MAIN DATASETS
# =============================================================================
print("\n--- STAGE 3: Merging All New Features ---")

def enrich_dataframe(df, avg_time_to_click, offer_meta_df, industry_pattern_features, offer_body_feature):
    """A reusable function to enrich a dataframe with all new features."""
    
    # Ensure merge keys are the correct type
    df['id3'] = df['id3'].astype(str)
    avg_time_to_click['id3'] = avg_time_to_click['id3'].astype(str)
    offer_meta_df['id3'] = offer_meta_df['id3'].astype(str)
    offer_meta_df['id8'] = offer_meta_df['id8'].astype(str)
    industry_pattern_features['id8'] = industry_pattern_features['id8'].astype(str)
    offer_body_feature['id3'] = offer_body_feature['id3'].astype(str)

    # Merge Time-to-Click
    df = pd.merge(df, avg_time_to_click, on='id3', how='left')
    
    # Merge Industry Patterns (via offer_meta_df)
    offer_meta_with_patterns = pd.merge(offer_meta_df, industry_pattern_features, on='id8', how='left')
    df = pd.merge(df, offer_meta_with_patterns[['id3', 'credit_trans_count', 'debit_trans_count', 'avg_trans_hour']], on='id3', how='left')

    # Merge Offer Body Length
    df = pd.merge(df, offer_body_feature, on='id3', how='left')
    
    return df

# --- Enrich both the training and test dataframes ---
print("Enriching training data...")
df_train_final = enrich_dataframe(df_train, avg_time_to_click, offer_meta_df, industry_pattern_features, offer_body_feature)
print("Enriching test data...")
df_test_final = enrich_dataframe(df_test, avg_time_to_click, offer_meta_df, industry_pattern_features, offer_body_feature)

# =============================================================================
# STAGE 4: SAVE THE FINAL ENRICHED DATASETS
# =============================================================================
print("\n--- STAGE 4: Saving Final Datasets (v8) ---")

# --- Verify the new columns were added ---
new_cols = [
    'time_to_click_seconds', 'credit_trans_count', 'debit_trans_count',
    'avg_trans_hour', 'offer_body_length'
]
print("\nColumns added to the training data:")
print([col for col in new_cols if col in df_train_final.columns])
print("\nColumns added to the test data:")
print([col for col in new_cols if col in df_test_final.columns])

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v13.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v13.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- FINAL FEATURE ENGINEERING COMPLETE ---")

--- STAGE 1: Loading All Necessary Data ---
Successfully loaded all required data.
Input training data shape: (770164, 397)
Input test data shape: (369301, 396)

--- STAGE 2: Creating New Advanced Features ---
Creating 'Time-to-Click' feature...
'Time-to-Click' feature created.
Creating industry-level transaction pattern features...
Industry pattern features created.
Creating 'Offer Body Length' feature...
'Offer Body Length' feature created.

--- STAGE 3: Merging All New Features ---
Enriching training data...
Enriching test data...

--- STAGE 4: Saving Final Datasets (v8) ---

Columns added to the training data:
['time_to_click_seconds', 'credit_trans_count', 'debit_trans_count', 'avg_trans_hour', 'offer_body_length']

Columns added to the test data:
['time_to_click_seconds', 'credit_trans_count', 'debit_trans_count', 'avg_trans_hour', 'offer_body_length']

Successfully saved final training data to: /kaggle/working/train_data_v13.parquet
Successfully saved final test data to: /kaggle

In [4]:
import pandas as pd
import numpy as np
import warnings
import gc

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD THE LATEST TRAINING DATA AND IDENTIFY ZERO-VARIANCE COLUMNS
# =============================================================================
print("--- STAGE 1: Analyzing Training Data for Zero-Variance Features ---")
try:
    # Load your latest training set
    df_train = pd.read_parquet(f'{WORKING_DIR}train_data_v13.parquet')
    print("Successfully loaded train_data_v81.parquet.")
    print(f"Original training data shape: {df_train.shape}")
except FileNotFoundError:
    print("Error: train_data_v81.parquet not found. Please ensure the previous feature engineering script ran successfully.")
    exit()

# --- Identify columns with zero variance based on the new, stricter condition ---
# A column is now considered zero-variance only if it has one unique value AND no missing values.
cols_to_drop = []
for col in df_train.columns:
    # Skip identifier columns and the target
    if col in ['id1', 'id2', 'id3', 'id4', 'id5', 'y']:
        continue
    
    # The new condition: exactly one unique value AND no NaNs.
    if df_train[col].nunique() == 1 and df_train[col].isnull().sum() == 0:
        cols_to_drop.append(col)

print(f"\nFound {len(cols_to_drop)} columns with zero variance to remove based on the new condition.")
if cols_to_drop:
    print("Columns to be removed:", cols_to_drop)

# --- Drop the identified columns ---
df_train_v9 = df_train.drop(columns=cols_to_drop)
print(f"\nNew training data shape after removing zero-variance columns: {df_train_v9.shape}")


# =============================================================================
# STAGE 2: APPLY THE SAME CLEANING TO THE TEST DATA
# =============================================================================
print("\n--- STAGE 2: Applying the Same Cleaning to the Test Data ---")
try:
    # Load your latest test set
    df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v13.parquet')
    print("Successfully loaded test_data_v81.parquet.")
    print(f"Original test data shape: {df_test.shape}")
except FileNotFoundError:
    print("Error: test_data_v81.parquet not found.")
    exit()

# --- Drop the same columns from the test set for consistency ---
# We need to find which of the columns to drop actually exist in the test set
cols_to_drop_in_test = [col for col in cols_to_drop if col in df_test.columns]
df_test_v9 = df_test.drop(columns=cols_to_drop_in_test)
print(f"\nRemoved {len(cols_to_drop_in_test)} columns from the test set.")
print(f"New test data shape: {df_test_v9.shape}")


# =============================================================================
# STAGE 3: SAVE THE FINAL (v9) DATASETS
# =============================================================================
print("\n--- STAGE 3: Saving Final v9 Datasets ---")
try:
    train_output_path = f'{WORKING_DIR}train_data_v14.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v14.parquet'
    
    df_train_v9.to_parquet(train_output_path)
    df_test_v9.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- FINAL CLEANING COMPLETE ---")

--- STAGE 1: Analyzing Training Data for Zero-Variance Features ---
Successfully loaded train_data_v81.parquet.
Original training data shape: (770164, 402)

Found 1 columns with zero variance to remove based on the new condition.
Columns to be removed: ['is_weekend']

New training data shape after removing zero-variance columns: (770164, 401)

--- STAGE 2: Applying the Same Cleaning to the Test Data ---
Successfully loaded test_data_v81.parquet.
Original test data shape: (369301, 401)

Removed 1 columns from the test set.
New test data shape: (369301, 400)

--- STAGE 3: Saving Final v9 Datasets ---

Successfully saved final training data to: /kaggle/working/train_data_v14.parquet
Successfully saved final test data to: /kaggle/working/test_data_v14.parquet

--- FINAL CLEANING COMPLETE ---


In [5]:
import pandas as pd
import numpy as np
import warnings
import gc
import os

# Modeling libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA
# =============================================================================
print("--- STAGE 1: Loading Data ---")
try:
    # Load your latest training and test sets
    df_train = pd.read_parquet(f'{WORKING_DIR}train_data_v14.parquet')
    df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v14.parquet')
    
    # Load supplementary data for feature creation
    offer_meta_df = pd.read_parquet(f'{INPUT_DIR}offer_metadata.parquet')
    event_df = pd.read_parquet(f'{INPUT_DIR}add_event.parquet')
    trans_df = pd.read_parquet(f'{INPUT_DIR}add_trans.parquet')
    
    print("Successfully loaded all required data.")
except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# STAGE 2: OFFER CLUSTERING
# =============================================================================
print("\n--- STAGE 2: Creating Offer Clusters ---")

# --- a. Create features for each offer ---
print("Aggregating historical features for each offer...")
event_df['impression_time'] = pd.to_datetime(event_df['id4'], errors='coerce')
event_df['hour_of_day'] = event_df['impression_time'].dt.hour
event_df['is_weekend'] = (event_df['impression_time'].dt.dayofweek >= 5).astype(int)
event_df['clicked'] = event_df['id7'].notna().astype(int)

# Aggregate stats for each offer
offer_features = event_df.groupby('id3').agg(
    offer_historical_ctr=('clicked', 'mean'),
    offer_total_impressions=('id4', 'count'),
    avg_impression_hour=('hour_of_day', 'mean'),
    weekend_impression_ratio=('is_weekend', 'mean')
).reset_index()

# Merge with static offer metadata
offer_meta_df['id3'] = offer_meta_df['id3'].astype(str)
offer_features['id3'] = offer_features['id3'].astype(str)
offer_features = pd.merge(offer_features, offer_meta_df[['id3', 'f375', 'f376']], on='id3', how='left')
offer_features = offer_features.rename(columns={'f375': 'offer_redemption_freq', 'f376': 'offer_discount_rate'})

# --- b. Preprocess and Cluster Offers ---
print("Preprocessing and clustering offers...")
offer_feature_cols = [col for col in offer_features.columns if col != 'id3']
X_offer = offer_features[offer_feature_cols].copy()

imputer = SimpleImputer(strategy='median')
X_offer_imputed = imputer.fit_transform(X_offer)
scaler = StandardScaler()
X_offer_scaled = scaler.fit_transform(X_offer_imputed)

pca = PCA(n_components=0.95) # Explain 95% of variance
X_offer_pca = pca.fit_transform(X_offer_scaled)
print(f"Offer PCA resulted in {pca.n_components_} components.")

kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
offer_features['offer_cluster'] = kmeans.fit_predict(X_offer_pca)
print("Offer clusters created successfully.")


# =============================================================================
# STAGE 3: INDUSTRY CLUSTERING
# =============================================================================
print("\n--- STAGE 3: Creating Industry Clusters ---")

# --- a. Create features for each industry ---
print("Aggregating historical features for each industry...")
trans_df['f367'] = pd.to_numeric(trans_df['f367'], errors='coerce')
industry_features = trans_df.groupby('id8').agg(
    industry_avg_spend=('f367', 'mean'),
    industry_std_spend=('f367', 'std'),
    industry_total_transactions=('f367', 'count'),
    industry_unique_products=('f368', 'nunique')
).reset_index()

# --- b. Preprocess and Cluster Industries ---
print("Preprocessing and clustering industries...")
industry_feature_cols = [col for col in industry_features.columns if col != 'id8']
X_industry = industry_features[industry_feature_cols].copy()

imputer = SimpleImputer(strategy='median')
X_industry_imputed = imputer.fit_transform(X_industry)
scaler = StandardScaler()
X_industry_scaled = scaler.fit_transform(X_industry_imputed)

pca = PCA(n_components=0.95)
X_industry_pca = pca.fit_transform(X_industry_scaled)
print(f"Industry PCA resulted in {pca.n_components_} components.")

kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
industry_features['industry_cluster'] = kmeans.fit_predict(X_industry_pca)
print("Industry clusters created successfully.")


# =============================================================================
# STAGE 4: MERGE NEW FEATURES AND SAVE
# =============================================================================
print("\n--- STAGE 4: Merging New Cluster Features and Saving ---")

# --- Link industry clusters to offers ---
offer_meta_df['id8'] = offer_meta_df['id8'].astype(str)
industry_features['id8'] = industry_features['id8'].astype(str)
offer_meta_with_clusters = pd.merge(offer_meta_df, industry_features[['id8', 'industry_cluster']], on='id8', how='left')

# --- Merge all new features into train and test sets ---
def enrich_with_entity_clusters(df, offer_features, offer_meta_with_clusters):
    df['id3'] = df['id3'].astype(str)
    # Merge offer clusters
    df = pd.merge(df, offer_features[['id3', 'offer_cluster']], on='id3', how='left')
    # Merge industry clusters (via offer metadata)
    df = pd.merge(df, offer_meta_with_clusters[['id3', 'industry_cluster']], on='id3', how='left')
    return df

print("Enriching training data...")
df_train_final = enrich_with_entity_clusters(df_train, offer_features, offer_meta_with_clusters)
print("Enriching test data...")
df_test_final = enrich_with_entity_clusters(df_test, offer_features, offer_meta_with_clusters)

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v15.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v15.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- ENTITY CLUSTER FEATURE ENGINEERING COMPLETE ---")

--- STAGE 1: Loading Data ---
Successfully loaded all required data.

--- STAGE 2: Creating Offer Clusters ---
Aggregating historical features for each offer...
Preprocessing and clustering offers...
Offer PCA resulted in 6 components.
Offer clusters created successfully.

--- STAGE 3: Creating Industry Clusters ---
Aggregating historical features for each industry...
Preprocessing and clustering industries...
Industry PCA resulted in 4 components.
Industry clusters created successfully.

--- STAGE 4: Merging New Cluster Features and Saving ---
Enriching training data...
Enriching test data...

Successfully saved final training data to: /kaggle/working/train_data_v15.parquet
Successfully saved final test data to: /kaggle/working/test_data_v15.parquet

--- ENTITY CLUSTER FEATURE ENGINEERING COMPLETE ---


In [6]:
df_train_final.shape

(770164, 403)

In [7]:
df_train_final.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,day_cos,offer_type_x_weekend,offer_ctr_vs_type_avg,time_to_click_seconds,credit_trans_count,debit_trans_count,avg_trans_hour,offer_body_length,offer_cluster,industry_cluster
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,-0.900969,1_0,2.115142,40.884705,34916.0,2556.0,8.4913,31.0,4,1.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,-0.222521,1_0,1.642199,55.625598,24392.0,205.0,7.362727,37.0,4,1.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,-0.222521,1_0,1.46548,60.164669,19232.0,177.0,10.023546,31.0,4,1.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,-0.900969,1_0,1.512121,65.115491,49247.0,7817.0,11.758745,31.0,4,1.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,-0.900969,1_0,1.502907,62.968459,13298.0,707.0,9.883684,31.0,4,1.0


In [9]:
import gc

# List of common large objects we created in previous steps
# Add any other large variables you might have created
variables_to_delete = [
    'df', 'df_test', 'X', 'y', 'groups', 'df_train', 'df_test_refined',
    'df_sample', 'X_cluster', 'X_scaled', 'X_pca', 'df_cluster', 'df_train_v9', 'df_test_v9',
    'rfm_df', 'trans_dd', 'event_dd', 'offer_meta_df', 'data_dict', 'df_train_final', 'df_test_final'
]

print("--- Clearing RAM before training ---")
for var_name in variables_to_delete:
    if var_name in globals():
        print(f"Deleting: {var_name}")
        del globals()[var_name]

# Call the garbage collector to release the memory
gc.collect()
print("RAM cleared successfully.")

--- Clearing RAM before training ---
Deleting: df_train_v9
Deleting: df_test_v9
Deleting: df_train_final
Deleting: df_test_final
RAM cleared successfully.


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import warnings
import gc
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'
# =============================================================================
# STAGE 1: DATA LOADING AND PREPARATION FOR XGBRANKER
# =============================================================================
print("--- STAGE 1: Loading and Preparing Data for XGBRanker ---")
try:
    # Load your final, pre-cleaned training and test data
    df = pd.read_parquet(f'/kaggle/input/amex-v15/train_data_v15.parquet')
    df_test = pd.read_parquet(f'/kaggle/input/amex-v15/test_data_v15.parquet')
    print("Successfully loaded final training and test data.")
except FileNotFoundError:
    print("Error: v92 data not found. Please run the final cleaning script first.")
    exit()

--- STAGE 1: Loading and Preparing Data for XGBRanker ---
Successfully loaded final training and test data.


In [2]:
df = df.drop(columns=['time_to_click_seconds'], errors='ignore')
df_test = df_test.drop(columns=['time_to_click_seconds'], errors='ignore')

In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import warnings
import gc
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 0: DEFINE THE MAP@7 EVALUATION METRIC
# =============================================================================
def map_at_k(y_true, y_pred, group_ids, k=7):
    """
    Calculates the Mean Average Precision at k.
    
    Args:
        y_true (array-like): The true relevance labels (0 or 1).
        y_pred (array-like): The predicted ranking scores.
        group_ids (array-like): The group/query IDs for each sample.
        k (int): The cutoff for the precision calculation.
        
    Returns:
        float: The Mean Average Precision @ k score.
    """
    df = pd.DataFrame({'group': group_ids, 'y_true': y_true, 'y_pred': y_pred})
    
    average_precisions = []
    for group in df['group'].unique():
        group_df = df[df['group'] == group].sort_values('y_pred', ascending=False).reset_index(drop=True)
        
        # Get the top k predictions
        group_df = group_df.head(k)
        
        if group_df['y_true'].sum() == 0:
            average_precisions.append(0)
            continue
            
        relevant_hits = 0
        precision_at_i = []
        for i, row in group_df.iterrows():
            if row['y_true'] == 1:
                relevant_hits += 1
                precision_at_i.append(relevant_hits / (i + 1))
        
        if not precision_at_i:
            average_precisions.append(0)
        else:
            average_precisions.append(np.mean(precision_at_i))
            
    return np.mean(average_precisions)


# # =============================================================================
# # STAGE 1: DATA LOADING AND PREPARATION FOR XGBRANKER
# # =============================================================================
# print("--- STAGE 1: Loading and Preparing Data for XGBRanker ---")
# try:
#     # Load your final, pre-cleaned training and test data
#     df = pd.read_parquet(f'{WORKING_DIR}train_data_v92.parquet')
#     df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v92.parquet')
#     print("Successfully loaded final training (v92) and test (v92) data.")
# except FileNotFoundError:
#     print("Error: v92 data not found. Please run the final cleaning script first.")
#     exit()

# --- Prepare Feature and Target Names ---
target_col = 'y'
# All columns are features except for identifiers and the target
feature_cols = [col for col in df.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]
# Identify all categorical features to be one-hot encoded
categorical_features = [col for col in df.columns if 'cluster' in col or col == 'offer_type_code' or col == 'offer_type_x_weekend']

# --- Preprocessing ---
print("Preprocessing data for training...")
# Handle one-hot encoding for categorical features
df = pd.get_dummies(df, columns=categorical_features, dummy_na=True)
df_test = pd.get_dummies(df_test, columns=categorical_features, dummy_na=True)

# Update feature list after one-hot encoding
for cat_col in categorical_features:
    if cat_col in feature_cols:
        feature_cols.remove(cat_col)
dummy_cols = [col for col in df.columns if any(cat_col in col for cat_col in categorical_features)]
feature_cols.extend(dummy_cols)
# Remove any duplicates that might have been added
feature_cols = list(dict.fromkeys(feature_cols)) 

# Convert all feature columns to numeric before imputation
print("Converting all feature columns to numeric types...")
for col in feature_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if col in df_test.columns:
        df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# Final imputation for training data
df[feature_cols] = df[feature_cols].fillna(-999)

# --- Create Groups for XGBRanker ---
# A "query" or "group" is a single ranking task (all offers for one customer on one day)
df['group_id'] = df['id2'].astype(str) + '_' + df['id5'].astype(str)
df = df.sort_values('group_id').reset_index(drop=True)
group_sizes = df.groupby('group_id')['id1'].count().to_numpy()

# Define features (X) and target (y)
X = df[feature_cols]
y = df[target_col].astype(int)
print(f"Prepared training data with {len(feature_cols)} features and {len(group_sizes)} groups.")


# =============================================================================
# STAGE 2: TRAINING FINAL ENSEMBLE WITH XGBRANKER
# =============================================================================
print("\n--- STAGE 2: Training Final Ensemble of XGBRanker Models ---")

# Define XGBRanker parameters. Note the 'rank:ndcg' objective.
final_params = {
    'objective': 'rank:ndcg', # Learning-to-rank objective
    'eval_metric': 'ndcg@7',  # Evaluate using a metric similar to the competition
    'use_label_encoder': False, 'seed': 42,
    'tree_method': 'gpu_hist', 'gpu_id': 0,
    'n_estimators': 1000, 
    'learning_rate': 0.05, 
    'max_depth': 7, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8
}

N_SPLITS = 5 # Using 5 folds for a balance of stability and speed
gkf_train = GroupKFold(n_splits=N_SPLITS)
oof_predictions = np.zeros(len(df))
oof_map_scores = []

print(f"Starting final training with {N_SPLITS}-Fold GroupKFold on GPU...")
# For XGBRanker, we must use the customer ID for the GroupKFold split
groups_for_split = df['id2']

# MODIFIED: Corrected the variable name from 'for_split' to 'groups_for_split'
for fold, (train_idx, val_idx) in enumerate(gkf_train.split(X, y, groups=groups_for_split)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]
    
    # Get the group sizes for the training and validation sets
    train_groups = df.iloc[train_idx].groupby('group_id')['id1'].count().to_numpy()
    val_groups = df.iloc[val_idx].groupby('group_id')['id1'].count().to_numpy()
    
    model = xgb.XGBRanker(**final_params)
    model.fit(X_train_fold, y_train_fold, group=train_groups,
              eval_set=[(X_val_fold, y_val_fold)], eval_group=[val_groups],
              early_stopping_rounds=50, verbose=False)
    
    # XGBRanker outputs scores, not probabilities
    val_preds = model.predict(X_val_fold)
    oof_predictions[val_idx] = val_preds

    # --- Calculate and store MAP@7 for this fold ---
    fold_map_score = map_at_k(y_val_fold, val_preds, df.iloc[val_idx]['group_id'], k=7)
    oof_map_scores.append(fold_map_score)
    print(f"Fold {fold+1} MAP@7 Score: {fold_map_score:.5f}")

    model_path = f'{WORKING_DIR}final_ranker_model_fold_{fold+1}.json'
    model.save_model(model_path)
    print(f"Model for fold {fold+1} saved to {model_path}")
    del X_train_fold, y_train_fold, X_val_fold, y_val_fold, model
    gc.collect()

# --- Evaluate Overall Performance ---
print(f"\n--- Overall Cross-Validation Results ---")
print(f"Mean Out-of-Fold (OOF) MAP@7 Score: {np.mean(oof_map_scores):.5f}")
print("This score is the most reliable estimate of your final leaderboard performance.")


# =============================================================================
# STAGE 3: PREDICTION AND SUBMISSION FILE CREATION
# =============================================================================
print("\n--- STAGE 3: Generating Final Predictions ---")
# Prepare test data
submission_ids = df_test[['id1', 'id2', 'id3', 'id5']].copy()
# Align test columns with the training columns
X_test = df_test.reindex(columns=X.columns, fill_value=0)
# Final imputation for test data
X_test = X_test.fillna(-999)

# Load models and predict
test_predictions = np.zeros(len(X_test))
for fold in range(1, N_SPLITS + 1):
    print(f"Predicting with Fold {fold}/{N_SPLITS}...")
    model_path = f'{WORKING_DIR}final_ranker_model_fold_{fold}.json'
    model = xgb.XGBRanker()
    model.load_model(model_path)
    # Predict outputs ranking scores
    test_predictions += model.predict(X_test) / N_SPLITS

# Create submission file
print("\nCreating submission file...")
submission_df = submission_ids.copy()
submission_df['id5'] = pd.to_datetime(submission_df['id5'], errors='coerce').dt.strftime('%m-%d-%Y')
cleaned_predictions = np.nan_to_num(test_predictions, nan=-999) # Use a low score for NaNs
submission_df['pred'] = cleaned_predictions

submission_path = f'{WORKING_DIR}r2_submission_final_ranker.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission file successfully saved to: {submission_path}")
print("\nFirst 5 rows of the submission file:")
print(submission_df.head())



--- STAGE 2: Training Final Ensemble of XGBRanker Models ---
Starting final training with 5-Fold GroupKFold on GPU...
--- Fold 1/5 ---
Fold 1 MAP@7 Score: 0.06066
Model for fold 1 saved to /kaggle/working/final_ranker_model_fold_1.json
--- Fold 2/5 ---
Fold 2 MAP@7 Score: 0.06649
Model for fold 2 saved to /kaggle/working/final_ranker_model_fold_2.json
--- Fold 3/5 ---
Fold 3 MAP@7 Score: 0.06375
Model for fold 3 saved to /kaggle/working/final_ranker_model_fold_3.json
--- Fold 4/5 ---
Fold 4 MAP@7 Score: 0.06472
Model for fold 4 saved to /kaggle/working/final_ranker_model_fold_4.json
--- Fold 5/5 ---
Fold 5 MAP@7 Score: 0.06350
Model for fold 5 saved to /kaggle/working/final_ranker_model_fold_5.json

--- Overall Cross-Validation Results ---
Mean Out-of-Fold (OOF) MAP@7 Score: 0.06382
This score is the most reliable estimate of your final leaderboard performance.

--- STAGE 3: Generating Final Predictions ---
Predicting with Fold 1/5...
Predicting with Fold 2/5...
Predicting with Fold 3

In [None]:
# =============================================================================
# SECTION 0: SETUP AND IMPORTS
# =============================================================================
print("--- Section 0: Setting up the environment ---")
# Install the necessary libraries for graph and sequential embeddings
!pip install -q node2vec gensim

import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os
import networkx as nx
from node2vec import Node2Vec
from gensim.models import Word2Vec

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA
# =============================================================================
print("\n--- Section 1: Loading Data ---")
try:
    # Load your latest training and test sets
    df_train = pd.read_parquet(f'/kaggle/input/amex-v15/train_data_v15.parquet')
    df_test = pd.read_parquet(f'/kaggle/input/amex-v15/test_data_v15.parquet')
    
    # Use Dask for the large event log
    event_dd = dd.read_parquet(f'{INPUT_DIR}add_event.parquet', columns=['id2', 'id3', 'id4'])
    
    print("Successfully loaded all required data.")
except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# TECHNIQUE 1: GRAPH EMBEDDINGS (NODEVEC)
# =============================================================================
print("\n--- Technique 1: Creating Graph Embedding Features ---")

# --- Step 1a: Build the Interaction Graph ---
print("Step 1a: Building the customer-offer interaction graph...")
# Compute the edge list (customer, offer pairs) from the Dask DataFrame
# This is the most memory-intensive part of this stage
edge_list = event_dd[['id2', 'id3']].compute()
# Create a graph from the edge list
G = nx.from_pandas_edgelist(edge_list, 'id2', 'id3')
print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# --- Step 1b: Train the Node2Vec Model ---
print("Step 1b: Training the Node2Vec model (this will take a long time)...")
# Initialize Node2Vec. Parameters can be tuned.
node2vec = Node2Vec(G, dimensions=32, walk_length=20, num_walks=10, workers=4, quiet=True)
# Train the model
model_n2v = node2vec.fit(window=5, min_count=1, batch_words=4)
print("Node2Vec model trained successfully.")

# --- Step 1c: Extract Offer Embeddings ---
print("Step 1c: Extracting offer embeddings...")
offer_ids = df_train['id3'].unique().tolist() + df_test['id3'].unique().tolist()
offer_ids = list(set(offer_ids)) # Get unique offer IDs across train and test

graph_embeddings = []
for offer_id in offer_ids:
    try:
        vector = model_n2v.wv[offer_id]
        graph_embeddings.append([offer_id] + vector.tolist())
    except KeyError:
        # This offer was not in the historical graph
        continue

graph_embedding_features = pd.DataFrame(graph_embeddings, columns=['id3'] + [f'graph_emb_{i}' for i in range(32)])
print("Graph embedding features created successfully.")

# Clean up memory
del G, edge_list, model_n2v, node2vec
gc.collect()

# =============================================================================
# TECHNIQUE 2: SEQUENTIAL EMBEDDINGS (WORD2VEC)
# =============================================================================
print("\n--- Technique 2: Creating Sequential Embedding Features ---")

# --- Step 2a: Create Customer Sessions ---
print("Step 2a: Creating customer sessions...")
event_dd['impression_time'] = dd.to_datetime(event_dd['id4'], errors='coerce')

# MODIFIED: Use a robust, Dask-native method to create time-based sessions
# Sort by customer and time
event_dd = event_dd.set_index('impression_time').sort_index()
event_dd = event_dd.reset_index()
event_dd = event_dd.set_index('id2').sort_index()

# Calculate time difference between consecutive events for each customer
event_dd['time_diff'] = event_dd.groupby('id2')['impression_time'].diff().dt.total_seconds()

# A new session starts if the time difference is > 30 minutes (1800 seconds)
event_dd['new_session'] = (event_dd['time_diff'] > 1800).fillna(True)
event_dd['session_id'] = event_dd.groupby('id2')['new_session'].cumsum()

# Create the list of sessions (sequences of offers)
sessions = event_dd.groupby(['id2', 'session_id'])['id3'].apply(list, meta=('id3', 'object')).compute().tolist()
print(f"Created {len(sessions)} customer sessions.")

# --- Step 2b: Train the Word2Vec Model ---
print("Step 2b: Training the Word2Vec model...")
# Train the model on the sessions. Treats offers as "words" and sessions as "sentences".
model_w2v = Word2Vec(sentences=sessions, vector_size=32, window=5, min_count=1, workers=4)
print("Word2Vec model trained successfully.")

# --- Step 2c: Extract Offer Embeddings ---
print("Step 2c: Extracting sequential embeddings...")
sequential_embeddings = []
for offer_id in offer_ids:
    try:
        vector = model_w2v.wv[offer_id]
        sequential_embeddings.append([offer_id] + vector.tolist())
    except KeyError:
        continue

sequential_embedding_features = pd.DataFrame(sequential_embeddings, columns=['id3'] + [f'seq_emb_{i}' for i in range(32)])
print("Sequential embedding features created successfully.")

# Clean up memory
del sessions, model_w2v
gc.collect()

# =============================================================================
# STAGE 3: MERGE NEW FEATURES AND SAVE
# =============================================================================
print("\n--- STAGE 3: Merging New Features and Saving Final Datasets (v11) ---")

def enrich_dataframe(df, graph_embedding_features, sequential_embedding_features):
    """A reusable function to enrich a dataframe with new embedding features."""
    df['id3'] = df['id3'].astype(str)
    graph_embedding_features['id3'] = graph_embedding_features['id3'].astype(str)
    sequential_embedding_features['id3'] = sequential_embedding_features['id3'].astype(str)
    
    # Merge graph embedding features
    df = pd.merge(df, graph_embedding_features, on='id3', how='left')
    # Merge sequential embedding features
    df = pd.merge(df, sequential_embedding_features, on='id3', how='left')
    
    return df

# --- Enrich both the training and test dataframes ---
print("Enriching training data...")
df_train_final = enrich_dataframe(df_train, graph_embedding_features, sequential_embedding_features)
print("Enriching test data...")
df_test_final = enrich_dataframe(df_test, graph_embedding_features, sequential_embedding_features)

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v16.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v16.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- ADVANCED EMBEDDING FEATURE ENGINEERING COMPLETE ---")

--- Section 0: Setting up the environment ---
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
dopamine-rl 4.1.2 requires gymnasium>=1.0.0, but you have gymnasium 0.29.0 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.2.2 which is incompatible.
plotnine 0.14.5 requires matplotlib>=3.8.0, but you have matplotlib 3.7.2 which is incompatible.


In [1]:
# =============================================================================
# SECTION 0: SETUP AND IMPORTS
# =============================================================================
print("--- Section 0: Setting up the environment ---")
# Install the necessary library for text embeddings
!pip install -q sentence-transformers

import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os

from sentence_transformers import SentenceTransformer

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA
# =============================================================================
print("\n--- Section 1: Loading Data ---")
try:
    # Load your latest training and test sets
    df_train = pd.read_parquet(f'/kaggle/input/amex-v15/train_data_v15.parquet')
    df_test = pd.read_parquet(f'/kaggle/input/amex-v15/test_data_v15.parquet')
    
    # Load supplementary data for feature creation
    offer_meta_df = pd.read_parquet(f'{INPUT_DIR}offer_metadata.parquet')
    event_dd = dd.read_parquet(f'{INPUT_DIR}add_event.parquet')
    
    print("Successfully loaded all required data.")
except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# TECHNIQUE 1: CO-CLICK & SEQUENTIAL FEATURES (MEMORY-EFFICIENT)
# =============================================================================
print("\n--- Technique 1: Creating Co-Click Features ---")

# --- Step 1a: Create Customer Sessions on a Sample ---
print("Step 1a: Creating customer sessions on a sample of the data...")
# Use a large sample of the event data to make the process much faster
event_dd_sample = event_dd.sample(frac=0.15, random_state=42)
event_dd_sample['impression_time'] = dd.to_datetime(event_dd_sample['id4'], errors='coerce')

# Use a robust, Dask-native method to create time-based sessions
print("Sorting events by customer and time...")
event_dd_sample = event_dd_sample.sort_values(['id2', 'impression_time'])

def calculate_time_diff(df):
    df['time_diff'] = df.groupby('id2')['impression_time'].diff().dt.total_seconds()
    return df

meta = event_dd_sample._meta.copy()
meta['time_diff'] = pd.Series(dtype='float64')
event_dd_sample = event_dd_sample.map_partitions(calculate_time_diff, meta=meta)

event_dd_sample['new_session'] = (event_dd_sample['time_diff'] > 1800).fillna(True)
event_dd_sample['session_id'] = event_dd_sample.groupby('id2')['new_session'].cumsum()
print("Customer sessions created.")


# --- Step 1b: Calculate Co-occurrence (Fully with Dask) ---
print("Step 1b: Calculating offer co-occurrence within sessions using Dask...")
# MODIFIED: Perform the entire co-occurrence calculation within Dask's lazy framework
session_offers_dd = event_dd_sample[['session_id', 'id2', 'id3']]
# Perform a Dask self-merge
merged_sessions_dd = dd.merge(session_offers_dd, session_offers_dd, on=['session_id', 'id2'])
# Filter out self-pairs
co_occurrence_dd = merged_sessions_dd[merged_sessions_dd['id3_x'] != merged_sessions_dd['id3_y']]
# Count how many times each pair appears and compute the final small result
co_occurrence_counts = co_occurrence_dd.groupby(['id3_x', 'id3_y']).size().compute().reset_index(name='count')
print("Co-occurrence calculated successfully.")


--- Section 0: Setting up the environment ---


2025-07-20 14:18:58.503146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753021138.720674     210 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753021138.784423     210 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



--- Section 1: Loading Data ---
Successfully loaded all required data.

--- Technique 1: Creating Co-Click Features ---
Step 1a: Creating customer sessions on a sample of the data...
Sorting events by customer and time...
Customer sessions created.
Step 1b: Calculating offer co-occurrence within sessions using Dask...
Co-occurrence calculated successfully.
Step 1c: Creating final co-click features...


AttributeError: 'Series' object has no attribute 'notna'

In [2]:
# --- Step 1c: Create Co-Click Features ---
print("Step 1c: Creating final co-click features...")
# MODIFIED: Changed .notna() to the Dask-compatible ~.isnull() to fix the AttributeError
event_dd['clicked'] = (~event_dd['id7'].isnull()).astype(int)
offer_ctr = event_dd.groupby('id3')['clicked'].mean().compute().to_dict()

# For each offer, find its most frequent partner
co_occurrence_counts = co_occurrence_counts.sort_values('count', ascending=False)
most_frequent_partner = co_occurrence_counts.drop_duplicates(subset=['id3_x'], keep='first')

# Get the CTR of that most frequent partner
most_frequent_partner['partner_ctr'] = most_frequent_partner['id3_y'].map(offer_ctr)
co_click_features = most_frequent_partner[['id3_x', 'partner_ctr']].rename(columns={'id3_x': 'id3', 'partner_ctr': 'most_frequent_partner_ctr'})
print("Co-click features created successfully.")

# Clean up memory
del event_dd, event_dd_sample, co_occurrence_counts
gc.collect()

# =============================================================================
# TECHNIQUE 2: TEXT EMBEDDINGS
# =============================================================================
print("\n--- Technique 2: Creating Text Embedding Features ---")

# --- Step 2a: Prepare the Text Corpus ---
print("Step 2a: Preparing text corpus from offer metadata...")
offer_meta_df['id3'] = offer_meta_df['id3'].astype(str)
# Combine text fields, handling missing values
offer_meta_df['full_text'] = offer_meta_df['id9'].fillna('') + ' ' + offer_meta_df['f378'].fillna('')
offer_texts = offer_meta_df[['id3', 'full_text']].copy()
print("Text corpus prepared.")

# --- Step 2b: Generate Embeddings ---
print("Step 2b: Generating text embeddings (this may take a moment)...")
# Use a lightweight but powerful pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Generate the embeddings
embeddings = model.encode(offer_texts['full_text'].tolist(), show_progress_bar=True)
print(f"Embeddings generated with shape: {embeddings.shape}")

# Create a DataFrame from the embeddings
embedding_df = pd.DataFrame(embeddings, columns=[f'text_emb_{i}' for i in range(embeddings.shape[1])])
text_embedding_features = pd.concat([offer_texts[['id3']], embedding_df], axis=1)
print("Text embedding features created successfully.")


# =============================================================================
# STAGE 3: MERGE NEW FEATURES AND SAVE
# =============================================================================
print("\n--- STAGE 3: Merging New Features and Saving Final Datasets (v10) ---")

def enrich_dataframe(df, co_click_features, text_embedding_features):
    """A reusable function to enrich a dataframe with new features."""
    df['id3'] = df['id3'].astype(str)
    co_click_features['id3'] = co_click_features['id3'].astype(str)
    text_embedding_features['id3'] = text_embedding_features['id3'].astype(str)
    
    # Merge co-click features
    df = pd.merge(df, co_click_features, on='id3', how='left')
    # Merge text embedding features
    df = pd.merge(df, text_embedding_features, on='id3', how='left')
    
    return df

# --- Enrich both the training and test dataframes ---
print("Enriching training data...")
df_train_final = enrich_dataframe(df_train, co_click_features, text_embedding_features)
print("Enriching test data...")
df_test_final = enrich_dataframe(df_test, co_click_features, text_embedding_features)

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v16.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v16.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- ADVANCED FEATURE ENGINEERING COMPLETE ---")

Step 1c: Creating final co-click features...
Co-click features created successfully.

--- Technique 2: Creating Text Embedding Features ---
Step 2a: Preparing text corpus from offer metadata...
Text corpus prepared.
Step 2b: Generating text embeddings (this may take a moment)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

Embeddings generated with shape: (4164, 384)
Text embedding features created successfully.

--- STAGE 3: Merging New Features and Saving Final Datasets (v10) ---
Enriching training data...
Enriching test data...

Successfully saved final training data to: /kaggle/working/train_data_v16.parquet
Successfully saved final test data to: /kaggle/working/test_data_v16.parquet

--- ADVANCED FEATURE ENGINEERING COMPLETE ---


In [4]:
df_train_final.shape

(770164, 788)

In [2]:
import gc

# List of large DataFrames and variables created in the previous script
variables_to_delete = [
    'df_train', 'df_test', 'offer_meta_df', 'event_dd', 
    'event_dd_sample', 'session_offers', 'merged_sessions', 
    'co_occurrence', 'co_occurrence_counts', 'event_df_pd', 
    'offer_ctr', 'most_frequent_partner', 'co_click_features',
    'offer_texts', 'embeddings', 'embedding_df', 
    'text_embedding_features', 'df_train_final', 'df_test_final'
]

print("--- Clearing RAM ---")
for var_name in variables_to_delete:
    # Check if the variable exists in the notebook's memory before trying to delete it
    if var_name in globals():
        print(f"Deleting: {var_name}")
        del globals()[var_name]

# Call the garbage collector to release the memory
gc.collect()
print("RAM cleared successfully.")

--- Clearing RAM ---
Deleting: df_train
Deleting: df_test
Deleting: offer_meta_df
Deleting: offer_ctr
Deleting: most_frequent_partner
Deleting: co_click_features
Deleting: offer_texts
Deleting: embeddings
Deleting: df_train_final
Deleting: df_test_final
RAM cleared successfully.


In [1]:
# =============================================================================
# SECTION 0: SETUP AND IMPORTS
# =============================================================================
print("--- Section 0: Setting up the environment ---")
# Install the necessary library for text embeddings
!pip install -q sentence-transformers

import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA
# =============================================================================
print("\n--- Section 1: Loading Data ---")
try:
    # Load your latest training and test sets
    df_train = pd.read_parquet(f'/kaggle/input/amex-v15/train_data_v15.parquet')
    df_test = pd.read_parquet(f'/kaggle/input/amex-v15/test_data_v15.parquet')
    
    # Load supplementary data for feature creation
    offer_meta_df = pd.read_parquet(f'{INPUT_DIR}offer_metadata.parquet')
    event_dd = dd.read_parquet(f'{INPUT_DIR}add_event.parquet')
    
    print("Successfully loaded all required data.")
except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# TECHNIQUE 1: ADVANCED CO-CLICK FEATURES
# =============================================================================
print("\n--- Technique 1: Creating Advanced Co-Click Features ---")

# --- Step 1a: Create Customer Sessions on a Sample ---
print("Step 1a: Creating customer sessions on a sample of the data...")
event_dd_sample = event_dd.sample(frac=0.2, random_state=42)
event_dd_sample['impression_time'] = dd.to_datetime(event_dd_sample['id4'], errors='coerce')
event_dd_sample = event_dd_sample.sort_values(['id2', 'impression_time'])

def calculate_time_diff(df):
    df['time_diff'] = df.groupby('id2')['impression_time'].diff().dt.total_seconds()
    return df

meta = event_dd_sample._meta.copy()
meta['time_diff'] = pd.Series(dtype='float64')
event_dd_sample = event_dd_sample.map_partitions(calculate_time_diff, meta=meta)

event_dd_sample['new_session'] = (event_dd_sample['time_diff'] > 1800).fillna(True)
event_dd_sample['session_id'] = event_dd_sample.groupby('id2')['new_session'].cumsum()
print("Customer sessions created.")


# --- Step 1b: Calculate Co-occurrence ---
print("Step 1b: Calculating offer co-occurrence within sessions...")
session_offers_dd = event_dd_sample[['session_id', 'id2', 'id3']]
merged_sessions_dd = dd.merge(session_offers_dd, session_offers_dd, on=['session_id', 'id2'])
co_occurrence_dd = merged_sessions_dd[merged_sessions_dd['id3_x'] != merged_sessions_dd['id3_y']]
co_occurrence_counts = co_occurrence_dd.groupby(['id3_x', 'id3_y']).size().compute().reset_index(name='count')
print("Co-occurrence calculated successfully.")


# --- Step 1c: Create Advanced Co-Click Features ---
print("Step 1c: Creating final co-click features...")
event_dd['clicked'] = (~event_dd['id7'].isnull()).astype(int)
offer_ctr = event_dd.groupby('id3')['clicked'].mean().compute().to_dict()

# Get the CTR of each partner offer
co_occurrence_counts['partner_ctr'] = co_occurrence_counts['id3_y'].map(offer_ctr)
co_occurrence_counts = co_occurrence_counts.sort_values(['id3_x', 'count'], ascending=[True, False])

# Find the top 5 partners for each offer
top_5_partners = co_occurrence_counts.groupby('id3_x').head(5)

# Calculate aggregate stats for the top 5 partners' CTRs
co_click_agg = top_5_partners.groupby('id3_x')['partner_ctr'].agg(['mean', 'max', 'std']).reset_index()
co_click_features = co_click_agg.rename(columns={
    'id3_x': 'id3',
    'mean': 'top5_partner_mean_ctr',
    'max': 'top5_partner_max_ctr',
    'std': 'top5_partner_std_ctr'
})
print("Advanced co-click features created successfully.")

# Clean up memory
del event_dd, event_dd_sample, co_occurrence_counts
gc.collect()

# =============================================================================
# TECHNIQUE 2: TEXT SIMILARITY FEATURES
# =============================================================================
print("\n--- Technique 2: Creating Text Similarity Features ---")

# --- Step 2a: Prepare the Text Corpus and Generate Embeddings ---
print("Step 2a: Preparing text corpus and generating embeddings...")
offer_meta_df['id3'] = offer_meta_df['id3'].astype(str)
offer_meta_df['full_text'] = offer_meta_df['id9'].fillna('') + ' ' + offer_meta_df['f378'].fillna('')
offer_texts = offer_meta_df[['id3', 'id8', 'full_text']].copy()

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(offer_texts['full_text'].tolist(), show_progress_bar=True)
embedding_df = pd.DataFrame(embeddings, columns=[f'text_emb_{i}' for i in range(embeddings.shape[1])])
offer_embeddings_df = pd.concat([offer_texts[['id3', 'id8']], embedding_df], axis=1)
print("Text embeddings generated.")

# --- Step 2b: Create Industry DNA and Calculate Similarity ---
print("Step 2b: Calculating offer-industry text similarity...")
# Calculate the average embedding vector for each industry
industry_dna = offer_embeddings_df.groupby('id8').mean(numeric_only=True).reset_index()
industry_dna = industry_dna.rename(columns={col: f'industry_{col}' for col in industry_dna.columns if 'emb' in col})

# Merge the industry DNA back to each offer
offer_embeddings_df = pd.merge(offer_embeddings_df, industry_dna, on='id8', how='left')

# Calculate the cosine similarity between each offer's embedding and its industry's average embedding
emb_cols = [f'text_emb_{i}' for i in range(embeddings.shape[1])]
industry_emb_cols = [f'industry_text_emb_{i}' for i in range(embeddings.shape[1])]

# This is a memory-intensive step, so we process in chunks if needed
# For now, we attempt it directly as the dataframe is moderately sized
offer_embeddings_df['offer_industry_text_similarity'] = [
    cosine_similarity(row[emb_cols].values.reshape(1, -1), row[industry_emb_cols].values.reshape(1, -1))[0][0]
    if not row[industry_emb_cols].isnull().any() else np.nan
    for index, row in offer_embeddings_df.iterrows()
]
text_similarity_features = offer_embeddings_df[['id3', 'offer_industry_text_similarity']]
print("Text similarity features created successfully.")


# =============================================================================
# STAGE 3: MERGE NEW FEATURES AND SAVE
# =============================================================================
print("\n--- STAGE 3: Merging New Features and Saving Final Datasets (v17) ---")

def enrich_dataframe(df, co_click_features, text_similarity_features):
    """A reusable function to enrich a dataframe with new features."""
    df['id3'] = df['id3'].astype(str)
    co_click_features['id3'] = co_click_features['id3'].astype(str)
    text_similarity_features['id3'] = text_similarity_features['id3'].astype(str)
    
    # Merge co-click features
    df = pd.merge(df, co_click_features, on='id3', how='left')
    # Merge text similarity features
    df = pd.merge(df, text_similarity_features, on='id3', how='left')
    
    return df

# --- Enrich both the training and test dataframes ---
print("Enriching training data...")
df_train_final = enrich_dataframe(df_train, co_click_features, text_similarity_features)
print("Enriching test data...")
df_test_final = enrich_dataframe(df_test, co_click_features, text_similarity_features)

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v18.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v18.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- ADVANCED FEATURE ENGINEERING COMPLETE ---")


--- Section 0: Setting up the environment ---
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2

2025-07-20 16:48:57.174338: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753030137.348404      79 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753030137.396031      79 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



--- Section 1: Loading Data ---
Successfully loaded all required data.

--- Technique 1: Creating Advanced Co-Click Features ---
Step 1a: Creating customer sessions on a sample of the data...
Customer sessions created.
Step 1b: Calculating offer co-occurrence within sessions...
Co-occurrence calculated successfully.
Step 1c: Creating final co-click features...
Advanced co-click features created successfully.

--- Technique 2: Creating Text Similarity Features ---
Step 2a: Preparing text corpus and generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

Text embeddings generated.
Step 2b: Calculating offer-industry text similarity...
Text similarity features created successfully.

--- STAGE 3: Merging New Features and Saving Final Datasets (v17) ---
Enriching training data...
Enriching test data...

Successfully saved final training data to: /kaggle/working/train_data_v18.parquet
Successfully saved final test data to: /kaggle/working/test_data_v18.parquet

--- ADVANCED FEATURE ENGINEERING COMPLETE ---


In [None]:
df_train_final.head()

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import warnings
import gc
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 0: DEFINE THE MAP@7 EVALUATION METRIC
# =============================================================================
def map_at_k(y_true, y_pred, group_ids, k=7):
    """
    Calculates the Mean Average Precision at k.
    
    Args:
        y_true (array-like): The true relevance labels (0 or 1).
        y_pred (array-like): The predicted ranking scores.
        group_ids (array-like): The group/query IDs for each sample.
        k (int): The cutoff for the precision calculation.
        
    Returns:
        float: The Mean Average Precision @ k score.
    """
    df = pd.DataFrame({'group': group_ids, 'y_true': y_true, 'y_pred': y_pred})
    
    average_precisions = []
    for group in df['group'].unique():
        group_df = df[df['group'] == group].sort_values('y_pred', ascending=False).reset_index(drop=True)
        
        # Get the top k predictions
        group_df = group_df.head(k)
        
        if group_df['y_true'].sum() == 0:
            average_precisions.append(0)
            continue
            
        relevant_hits = 0
        precision_at_i = []
        for i, row in group_df.iterrows():
            if row['y_true'] == 1:
                relevant_hits += 1
                precision_at_i.append(relevant_hits / (i + 1))
        
        if not precision_at_i:
            average_precisions.append(0)
        else:
            average_precisions.append(np.mean(precision_at_i))
            
    return np.mean(average_precisions)


# =============================================================================
# STAGE 1: DATA LOADING AND PREPARATION FOR XGBRANKER
# =============================================================================
print("--- STAGE 1: Loading and Preparing Data for XGBRanker ---")
try:
    # Load your final, pre-cleaned training and test data
    df = pd.read_parquet(f'{WORKING_DIR}train_data_v18.parquet')
    df_test = pd.read_parquet(f'{WORKING_DIR}test_data_v18.parquet')
    print("Successfully loaded final training (v92) and test (v92) data.")
except FileNotFoundError:
    print("Error: v92 data not found. Please run the final cleaning script first.")
    exit()

# --- Memory Reduction Function ---
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type to reduce memory usage."""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')
    
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)[:3] == 'int' or str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')
    return df

# --- Prepare Feature and Target Names ---
target_col = 'y'
# All columns are features except for identifiers and the target
feature_cols = [col for col in df.columns if col not in ['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'id8']]
# Identify all categorical features to be one-hot encoded
categorical_features = [col for col in df.columns if 'cluster' in col or col == 'offer_type_code' or col == 'offer_type_x_weekend']

# --- Preprocessing ---
print("Preprocessing data for training...")
# Handle one-hot encoding for categorical features
df = pd.get_dummies(df, columns=categorical_features, dummy_na=True)
df_test = pd.get_dummies(df_test, columns=categorical_features, dummy_na=True)

# Update feature list after one-hot encoding
for cat_col in categorical_features:
    if cat_col in feature_cols:
        feature_cols.remove(cat_col)
dummy_cols = [col for col in df.columns if any(cat_col in col for cat_col in categorical_features)]
feature_cols.extend(dummy_cols)
# Remove any duplicates that might have been added
feature_cols = list(dict.fromkeys(feature_cols)) 

# Convert all feature columns to numeric before imputation
print("Converting all feature columns to numeric types...")
for col in feature_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if col in df_test.columns:
        df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# Final imputation for training data
df[feature_cols] = df[feature_cols].fillna(-999)

# MODIFIED: Move memory reduction to AFTER all preprocessing
print("\nOptimizing memory usage for training data...")
df = reduce_mem_usage(df)
print("\nOptimizing memory usage for test data...")
df_test = reduce_mem_usage(df_test)

# --- Create Groups for XGBRanker ---
# A "query" or "group" is a single ranking task (all offers for one customer on one day)
df['group_id'] = df['id2'].astype(str) + '_' + df['id5'].astype(str)
df = df.sort_values('group_id').reset_index(drop=True)
group_sizes = df.groupby('group_id')['id1'].count().to_numpy()

# Define features (X) and target (y)
X = df[feature_cols]
y = df[target_col].astype(int)
print(f"Prepared training data with {len(feature_cols)} features and {len(group_sizes)} groups.")


# =============================================================================
# STAGE 2: TRAINING FINAL ENSEMBLE WITH XGBRANKER
# =============================================================================
print("\n--- STAGE 2: Training Final Ensemble of XGBRanker Models ---")

# Define XGBRanker parameters. Note the 'rank:ndcg' objective.
final_params = {
    'objective': 'rank:ndcg', # Learning-to-rank objective
    'eval_metric': 'ndcg@7',  # Evaluate using a metric similar to the competition
    'use_label_encoder': False, 'seed': 42,
    'tree_method': 'gpu_hist', 'gpu_id': 0,
    'n_estimators': 2000, 
    'learning_rate': 0.05, 
    'max_depth': 8, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8,
    'max_bin': 128 # MODIFIED: Add max_bin to reduce memory usage during training
}

N_SPLITS = 3 # Using 5 folds for a balance of stability and speed
gkf_train = GroupKFold(n_splits=N_SPLITS)
oof_predictions = np.zeros(len(df))
oof_map_scores = []

print(f"Starting final training with {N_SPLITS}-Fold GroupKFold on GPU...")
# For XGBRanker, we must use the customer ID for the GroupKFold split
groups_for_split = df['id2']

for fold, (train_idx, val_idx) in enumerate(gkf_train.split(X, y, groups=groups_for_split)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]
    
    # Get the group sizes for the training and validation sets
    train_groups = df.iloc[train_idx].groupby('group_id')['id1'].count().to_numpy()
    val_groups = df.iloc[val_idx].groupby('group_id')['id1'].count().to_numpy()
    
    model = xgb.XGBRanker(**final_params)
    model.fit(X_train_fold, y_train_fold, group=train_groups,
              eval_set=[(X_val_fold, y_val_fold)], eval_group=[val_groups],
              early_stopping_rounds=50, verbose=False)
    
    # XGBRanker outputs scores, not probabilities
    val_preds = model.predict(X_val_fold)
    oof_predictions[val_idx] = val_preds

    # --- Calculate and store MAP@7 for this fold ---
    fold_map_score = map_at_k(y_val_fold, val_preds, df.iloc[val_idx]['group_id'], k=7)
    oof_map_scores.append(fold_map_score)
    print(f"Fold {fold+1} MAP@7 Score: {fold_map_score:.5f}")

    model_path = f'{WORKING_DIR}final_ranker_model_fold_{fold+1}.json'
    model.save_model(model_path)
    print(f"Model for fold {fold+1} saved to {model_path}")
    del X_train_fold, y_train_fold, X_val_fold, y_val_fold, model
    gc.collect()

# --- Evaluate Overall Performance ---
print(f"\n--- Overall Cross-Validation Results ---")
print(f"Mean Out-of-Fold (OOF) MAP@7 Score: {np.mean(oof_map_scores):.5f}")
print("This score is the most reliable estimate of your final leaderboard performance.")


# =============================================================================
# STAGE 3: PREDICTION AND SUBMISSION FILE CREATION
# =============================================================================
print("\n--- STAGE 3: Generating Final Predictions ---")
# Prepare test data
submission_ids = df_test[['id1', 'id2', 'id3', 'id5']].copy()
# Align test columns with the training columns
X_test = df_test.reindex(columns=X.columns, fill_value=0)
# Final imputation for test data
X_test = X_test.fillna(-999)

# Load models and predict
test_predictions = np.zeros(len(X_test))
for fold in range(1, N_SPLITS + 1):
    print(f"Predicting with Fold {fold}/{N_SPLITS}...")
    model_path = f'{WORKING_DIR}final_ranker_model_fold_{fold}.json'
    model = xgb.XGBRanker()
    model.load_model(model_path)
    # Predict outputs ranking scores
    test_predictions += model.predict(X_test) / N_SPLITS

# Create submission file
print("\nCreating submission file...")
submission_df = submission_ids.copy()
submission_df['id5'] = pd.to_datetime(submission_df['id5'], errors='coerce').dt.strftime('%m-%d-%Y')
cleaned_predictions = np.nan_to_num(test_predictions, nan=-999) # Use a low score for NaNs
submission_df['pred'] = cleaned_predictions

submission_path = f'{WORKING_DIR}r2_submission_final_ranker18.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission file successfully saved to: {submission_path}")
print("\nFirst 5 rows of the submission file:")
print(submission_df.head())

--- STAGE 1: Loading and Preparing Data for XGBRanker ---
Successfully loaded final training (v92) and test (v92) data.
Preprocessing data for training...
Converting all feature columns to numeric types...

Optimizing memory usage for training data...
Memory usage of dataframe is 2367.98 MB
Memory usage after optimization is: 1217.04 MB
Decreased by 48.6%

Optimizing memory usage for test data...
Memory usage of dataframe is 1132.30 MB
Memory usage after optimization is: 600.84 MB
Decreased by 46.9%
Prepared training data with 439 features and 52468 groups.

--- STAGE 2: Training Final Ensemble of XGBRanker Models ---
Starting final training with 3-Fold GroupKFold on GPU...
--- Fold 1/3 ---
Fold 1 MAP@7 Score: 0.06282
Model for fold 1 saved to /kaggle/working/final_ranker_model_fold_1.json
--- Fold 2/3 ---
Fold 2 MAP@7 Score: 0.06610
Model for fold 2 saved to /kaggle/working/final_ranker_model_fold_2.json
--- Fold 3/3 ---
Fold 3 MAP@7 Score: 0.06171
Model for fold 3 saved to /kaggle/wo

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD THE SUBMISSION FILE
# =============================================================================
print("--- STAGE 1: Loading the submission file ---")
try:
    submission_path = f'{WORKING_DIR}r2_submission_final_ranker18.csv'
    df_sub = pd.read_csv(submission_path)
    print("Successfully loaded submission file.")
    print("\nOriginal 'pred' column stats:")
    print(df_sub['pred'].describe())
except FileNotFoundError:
    print(f"Error: {submission_path} not found.")
    print("Please ensure you have run the prediction script successfully.")
    exit()

# =============================================================================
# STAGE 2: NORMALIZE THE PREDICTION COLUMN
# =============================================================================
print("\n--- STAGE 2: Normalizing the 'pred' column ---")

# Initialize the Min-Max Scaler to scale values between 0 and 1
scaler = MinMaxScaler()

# The scaler expects a 2D array, so we need to reshape the column
# We fit and transform in one step
df_sub['pred_normalized'] = scaler.fit_transform(df_sub[['pred']])

print("\n'pred' column has been normalized.")
print("\nNew 'pred_normalized' column stats:")
print(df_sub['pred_normalized'].describe())


# =============================================================================
# STAGE 3: CREATE AND SAVE THE NORMALIZED SUBMISSION FILE
# =============================================================================
print("\n--- STAGE 3: Saving the normalized submission file ---")

# Create the final dataframe, dropping the old prediction column
df_final_sub = df_sub[['id1', 'id2', 'id3', 'id5']].copy()
df_final_sub['pred'] = df_sub['pred_normalized']

# Define the output path
output_path = f'{WORKING_DIR}r2_submission_final_ranker_normalized18.csv'

try:
    df_final_sub.to_csv(output_path, index=False)
    print(f"\nSuccessfully saved normalized submission file to: {output_path}")
    print("\nFirst 5 rows of the new submission file:")
    print(df_final_sub.head())
except Exception as e:
    print(f"\nError saving the file: {e}")

In [None]:
# =============================================================================
# SECTION 0: SETUP AND IMPORTS
# =============================================================================
print("--- Section 0: Setting up the environment ---")
# Install the necessary libraries for graph and sequential embeddings
!pip install -q node2vec gensim

import pandas as pd
import numpy as np
import dask.dataframe as dd
import warnings
import gc
import os
import networkx as nx
from node2vec import Node2Vec
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Define Kaggle File Paths ---
INPUT_DIR = '/kaggle/input/amexkkgp/'
WORKING_DIR = '/kaggle/working/'

# =============================================================================
# STAGE 1: LOAD DATA
# =============================================================================
print("\n--- Section 1: Loading Data ---")
try:
    # Load your latest training and test sets
    df_train = pd.read_parquet(f'/kaggle/input/amex-v15/train_data_v15.parquet')
    df_test = pd.read_parquet(f'/kaggle/input/amex-v15/test_data_v15.parquet')
    
    # Use Dask for the large event log
    event_dd = dd.read_parquet(f'{INPUT_DIR}add_event.parquet', columns=['id2', 'id3', 'id4'])
    
    print("Successfully loaded all required data.")
except FileNotFoundError:
    print("Error: Required data not found. Please ensure all necessary files are in the correct directories.")
    exit()

# =============================================================================
# TECHNIQUE 1: GRAPH EMBEDDINGS (NODEVEC) WITH PCA
# =============================================================================
print("\n--- Technique 1: Creating Graph Embedding Features ---")

# --- Step 1a: Build the Interaction Graph on a Sample ---
print("Step 1a: Building the customer-offer interaction graph on a sample...")
# Use a large sample of the event data to make the process much faster
event_dd_sample = event_dd.sample(frac=0.3, random_state=42)
edge_list = event_dd_sample[['id2', 'id3']].compute()
# Create a graph from the edge list
G = nx.from_pandas_edgelist(edge_list, 'id2', 'id3')
print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# --- Step 1b: Train the Node2Vec Model ---
print("Step 1b: Training the Node2Vec model...")
node2vec = Node2Vec(G, dimensions=32, walk_length=20, num_walks=10, workers=4, quiet=True)
# min_count=5 acts as a form of pruning for robustness.
model_n2v = node2vec.fit(window=5, min_count=5, batch_words=4)
print("Node2Vec model trained successfully.")

# --- Step 1c: Extract Offer Embeddings ---
print("Step 1c: Extracting offer embeddings...")
offer_ids = df_train['id3'].unique().tolist() + df_test['id3'].unique().tolist()
offer_ids = list(set(offer_ids)) # Get unique offer IDs across train and test

graph_embeddings = []
for offer_id in offer_ids:
    try:
        vector = model_n2v.wv[offer_id]
        graph_embeddings.append([offer_id] + vector.tolist())
    except KeyError:
        continue

graph_embedding_features = pd.DataFrame(graph_embeddings, columns=['id3'] + [f'graph_emb_{i}' for i in range(32)])
print("Graph embedding features created successfully.")

# Clean up memory
del G, edge_list, model_n2v, node2vec, event_dd_sample
gc.collect()

# =============================================================================
# TECHNIQUE 2: SEQUENTIAL EMBEDDINGS (WORD2VEC) WITH PCA
# =============================================================================
print("\n--- Technique 2: Creating Sequential Embedding Features ---")

# --- Step 2a: Create Customer Sessions ---
print("Step 2a: Creating customer sessions...")
event_dd['impression_time'] = dd.to_datetime(event_dd['id4'], errors='coerce')

# MODIFIED: Use a more robust Dask method to create sessions by sorting first.
print("Sorting events by customer and time...")
event_dd = event_dd.sort_values(['id2', 'impression_time'])

# Define a function to calculate time differences within partitions
def calculate_time_diff(df):
    df['time_diff'] = df.groupby('id2')['impression_time'].diff().dt.total_seconds()
    return df

# Apply the function to each partition using map_partitions
meta = event_dd._meta.copy()
meta['time_diff'] = pd.Series(dtype='float64')
event_dd = event_dd.map_partitions(calculate_time_diff, meta=meta)

# A new session starts if the time difference is > 30 minutes (1800 seconds)
event_dd['new_session'] = (event_dd['time_diff'] > 1800).fillna(True)
event_dd['session_id'] = event_dd.groupby('id2')['new_session'].cumsum()

# Create the list of sessions (sequences of offers)
print("Aggregating offers into sessions...")
sessions = event_dd.groupby(['id2', 'session_id'])['id3'].apply(list, meta=('id3', 'object')).compute().tolist()
print(f"Created {len(sessions)} customer sessions.")

# --- Step 2b: Train the Word2Vec Model ---
print("Step 2b: Training the Word2Vec model...")
# Train the model on the sessions. min_count=5 acts as pruning.
model_w2v = Word2Vec(sentences=sessions, vector_size=32, window=5, min_count=5, workers=4)
print("Word2Vec model trained successfully.")

# --- Step 2c: Extract Offer Embeddings ---
print("Step 2c: Extracting sequential embeddings...")
sequential_embeddings = []
for offer_id in offer_ids:
    try:
        vector = model_w2v.wv[offer_id]
        sequential_embeddings.append([offer_id] + vector.tolist())
    except KeyError:
        continue

sequential_embedding_features = pd.DataFrame(sequential_embeddings, columns=['id3'] + [f'seq_emb_{i}' for i in range(32)])
print("Sequential embedding features created successfully.")

# Clean up memory
del sessions, model_w2v, event_dd
gc.collect()

# =============================================================================
# STAGE 3: APPLY PCA AND MERGE FEATURES
# =============================================================================
print("\n--- STAGE 3: Applying PCA and Merging Final Features ---")

def apply_pca_to_embeddings(df_features, prefix):
    """Applies PCA to a dataframe of embedding features."""
    ids = df_features[['id3']]
    embeddings = df_features.drop(columns=['id3'])
    
    # Scale the data before PCA
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings)
    
    # Use PCA to capture 95% of the variance
    pca = PCA(n_components=0.95)
    embeddings_pca = pca.fit_transform(embeddings_scaled)
    print(f"PCA for '{prefix}' completed. New shape: {embeddings_pca.shape}")
    
    # Create a DataFrame from the PCA components
    pca_df = pd.DataFrame(embeddings_pca, columns=[f'{prefix}_pca_{i}' for i in range(embeddings_pca.shape[1])])
    return pd.concat([ids, pca_df], axis=1)

# --- Apply PCA to both sets of embeddings ---
graph_pca_features = apply_pca_to_embeddings(graph_embedding_features, 'graph')
sequential_pca_features = apply_pca_to_embeddings(sequential_embedding_features, 'seq')

def enrich_dataframe(df, graph_pca_features, sequential_pca_features):
    """A reusable function to enrich a dataframe with new PCA features."""
    df['id3'] = df['id3'].astype(str)
    graph_pca_features['id3'] = graph_pca_features['id3'].astype(str)
    sequential_pca_features['id3'] = sequential_pca_features['id3'].astype(str)
    
    # Merge graph PCA features
    df = pd.merge(df, graph_pca_features, on='id3', how='left')
    # Merge sequential PCA features
    df = pd.merge(df, sequential_pca_features, on='id3', how='left')
    
    return df

# --- Enrich both the training and test dataframes ---
print("Enriching training data...")
df_train_final = enrich_dataframe(df_train, graph_pca_features, sequential_pca_features)
print("Enriching test data...")
df_test_final = enrich_dataframe(df_test, graph_pca_features, sequential_pca_features)

# --- Save the final DataFrames ---
try:
    train_output_path = f'{WORKING_DIR}train_data_v18.parquet'
    test_output_path = f'{WORKING_DIR}test_data_v18.parquet'
    
    df_train_final.to_parquet(train_output_path)
    df_test_final.to_parquet(test_output_path)
    
    print(f"\nSuccessfully saved final training data to: {train_output_path}")
    print(f"Successfully saved final test data to: {test_output_path}")
except Exception as e:
    print(f"\nError saving the final files: {e}")

print("\n--- ADVANCED EMBEDDING FEATURE ENGINEERING COMPLETE ---")