<a href="https://colab.research.google.com/github/sadnyd/Synapses-25/blob/main/model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets scikit-learn matplotlib seaborn -q

In [3]:
# Authenticate with the Hugging Face Hub. The CLI prompts interactively for an
# access token (the typed token is not echoed), so this cell needs user input.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [4]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# --- 1. Data Loading ---
def load_criteo_data(split='train'):
    """Load the Criteo Uplift Prediction dataset as a pandas DataFrame.

    Args:
        split (str): Split expression forwarded to ``load_dataset``. The
            default ``'train'`` loads the full split; a slice expression such
            as ``'train[:5%]'`` loads a subset for faster experimentation.

    Returns:
        pd.DataFrame | None: The loaded rows, or ``None`` when loading or the
        conversion to pandas fails. The error is printed rather than raised,
        so callers must check for ``None``.
    """
    print("Loading Criteo dataset...")
    try:
        ds = load_dataset("criteo/criteo-uplift", split=split)
        print("Dataset loaded successfully.")
        return ds.to_pandas()
    except Exception as e:
        # Deliberately best-effort: report the problem and hand back None.
        print(f"Error loading dataset: {e}")
        print("Please ensure the 'datasets' library is installed and you have internet access.")
        return None

def get_feature_columns():
    """Build the feature column names 'f0' through 'f11' (twelve in total)."""
    return list(map('f{}'.format, range(12)))

In [6]:

# --- 2. Preprocessing and Sanity Checks ---
def normalize_features(df_train, df_test, features):
    """Standardize feature columns using statistics learned from the training set.

    The scaler is fitted on ``df_train[features]`` only and then applied to
    both frames, so no information from the test set leaks into the scaling.

    Args:
        df_train (pd.DataFrame): Training rows.
        df_test (pd.DataFrame): Test rows.
        features (list[str]): Names of the columns to standardize.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: New ``(train, test)`` frames with
        the feature columns replaced by their standardized values. The input
        frames are left untouched.
    """
    print("Normalizing features...")
    scaler = StandardScaler()

    # Work on copies: writing into the caller's frames would silently change
    # notebook state (and may trigger pandas chained-assignment warnings when
    # the inputs are slices of a larger frame).
    train_scaled = df_train.copy()
    test_scaled = df_test.copy()

    train_scaled[features] = scaler.fit_transform(df_train[features])
    test_scaled[features] = scaler.transform(df_test[features])

    print("Features normalized.")
    return train_scaled, test_scaled

In [7]:

def check_treatment_independence(X, T, test_size=0.3, random_state=42, threshold=0.05):
    """
    Performs a Classifier Two-Sample Test (C2ST) to check T independence from X.

    A logistic regression is trained to predict Treatment (T) from Features (X)
    on a stratified split of the data, and scored by ROC AUC on the held-out
    part. If the AUC is close to 0.5, the classifier cannot distinguish the
    groups based on X, which suggests independence.

    Args:
        X: Feature matrix accepted by scikit-learn.
        T: Binary treatment indicator; must support ``.mean()``.
        test_size (float): Fraction of rows held out to score the classifier.
        random_state (int): Seed for both the split and the classifier.
        threshold (float): Maximum ``|AUC - 0.5|`` still reported as
            "independent". Defaults to 0.05, the previously hard-coded value.

    Returns:
        float: ROC AUC of the treatment classifier on the held-out rows.
    """
    print("Performing C2ST for treatment independence check...")
    X_train, X_test, T_train, T_test = train_test_split(
        X, T, test_size=test_size, stratify=T, random_state=random_state
    )

    # Use a simple model like Logistic Regression
    model = LogisticRegression(solver='liblinear', random_state=random_state)
    model.fit(X_train, T_train)

    T_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(T_test, T_pred_proba)

    # Share of the larger treatment group, i.e. the accuracy of always
    # predicting the majority class. Printed for context only.
    treated_fraction = T.mean()
    majority_class_fraction = max(treated_fraction, 1 - treated_fraction)

    print(f"C2ST Classifier AUC: {auc:.4f}")
    print(f"Baseline (Majority Class) Accuracy: {majority_class_fraction:.4f}")

    if abs(auc - 0.5) < threshold:
        print("C2ST Result: Treatment assignment appears independent of features (AUC close to 0.5).")
    else:
        print("C2ST Result: Treatment assignment might depend on features (AUC differs from 0.5). Caution advised.")
    return auc

In [8]:
# --- 3. Uplift Modeling Approaches ---

def two_model_approach(X_train, X_test, y_train, y_test, T_train, T_test, features, target_column):
    """
    Two-Model uplift estimator.

    Fits one logistic regression on the treated training rows and another on
    the control training rows, scores every test row with both, and uses the
    difference of the two predicted probabilities as the uplift score.

    Returns:
        tuple: ``(uplift_score, results_df)`` where ``results_df`` is a copy of
        ``X_test`` extended with 'p_T', 'p_C', 'uplift_score', the actual
        outcome under ``target_column`` and the actual 'treatment'. Returns
        ``(None, None)`` if either treatment group is absent from training.
    """
    print(f"\n--- Running Two-Model Approach for {target_column} ---")

    # Partition the TRAINING data by treatment status.
    treated_rows = T_train == 1
    control_rows = T_train == 0
    X_treated, y_treated = X_train[treated_rows], y_train[treated_rows]
    X_control, y_control = X_train[control_rows], y_train[control_rows]

    print(f"Training set sizes: Treated={len(X_treated)}, Control={len(X_control)}")
    if not (len(X_treated) and len(X_control)):
        print("Error: Training data missing for one treatment group.")
        return None, None

    # One response model per group (treated first, then control).
    treated_model = LogisticRegression(solver='liblinear', random_state=42)
    treated_model.fit(X_treated[features], y_treated)
    control_model = LogisticRegression(solver='liblinear', random_state=42)
    control_model.fit(X_control[features], y_control)

    # Score the WHOLE test set with both models, regardless of actual treatment.
    treated_proba = treated_model.predict_proba(X_test[features])[:, 1]
    control_proba = control_model.predict_proba(X_test[features])[:, 1]

    # Per-individual uplift estimate.
    uplift_score = treated_proba - control_proba

    # Bundle predictions with the observed outcome and treatment for evaluation.
    results_df = X_test.copy()
    results_df['p_T'] = treated_proba
    results_df['p_C'] = control_proba
    results_df['uplift_score'] = uplift_score
    results_df[target_column] = y_test
    results_df['treatment'] = T_test

    print("Two-Model approach finished.")
    return uplift_score, results_df

In [9]:
def revert_label_approach(X_train, X_test, y_train, y_test, T_train, T_test, features, target_column):
    """
    Implements the Revert Label / Class Transformation approach.

    A transformed target Z is built from treatment and outcome
    (Z = 1 if (T=1 and Y=1) or (T=0 and Y=0), else 0) and a single logistic
    regression is trained to predict it. The uplift score is then
    ``2 * P(Z=1|X) - 1``.

    That identity only holds when treated and control rows are equally likely
    (P(T=1) = 0.5). The training split in this notebook is heavily imbalanced
    (roughly 85% treated), so the two groups are re-weighted to carry equal
    total weight when fitting; without this the score mostly reflects the
    treatment ratio rather than the uplift.

    Args:
        X_train, X_test (pd.DataFrame): Frames containing the ``features`` columns.
        y_train, y_test: Binary outcomes for the train / test rows.
        T_train, T_test: Treatment indicators (assumed binary 0/1) for the
            train / test rows.
        features (list[str]): Columns used as model inputs.
        target_column (str): Name under which the actual outcome is stored in
            the results frame.

    Returns:
        tuple: ``(uplift_score, results_df)`` where ``results_df`` is a copy of
        ``X_test`` extended with 'p_Z', 'uplift_score', the actual outcome and
        the actual 'treatment'. Returns ``(None, None)`` if either treatment
        group is absent from the training data (same convention as
        ``two_model_approach``).
    """
    print(f"\n--- Running Revert-Label Approach for {target_column} ---")

    treatment_values = np.asarray(T_train)
    outcome_values = np.asarray(y_train)
    is_treated = treatment_values == 1
    is_control = treatment_values == 0

    n_treated = int(is_treated.sum())
    n_control = int(is_control.sum())
    if n_treated == 0 or n_control == 0:
        print("Error: Training data missing for one treatment group.")
        return None, None

    # Create the transformed target variable Z
    # Z = 1 if (T=1 and Y=1) OR (T=0 and Y=0)
    # Z = 0 otherwise
    Z_train = (
        (is_treated & (outcome_values == 1)) | (is_control & (outcome_values == 0))
    ).astype(int)

    # Balance the groups: each group gets half of the total weight, and the
    # mean weight stays 1 so the regularisation strength is comparable to an
    # unweighted fit. Rows that are not flagged as treated receive the control
    # weight (treatment is assumed to be binary).
    n_total = n_treated + n_control
    sample_weight = np.where(
        is_treated,
        n_total / (2.0 * n_treated),
        n_total / (2.0 * n_control),
    )

    # Train a single model to predict Z
    model_Z = LogisticRegression(solver='liblinear', random_state=42)
    model_Z.fit(X_train[features], Z_train, sample_weight=sample_weight)

    # Predict P(Z=1|X) on the ENTIRE test set
    p_Z = model_Z.predict_proba(X_test[features])[:, 1]

    # With balanced groups: P(Z=1|X) = 0.5 * P(Y=1|X,T=1) + 0.5 * (1 - P(Y=1|X,T=0)),
    # hence Uplift = P(Y=1|T=1) - P(Y=1|T=0) = 2 * P(Z=1|X) - 1
    uplift_score = 2 * p_Z - 1

    # Prepare results dataframe including uplift scores and actuals
    results_df = X_test.copy()
    results_df['p_Z'] = p_Z
    results_df['uplift_score'] = uplift_score
    results_df[target_column] = y_test # Add actual outcome
    results_df['treatment'] = T_test   # Add actual treatment

    print("Revert-Label approach finished.")
    return uplift_score, results_df

In [10]:
# --- 4. Evaluation ---

def calculate_qini_auuc(results_df, target_column):
    """
    Calculates Qini curve, Qini coefficient, Uplift curve, and AUUC.

    Args:
        results_df (pd.DataFrame): DataFrame containing 'uplift_score',
                                   'treatment', and the target_column (actual outcome).
    Returns:
        tuple: (qini_curve_df, qini_coeff, uplift_curve_df, auuc)
    """
    print("Calculating Qini and AUUC...")

    # Sort by predicted uplift score in descending order
    sorted_df = results_df.sort_values(by='uplift_score', ascending=False).reset_index(drop=True)

    N = len(sorted_df)
    N_T = sorted_df['treatment'].sum() # Total treated in test set
    N_C = N - N_T                    # Total control in test set

    if N_T == 0 or N_C == 0:
        print("Error: Test set missing treated or control group for evaluation.")
        return None, 0, None, 0

    # Cumulative counts and outcomes
    sorted_df['n'] = 1 # Counter for cumulative population
    sorted_df['cum_n'] = sorted_df['n'].cumsum()
    sorted_df['cum_n_T'] = sorted_df['treatment'].cumsum()
    # Ensure cum_n_C does not start at 0 to avoid division by zero
    sorted_df['cum_n_C'] = (1 - sorted_df['treatment']).cumsum().replace(0, 1e-9) # Add small epsilon

    # Cumulative outcomes for treated and control
    sorted_df['cum_y_T'] = (sorted_df[target_column] * sorted_df['treatment']).cumsum()
    sorted_df['cum_y_C'] = (sorted_df[target_column] * (1 - sorted_df['treatment'])).cumsum()

    # --- Qini Calculation ---
    # Incremental gain: (Responders_T) - (Responders_C * Ratio of Treated/Control seen so far)
    # The ratio N_T / N_C in the paper's formula refers to the overall ratio in the population
    # For the curve calculation, we use the ratio observed up to point k
    # qini_numerator = sorted_df['cum_y_T'] - sorted_df['cum_y_C'] * (sorted_df['cum_n_T'] / sorted_df['cum_n_C'])
    # Revised Qini based on Radcliffe 2007 and Gutierrez 2017 definition:
    qini_numerator = sorted_df['cum_y_T'] - sorted_df['cum_y_C'] * (N_T / N_C)

    # --- Uplift (AUUC) Calculation ---
    # Difference in response rates between T and C groups seen so far
    # Avoid division by zero if cum_n_T is zero initially
    response_rate_T = (sorted_df['cum_y_T'] / sorted_df['cum_n_T'].replace(0, 1e-9))
    response_rate_C = (sorted_df['cum_y_C'] / sorted_df['cum_n_C']) # Already handled division by zero
    uplift_curve_values = (response_rate_T - response_rate_C) * (sorted_df['cum_n'] / N) # Weighted by population fraction

    # Create curve dataframes
    qini_curve_df = pd.DataFrame({
        'population_fraction': sorted_df['cum_n'] / N,
        'qini_value': qini_numerator
    })

    uplift_curve_df = pd.DataFrame({
        'population_fraction': sorted_df['cum_n'] / N,
        'uplift_value': uplift_curve_values
    })

    # Add random baseline to Qini curve
    total_increment = qini_numerator.iloc[-1]
    qini_curve_df['random_baseline'] = qini_curve_df['population_fraction'] * total_increment

    # --- Calculate Coefficients (Area under curve) ---
    # Use trapezoidal rule for integration

    # Qini Coefficient (Normalized version sometimes used, but AUGC is common)
    # Area under the Qini curve minus area under the random baseline
    auqc = np.trapz(qini_curve_df['qini_value'], x=qini_curve_df['population_fraction'])
    random_qini_area = np.trapz(qini_curve_df['random_baseline'], x=qini_curve_df['population_fraction'])
    qini_coeff = auqc - random_qini_area # Area Between Curves

    # AUUC (Area under the uplift curve)
    auuc = np.trapz(uplift_curve_df['uplift_value'], x=uplift_curve_df['population_fraction'])

    print(f"Calculation complete: Qini Coeff = {qini_coeff:.4f}, AUUC = {auuc:.4f}")
    return qini_curve_df, qini_coeff, uplift_curve_df, auuc

In [11]:
def plot_uplift_curves(qini_curve_df, qini_coeff, uplift_curve_df, auuc, model_name, target_column):
    """Draw the Qini curve and the uplift (AUUC) curve side by side and show the figure.

    The left panel plots 'qini_value' and 'random_baseline' from
    ``qini_curve_df``; the right panel plots 'uplift_value' from
    ``uplift_curve_df`` against a zero baseline. Both use 'population_fraction'
    on the x-axis, and the titles report ``qini_coeff`` and ``auuc``.
    """
    fig, (qini_ax, uplift_ax) = plt.subplots(1, 2, figsize=(14, 6))
    x_label = 'Proportion of Population Targeted (Sorted by Uplift)'
    target_title = target_column.capitalize()

    # Left panel: Qini curve against its random baseline.
    qini_x = qini_curve_df['population_fraction']
    qini_ax.plot(qini_x, qini_curve_df['qini_value'], label=f'Qini Curve ({model_name})')
    qini_ax.plot(qini_x, qini_curve_df['random_baseline'], 'k--', label='Random Baseline')
    qini_ax.set_xlabel(x_label)
    qini_ax.set_ylabel('Cumulative Incremental Outcome')
    qini_ax.set_title(f'Qini Curve - {target_title}\nQini Coefficient: {qini_coeff:.4f}')
    qini_ax.legend()
    qini_ax.grid(True)

    # Right panel: uplift curve; its random baseline is the zero line.
    uplift_ax.plot(
        uplift_curve_df['population_fraction'],
        uplift_curve_df['uplift_value'],
        label=f'Uplift Curve ({model_name})',
    )
    uplift_ax.plot([0, 1], [0, 0], 'k--', label='Random Baseline')
    uplift_ax.set_xlabel(x_label)
    uplift_ax.set_ylabel('Cumulative Uplift (Avg Outcome T - Avg Outcome C)')
    uplift_ax.set_title(f'Uplift Curve - {target_title}\nAUUC: {auuc:.4f}')
    uplift_ax.legend()
    uplift_ax.grid(True)

    fig.tight_layout()
    plt.show()

In [13]:
df = load_criteo_data()
# load_criteo_data() reports failures by printing and returning None; stop here
# with a clear message instead of failing later in the split cell.
if df is None:
    raise RuntimeError("Criteo dataset could not be loaded; see the error printed above.")

Loading Criteo dataset...
Dataset loaded successfully.


In [16]:
# Column names shared by the remaining cells: model inputs, the two candidate
# outcome columns, and the treatment indicator.
features = get_feature_columns()
target_visit = 'visit'
target_conversion = 'conversion' # Focus on conversion as in paper's later experiments
treatment_col = 'treatment'

# Section banner only; the actual split happens in the next cell.
print("\n--- Data Splitting ---")


--- Data Splitting ---


In [17]:
 # Split into train/test FIRST, then apply normalization/checks, so that the
 # scaler and the independence check never see the held-out rows.
 # 70/30 split with a fixed seed; stratifying on the treatment column keeps the
 # treated/control proportion the same in both parts.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42, stratify=df[treatment_col]) # Stratify by treatment
print(f"Train set size: {len(df_train)}, Test set size: {len(df_test)}")

Train set size: 9785714, Test set size: 4193878


In [18]:
 # --- Preprocessing & Checks ---
 # Standardize the feature columns (scaler fitted on the training rows only).
df_train, df_test = normalize_features(df_train, df_test, features)
# C2ST sanity check on the training data; being the last expression, the
# returned AUC is also displayed as the cell's output.
check_treatment_independence(df_train[features], df_train[treatment_col]) # Check on training data

Normalizing features...
Features normalized.
Performing C2ST for treatment independence check...
C2ST Classifier AUC: 0.5099
Baseline (Majority Class) Accuracy: 0.8500
C2ST Result: Treatment assignment appears independent of features (AUC close to 0.5).


np.float64(0.5099141126175784)

In [19]:
# Define the inputs shared by both uplift approaches.
TARGET = target_conversion # Choose target: 'visit' or 'conversion'
X_train = df_train[features + [treatment_col]] # Treatment column is carried along, but both approaches fit on X_train[features] only
y_train = df_train[TARGET]
T_train = df_train[treatment_col] # Treatment indicator is passed to the approaches separately
X_test = df_test[features + [treatment_col]] # Results frames are built from X_test.copy(); models score X_test[features] only
y_test = df_test[TARGET]
T_test = df_test[treatment_col]

In [None]:
# 1. Two-Model Approach: fit and score, then evaluate and plot only when each
# previous step actually produced a result.
uplift_2M, results_2M = two_model_approach(
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
    T_train=T_train, T_test=T_test,
    features=features, target_column=TARGET,
)
if results_2M is not None:
    qini_curve_2M, qini_coeff_2M, uplift_curve_2M, auuc_2M = calculate_qini_auuc(
        results_df=results_2M, target_column=TARGET
    )
    if qini_curve_2M is not None:
        plot_uplift_curves(
            qini_curve_df=qini_curve_2M,
            qini_coeff=qini_coeff_2M,
            uplift_curve_df=uplift_curve_2M,
            auuc=auuc_2M,
            model_name="Two-Model",
            target_column=TARGET,
        )


--- Running Two-Model Approach for conversion ---
Training set sizes: Treated=8317858, Control=1467856


In [None]:
 # 2. Revert Label Approach: fit and score, then evaluate and plot only when
 # each previous step actually produced a result.
uplift_RL, results_RL = revert_label_approach(
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
    T_train=T_train, T_test=T_test,
    features=features, target_column=TARGET,
)
if results_RL is not None:
    qini_curve_RL, qini_coeff_RL, uplift_curve_RL, auuc_RL = calculate_qini_auuc(
        results_df=results_RL, target_column=TARGET
    )
    if qini_curve_RL is not None:
        plot_uplift_curves(
            qini_curve_df=qini_curve_RL,
            qini_coeff=qini_coeff_RL,
            uplift_curve_df=uplift_curve_RL,
            auuc=auuc_RL,
            model_name="Revert-Label",
            target_column=TARGET,
        )
