In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shill-bidding-dataset1/Shill Bidding Dataset.csv


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
import pandas as pd, pathlib as pl

# Shill Bidding Data Set

In [3]:

DATA = pl.Path('/kaggle/input/shill-bidding-dataset1/Shill Bidding Dataset.csv')
df = pd.read_csv(DATA)
df.head()


Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,4,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,5,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [4]:
# 2 · Auction duration → seconds (days × 86400)
df['auction_duration_sec'] = df['Auction_Duration'] * 24 * 3600

# 3 · Timing features (fixed)
#    - time_from_start_sec: when they first bid
#    - time_to_close_sec:   how much time *remains* after their last bid
df['time_from_start_sec'] = df['Early_Bidding'] * df['auction_duration_sec']
df['time_to_close_sec']   = (1 - df['Last_Bidding']) * df['auction_duration_sec']

# 4 · Burstiness & bid‑share
df['burstiness_rate'] = df['Successive_Outbidding']
df['bid_count_share'] = df['Bidding_Ratio']

# 5 · Win orientation
df['win_ratio_running'] = df['Winning_Ratio']

# 6 · Experience (cum. auctions per bidder)
df = df.sort_values(['Bidder_ID', 'Record_ID'])
df['exp_auctions'] = df.groupby('Bidder_ID').cumcount()

# 7 · Collect features
feature_cols = [
    'Record_ID','Auction_ID','Bidder_ID',
    'time_from_start_sec','time_to_close_sec',
    'burstiness_rate','bid_count_share',
    'win_ratio_running','exp_auctions'
]
features = df[feature_cols]

# 8 · Save & verify
OUT = '/kaggle/working/features_step3_fixed.csv'
features.to_csv(OUT, index=False)
print("✔︎  Fixed features saved to", OUT)
display(features.head())

✔︎  Fixed features saved to /kaggle/working/features_step3_fixed.csv


Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,time_from_start_sec,time_to_close_sec,burstiness_rate,bid_count_share,win_ratio_running,exp_auctions
3021,7205,311,*****,138546.000029,466253.999971,0.0,0.032258,0.0,0
5249,12560,161,*****,149676.999993,109523.000007,0.0,0.043478,0.0,1
5950,14260,330,*****,163020.99998,441779.00002,0.0,0.055556,0.0,2
1888,4516,899,****h,6.999998,431993.000002,0.0,0.2,1.0,0
886,2062,224,****y,437565.999992,71275.000023,0.0,0.181818,1.0,0


# Loading Engineered Features and Original Auction Bids

In [5]:

# Load your engineered features
feat_path = Path('/kaggle/working/features_step3_fixed.csv')
df_feat   = pd.read_csv(feat_path)
print("Loaded features from:", feat_path)

# Load the original Auction_Bids to get per-bidder counts
raw_path = Path('/kaggle/input/shill-bidding-dataset1/Shill Bidding Dataset.csv')
df_raw   = pd.read_csv(raw_path)
print("Loaded raw data from:", raw_path)


Loaded features from: /kaggle/working/features_step3_fixed.csv
Loaded raw data from: /kaggle/input/shill-bidding-dataset1/Shill Bidding Dataset.csv


In [6]:
# Merge in bid_count
df = df_feat.merge(df_raw[['Record_ID','Auction_Bids']], on='Record_ID')
df.rename(columns={'Auction_Bids':'bid_count'}, inplace=True)


In [7]:
# 3 · Rule‑based sniper
df['strategy'] = 'unlabeled'
sniper_mask   = (df['time_to_close_sec'] <= 10) & (df['bid_count'] == 1)
df.loc[sniper_mask, 'strategy'] = 'sniper'

In [8]:
# 4 · Cluster the unlabeled on the key behavioural features
features = ['time_to_close_sec','burstiness_rate','bid_count_share',
            'win_ratio_running','exp_auctions']
X = df.loc[df['strategy']=='unlabeled', features].fillna(0)

kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto').fit(X)

df.loc[df['strategy']=='unlabeled','cluster'] = kmeans.labels_

In [9]:
# 5 · Inspect the centers
centers = pd.DataFrame(kmeans.cluster_centers_, columns=features)
print("\nCluster centers (clusters 0,1,2):\n", centers)


Cluster centers (clusters 0,1,2):
    time_to_close_sec  burstiness_rate  bid_count_share  win_ratio_running  \
0      533062.193344         0.074632         0.131699           0.392713   
1       48282.634776         0.119951         0.121119           0.346651   
2      248956.469317         0.098169         0.137280           0.387328   

   exp_auctions  
0      7.126506  
1      8.968239  
2      7.560606  


In [10]:
# … after you print the centers …

# 6 · Map clusters → strategies (updated)
mapping = {
    0: 'incrementalist',   # bids far from close
    1: 'sniper',           # bids right at the end
    2: 'jump_bidder'       # intermediate timing
}

df.loc[df['strategy']=='unlabeled','strategy'] = \
    df.loc[df['strategy']=='unlabeled','cluster'].map(mapping)


In [11]:
# 7 · Save final labels
out_path = Path('/kaggle/working/step4_strategy_labels.csv')
df[['Record_ID','strategy']].to_csv(out_path, index=False)
print(f"\n✔︎  Saved strategy labels to {out_path}")
print(df['strategy'].value_counts())



✔︎  Saved strategy labels to /kaggle/working/step4_strategy_labels.csv
strategy
sniper            3243
jump_bidder       1586
incrementalist    1492
Name: count, dtype: int64


In [12]:

# 1 · Load your engineered features & the labeled strategies
feat = pd.read_csv('/kaggle/working/features_step3_fixed.csv')
labs = pd.read_csv('/kaggle/working/step4_strategy_labels.csv')

# 2 · Merge them on Record_ID
df = feat.merge(labs, on='Record_ID')

# 3 · Define your set of alternatives
alts = ['sniper','jump_bidder','incrementalist']
alts_df = pd.DataFrame({'alt': alts})

# 4 · Cross‐join to get one row per (Record_ID × alt)
long = df.merge(alts_df, how='cross')

# 5 · Create the ‘chosen’ indicator
long['chosen'] = (long['strategy'] == long['alt']).astype(int)

# 6 · Rename columns for the estimator
long.rename(columns={
    'Bidder_ID': 'chooser_id',
    'Record_ID': 'choice_id'
}, inplace=True)

# 7 · Select only the columns your model needs
covariates = [
    'time_from_start_sec','time_to_close_sec',
    'burstiness_rate','bid_count_share',
    'win_ratio_running','exp_auctions'
]
final_cols = ['chooser_id','choice_id','alt','chosen'] + covariates
long_df = long[final_cols]

# 8 · Save
out_path = Path('/kaggle/working/step5_long_format.csv')
long_df.to_csv(out_path, index=False)
print("✔︎  Long‐format table saved to", out_path)
print("Example rows:\n", long_df.head(9))


✔︎  Long‐format table saved to /kaggle/working/step5_long_format.csv
Example rows:
   chooser_id  choice_id             alt  chosen  time_from_start_sec  \
0      *****       7205          sniper       0        138546.000029   
1      *****       7205     jump_bidder       0        138546.000029   
2      *****       7205  incrementalist       1        138546.000029   
3      *****      12560          sniper       1        149676.999993   
4      *****      12560     jump_bidder       0        149676.999993   
5      *****      12560  incrementalist       0        149676.999993   
6      *****      14260          sniper       0        163020.999980   
7      *****      14260     jump_bidder       0        163020.999980   
8      *****      14260  incrementalist       1        163020.999980   

   time_to_close_sec  burstiness_rate  bid_count_share  win_ratio_running  \
0      466253.999971              0.0         0.032258                0.0   
1      466253.999971              0.0    

# MNL Estimates

In [13]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [14]:
df = pd.read_csv('/kaggle/working/step5_long_format.csv')

chosen = df[df['chosen'] == 1].copy()

code_map = {'incrementalist': 0, 'sniper': 1, 'jump_bidder': 2}
chosen['y'] = chosen['alt'].map(code_map)

X = chosen[
    ['time_from_start_sec',
     'time_to_close_sec',
     'burstiness_rate',
     'bid_count_share',
     'win_ratio_running',
     'exp_auctions']
]
X = sm.add_constant(X)   # adds alternative‐specific intercepts automatically

y = chosen['y']

scaler = StandardScaler()
Xs = scaler.fit_transform(X)
X_tr, X_te, y_tr, y_te = train_test_split(Xs, y, stratify=y, test_size=0.2, random_state=42)

In [15]:
clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    C=1.0,           # inverse regularization strength; smaller = more penalty
    max_iter=200
)
clf.fit(X_tr, y_tr)

classes = ['incrementalist','sniper','jump_bidder']

# 1) Grab the raw intercepts
intercepts = pd.Series(clf.intercept_, index=classes, name='intercept')
print("Raw intercepts:\n", intercepts, "\n")

# 2) Re‑base so incrementalist = 0
base = intercepts['incrementalist']
alt_constants = intercepts - base
print("Alt‑specific constants (incrementalist=0):\n", alt_constants)


Raw intercepts:
 incrementalist   -4.298542
sniper           -0.024889
jump_bidder       4.323432
Name: intercept, dtype: float64 

Alt‑specific constants (incrementalist=0):
 incrementalist    0.000000
sniper            4.273653
jump_bidder       8.621974
Name: intercept, dtype: float64


In [16]:
# 6 · Evaluate
train_acc = clf.score(X_tr, y_tr)
test_acc  = clf.score(X_te, y_te)
y_pred    = clf.predict(X_te)
f1        = f1_score(y_te, y_pred, average='weighted')
cm        = confusion_matrix(y_te, y_pred)

print(f"Train accuracy: {train_acc:.3f}")
print(f"Test  accuracy: {test_acc:.3f}")
print(f"Weighted F1-score: {f1:.3f}")
print("Confusion matrix (rows=true, cols=pred):\n", cm)

Train accuracy: 0.993
Test  accuracy: 0.995
Weighted F1-score: 0.995
Confusion matrix (rows=true, cols=pred):
 [[299   0   0]
 [  0 649   0]
 [  6   0 311]]


In [17]:
    # 3) Build your slope DataFrame
    coef_df = pd.DataFrame(clf.coef_, index=classes, columns=X.columns).T
    
    # 4) Append the intercepts as a “const” row
    coef_df.loc['const'] = intercepts
    
    print("Full parameter table:\n", coef_df)


Full parameter table:
                      incrementalist     sniper  jump_bidder
const                     -4.298542  -0.024889     4.323432
time_from_start_sec       -0.552404   0.322558     0.229846
time_to_close_sec         11.397543 -12.623385     1.225842
burstiness_rate           -0.029117   0.097415    -0.068298
bid_count_share           -0.121547   0.020017     0.101531
win_ratio_running          0.035006   0.011558    -0.046564
exp_auctions              -0.052315   0.015824     0.036491


# Mixed Logit/ Random-Parameters Logit 

In [18]:
pip install pylogit


Collecting pylogit
  Downloading pylogit-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pylogit-1.0.1-py3-none-any.whl (151 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.4/151.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pylogit
Successfully installed pylogit-1.0.1
Note: you may need to restart the kernel to use updated packages.


## MMNL

In [19]:
# Patch pylogit to fix ImportError in Python 3.10+
import os

file_path = '/usr/local/lib/python3.11/dist-packages/pylogit/choice_tools.py'

# Read file
with open(file_path, 'r') as file:
    lines = file.readlines()

# Replace wrong import
new_lines = []
for line in lines:
    if 'from collections import Iterable' in line:
        new_lines.append('from collections.abc import Iterable\n')
    else:
        new_lines.append(line)

# Write back
with open(file_path, 'w') as file:
    file.writelines(new_lines)

print('✅ Patch applied! Try importing pylogit again.')


✅ Patch applied! Try importing pylogit again.


In [20]:
# 1 Setup ────────────────────────────────────────────────────────────
import pandas as pd, numpy as np, pylogit as pl
from collections import OrderedDict
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
import collections
import collections.abc

collections.Iterable = collections.abc.Iterable

df = pd.read_csv('/kaggle/working/step5_long_format.csv')

In [21]:
FILE = "/kaggle/working/step5_long_format.csv"     # <- your upload
RANDOM_SEED = 42
DRAWS       = 2_000                    # ↑ for publication


In [22]:
# 1. ─────────────── Load & basic prep ─────────────────────────────────
df = pd.read_csv(FILE)

alt_map = {'sniper': 1, 'jump_bidder': 2, 'incrementalist': 3}
df['alt_id'] = df['alt'].map(alt_map)

# Dummy ASCs (create *before* any splitting so they exist everywhere)
df['ASC_sniper'] = (df['alt_id'] == 1).astype(int)
df['ASC_jump']   = (df['alt_id'] == 2).astype(int)
# (incrementalist is base → no dummy)

In [23]:
# 2. ─────────────── Train / test split (whole auctions) ──────────────
gss = GroupShuffleSplit(test_size=0.30, n_splits=1, random_state=RANDOM_SEED)
train_idx, test_idx = next(gss.split(df, groups=df['choice_id']))
train, test = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()


In [24]:
# Keep entire choice sets contiguous
train = train.sort_values(['choice_id','alt_id']).reset_index(drop=True)
test  = test.sort_values(['choice_id','alt_id']).reset_index(drop=True)


In [25]:
# 3. ─────────────── Scale continuous covariates (helps optimiser) ───
scale_cols = ['time_from_start_sec','time_to_close_sec',
              'burstiness_rate','bid_count_share',
              'win_ratio_running','exp_auctions']
scaler = StandardScaler().fit(train[scale_cols])
train[scale_cols] = scaler.transform(train[scale_cols])
test[scale_cols]  = scaler.transform(test[scale_cols])


In [26]:
# 4. ─────────────── PyLogit specification dict (alt-ID lists!) ──────
all_alts = [1, 2, 3]

spec = OrderedDict([
    ('time_from_start_sec', [all_alts]),  # generic β
    ('time_to_close_sec'  , [all_alts]),
    ('burstiness_rate'    , [all_alts]),
    ('bid_count_share'    , [all_alts]),
    ('win_ratio_running'  , [all_alts]),
    ('exp_auctions'       , [all_alts]),
    ('ASC_sniper'         , [all_alts]),  # generic × dummy
    ('ASC_jump'           , [all_alts]),
])
names = OrderedDict([
    ('time_from_start_sec', ['time_start']),    # one β → one label
    ('time_to_close_sec'  , ['time_close']),
    ('burstiness_rate'    , ['burst']),
    ('bid_count_share'    , ['bid_share']),
    ('win_ratio_running'  , ['win_ratio']),
    ('exp_auctions'       , ['experience']),
    ('ASC_sniper'         , ['ASC_sniper']),
    ('ASC_jump'           , ['ASC_jump']),
])

In [27]:
# 5. ─────────────── Build & estimate Mixed Logit ─────────────────────
import numpy as np
model = pl.create_choice_model(
            data=train,
            alt_id_col   ='alt_id',
            obs_id_col   ='choice_id',
            choice_col   ='chosen',
            specification=spec,
            names        =names,
            model_type   ="Mixed Logit",
            mixing_id_col='chooser_id',
            mixing_vars  ={'ASC_sniper':'n'},   # random intercept
)
import numpy as np

# # after you do `model = pl.create_choice_model(...)`
# num_betas = len(specification)         # 8
# num_sds   = len(model.mixing_vars)     # 1
# K         = num_betas + num_sds        # 9

init_vals = np.zeros(9)

# Step 1: fit (as you did)
model.fit_mle(
    init_vals=init_vals,
    num_draws=DRAWS,
    seed=RANDOM_SEED,
    constrained_pos=None,
    print_res=True,
    just_point=False
)

# Step 2: directly ask model to print model fit summary:
print(model.get_statsmodels_summary())

# Step 3: (optional) access estimated parameters:
print(model.params)


Log-likelihood at zero: -4,860.2608
Initial Log-likelihood: -4,860.2608


  results = minimize(estimator.calc_neg_log_likelihood_and_neg_gradient,


Estimation Time for Point Estimation: 2.12 minutes.
Final log-likelihood: -4,526.5719
                     Mixed Logit Model Regression Results                     
Dep. Variable:                 chosen   No. Observations:                4,424
Model:              Mixed Logit Model   Df Residuals:                    4,415
Method:                           MLE   Df Model:                            9
Date:                Sun, 27 Apr 2025   Pseudo R-squ.:                   0.069
Time:                        05:33:59   Pseudo R-bar-squ.:               0.067
AIC:                        9,071.144   Log-Likelihood:             -4,526.572
BIC:                        9,128.697   LL-Null:                    -4,860.261
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
time_start       -1.503e-13   1.73e+13  -8.67e-27      1.000    -3.4e+13     3.4e+13
time_close        1.718e-13

  self._store_inferential_results(np.sqrt(np.diag(self.robust_cov)),
  cond2 = cond0 & (x <= _a)


In [28]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

# 1. Predict on train
probs_train = model.predict(data=train, num_draws=DRAWS, seed=RANDOM_SEED)
n_obs_train = train['choice_id'].nunique()
probs_train = probs_train.reshape((n_obs_train, 3))
predicted_train = np.argmax(probs_train, axis=1) + 1

# 2. Predict on test
probs_test = model.predict(data=test, num_draws=DRAWS, seed=RANDOM_SEED)
n_obs_test = test['choice_id'].nunique()
probs_test = probs_test.reshape((n_obs_test, 3))
predicted_test = np.argmax(probs_test, axis=1) + 1

# 3. True labels
true_train = train[['choice_id', 'alt_id', 'chosen']].query('chosen==1')['alt_id'].values
true_test = test[['choice_id', 'alt_id', 'chosen']].query('chosen==1')['alt_id'].values

# 4. Compute metrics
train_accuracy = accuracy_score(true_train, predicted_train)
test_accuracy  = accuracy_score(true_test, predicted_test)
f1 = f1_score(true_test, predicted_test, average='weighted')
conf_matrix = confusion_matrix(true_test, predicted_test, labels=[1,2,3])

# 5. Print nicely
print(f"Train Accuracy: {train_accuracy:.3f}")
print(f"Test  Accuracy: {test_accuracy:.3f}")
print(f"Weighted F1-score: {f1:.3f}")
print("\nConfusion matrix (rows=true, cols=pred):")
print(conf_matrix)


Train Accuracy: 0.514
Test  Accuracy: 0.510
Weighted F1-score: 0.345

Confusion matrix (rows=true, cols=pred):
[[968   0   0]
 [476   0   0]
 [453   0   0]]


In [29]:
import pylogit as pl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 0. Load your clean dataset
df = pd.read_csv('/kaggle/working/step5_long_format.csv')

alt_map = {'sniper': 1, 'jump_bidder': 2, 'incrementalist': 3}
df['alt_id'] = df['alt'].map(alt_map)

# Dummy ASCs (create *before* any splitting so they exist everywhere)
df['ASC_sniper'] = (df['alt_id'] == 1).astype(int)
df['ASC_jump']   = (df['alt_id'] == 2).astype(int)
# (incrementalist is base → no dummy)
# 1. Sort properly (important for mixed logit)
df = df.sort_values(by=['choice_id', 'alt_id'])

# # 2. Split into train-test
# train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df['choice_id'])

# 2. ─────────────── Train / test split (whole auctions) ──────────────
gss = GroupShuffleSplit(test_size=0.30, n_splits=1, random_state=RANDOM_SEED)
train_idx, test_idx = next(gss.split(df, groups=df['choice_id']))
train, test = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()

# Keep entire choice sets contiguous
train = train.sort_values(['choice_id','alt_id']).reset_index(drop=True)
test  = test.sort_values(['choice_id','alt_id']).reset_index(drop=True)


# 3. Create MMNL specification
specification = OrderedDict({
    'win_ratio_running': 'all_same',    # bidder-specific
    'exp_auctions': 'all_same',          # bidder-specific
    'ASC_sniper': 'all_same',            # manually created dummy
    'ASC_jump': 'all_same' 
})

names = OrderedDict({
    'win_ratio_running': 'win_ratio_running',
    'exp_auctions': 'exp_auctions',
    'ASC_sniper': 'ASC_sniper',
    'ASC_jump': 'ASC_jump'
})

# 4. Build the Mixed Logit Model
model = pl.create_choice_model(
    data=train,
    alt_id_col='alt_id',
    obs_id_col='choice_id',
    choice_col='chosen',
    specification=specification,
    model_type="Mixed Logit",
    names=names,
    mixing_id_col='chooser_id',
    mixing_vars={'ASC_sniper': 'n'}  # Random only on ASC_sniper
)

# 5. Estimate the model
DRAWS = 2000
RANDOM_SEED = 42
init_vals = np.zeros(5)  # 4 coefficients + 1 random sigma + 1 more if needed

model.fit_mle(
    init_vals=init_vals,
    num_draws=DRAWS,
    seed=RANDOM_SEED,
    constrained_pos=None,
    print_res=True,
    just_point=False
)

# 6. See the model summary
print(model.get_statsmodels_summary())


Log-likelihood at zero: -4,860.2608
Initial Log-likelihood: -4,860.2608


  results = minimize(estimator.calc_neg_log_likelihood_and_neg_gradient,


Estimation Time for Point Estimation: 1.81 minutes.
Final log-likelihood: -4,526.5719
                     Mixed Logit Model Regression Results                     
Dep. Variable:                 chosen   No. Observations:                4,424
Model:              Mixed Logit Model   Df Residuals:                    4,419
Method:                           MLE   Df Model:                            5
Date:                Sun, 27 Apr 2025   Pseudo R-squ.:                   0.069
Time:                        05:36:05   Pseudo R-bar-squ.:               0.068
AIC:                        9,063.144   Log-Likelihood:             -4,526.572
BIC:                        9,095.118   LL-Null:                    -4,860.261
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
win_ratio_running   6.05e-14   1.54e+13   3.93e-27      1.000   -3.02e+13    3.02e+13
exp_auctions       1.939

  self._store_inferential_results(np.sqrt(np.diag(self.robust_cov)),
  cond2 = cond0 & (x <= _a)


In [30]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

# 1. Predict on train
probs_train = model.predict(data=train, num_draws=DRAWS, seed=RANDOM_SEED)
n_obs_train = train['choice_id'].nunique()
probs_train = probs_train.reshape((n_obs_train, 3))
predicted_train = np.argmax(probs_train, axis=1) + 1

# 2. Predict on test
probs_test = model.predict(data=test, num_draws=DRAWS, seed=RANDOM_SEED)
n_obs_test = test['choice_id'].nunique()
probs_test = probs_test.reshape((n_obs_test, 3))
predicted_test = np.argmax(probs_test, axis=1) + 1

# 3. True labels
true_train = train[['choice_id', 'alt_id', 'chosen']].query('chosen==1')['alt_id'].values
true_test = test[['choice_id', 'alt_id', 'chosen']].query('chosen==1')['alt_id'].values

# 4. Compute metrics
train_accuracy = accuracy_score(true_train, predicted_train)
test_accuracy  = accuracy_score(true_test, predicted_test)
f1 = f1_score(true_test, predicted_test, average='weighted')
conf_matrix = confusion_matrix(true_test, predicted_test, labels=[1,2,3])

# 5. Print nicely
print(f"Train Accuracy: {train_accuracy:.3f}")
print(f"Test  Accuracy: {test_accuracy:.3f}")
print(f"Weighted F1-score: {f1:.3f}")
print("\nConfusion matrix (rows=true, cols=pred):")
print(conf_matrix)


Train Accuracy: 0.514
Test  Accuracy: 0.510
Weighted F1-score: 0.345

Confusion matrix (rows=true, cols=pred):
[[968   0   0]
 [476   0   0]
 [453   0   0]]


# Latent Class Model

In [31]:
import pandas as pd

# Load your data
df = pd.read_csv('/kaggle/working/step5_long_format.csv')




In [32]:
# Example: Keep only numeric columns for LCM
X = df.select_dtypes(include=['float64', 'int64']).copy()

# Fill missing values if necessary
X = X.fillna(0)


In [33]:
from sklearn.mixture import GaussianMixture

# Set number of latent classes
K = 3

# Initialize the model
gmm = GaussianMixture(n_components=K, covariance_type='full', random_state=42)

# Fit the model
gmm.fit(X)

# Predict the latent class for each bidder
latent_classes = gmm.predict(X)

# Add the predicted class to your dataframe
df['predicted_class'] = latent_classes


In [34]:
# Select only numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Group by predicted class and take mean only on numeric columns
df.groupby('predicted_class')[numeric_cols].mean()


Unnamed: 0_level_0,choice_id,chosen,time_from_start_sec,time_to_close_sec,burstiness_rate,bid_count_share,win_ratio_running,exp_auctions,predicted_class
predicted_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,10148.315217,0.333333,216656.336957,174962.318839,0.369565,0.171183,0.352275,41.967391,0.0
1,7482.998448,0.333333,171816.583527,217302.076429,0.0,0.086063,0.282134,6.840931,1.0
2,7031.669663,0.333333,163820.4985,200944.657123,0.622472,0.355169,0.868317,5.459551,2.0


In [35]:
behavior_mapping = {
    0: 'Sniper',         # suppose 0 behaves like snipers
    1: 'Incrementalist', # suppose 1 behaves like incrementalists
    2: 'Jump Bidder'     # suppose 2 behaves like jump bidders
}

df['predicted_behavior'] = df['predicted_class'].map(behavior_mapping)


### Evalaution of LCM

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X_train)

train_preds = gmm.predict(X_train)
test_preds = gmm.predict(X_test)

print(f"Train BIC: {gmm.bic(X_train)}")
print(f"Test BIC: {gmm.bic(X_test)}")


Train BIC: 1077708.6893672706
Test BIC: 270918.29938348883


In [37]:
train_bic_per_sample = train_preds / len(X_train)
test_bic_per_sample = test_preds / len(X_test)

print(train_bic_per_sample, test_bic_per_sample)

[1.31839156e-04 6.59195781e-05 6.59195781e-05 ... 6.59195781e-05
 6.59195781e-05 6.59195781e-05] [0.         0.00026364 0.         ... 0.00026364 0.00026364 0.00026364]
