In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df_train=pd.read_parquet('train_data.parquet')
df_test=pd.read_parquet('test_data.parquet')

In [3]:
# changing the datatype of multiple columns from object to numerical

def obj_to_num(df,low,high,arr):
    all_f_cols = [f'f{i}' for i in range(low, high+1)]
    exclude_cols = arr
    cols_to_convert = [col for col in all_f_cols if col not in exclude_cols]

    df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

    return df    

In [4]:
# one hot encoder function

def one_hot(df,col):
    df[col] = df[col].fillna('Unknown')  # or 'None'

    encoded = pd.get_dummies(df[col], prefix=col)
    
    df = pd.concat([df, encoded], axis=1)
    df.drop(columns=[col,f'{col}_Unknown'],inplace=True, errors='ignore')


    return df    

In [5]:
# changing data type to numerical for relevant columns

df_train=obj_to_num(df_train,1,51,['f42','f48','f50'])
df_train=obj_to_num(df_train,58,225,[])
df_train=obj_to_num(df_train,226,366,['f349', 'f354'])

# changing the datatype of dependent variable

df_train['y'] = pd.to_numeric(df_train['y'], errors='coerce')

df_train['f48'] = pd.to_numeric(df_train['f48'], errors='coerce')
df_train['f349'] = pd.to_numeric(df_train['f349'], errors='coerce')

In [6]:
# changing data type to numerical for relevant columns

df_test=obj_to_num(df_test,1,51,['f42','f48','f50'])
df_test=obj_to_num(df_test,58,225,[])
df_test=obj_to_num(df_test,226,366,['f349', 'f354'])

# changing the datatype of dependent variable

# df_train['y'] = pd.to_numeric(df_train['y'], errors='coerce')

df_test['f48'] = pd.to_numeric(df_test['f48'], errors='coerce')
df_test['f349'] = pd.to_numeric(df_test['f349'], errors='coerce')

In [7]:
df_test = one_hot(df_test, 'f42')
df_train = one_hot(df_train, 'f42')

In [8]:

df_test = one_hot(df_test, 'f54')
df_test = one_hot(df_test, 'f55')
df_test = one_hot(df_test, 'f56')
df_test = one_hot(df_test, 'f57')

In [9]:

df_train = one_hot(df_train, 'f54')
df_train = one_hot(df_train, 'f55')
df_train = one_hot(df_train, 'f56')
df_train = one_hot(df_train, 'f57')

In [10]:
df_train['f53'] = df_train['f53'].map({'NY': 1, 'NN': 0})
# df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None

df_train['f53'] = df_train['f53'].replace({None: np.nan})

In [11]:
## changin the data type with binary type categorical data

df_train['f50'] = df_train['f50'].map({'Y': 1, 'N': 0})
# df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None

df_train['f50'] = df_train['f50'].replace({None: np.nan})


df_train['f52'] = df_train['f52'].map({'Y': 1, 'N': 0})
# df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None

df_train['f52'] = df_train['f52'].replace({None: np.nan})


df_train['f354'] = df_train['f354'].replace({
    'Phase_1': 0,
    'Rest': 1,
    None: np.nan  # Explicitly convert None to NaN
})


  df_train['f354'] = df_train['f354'].replace({


In [12]:
## changin the data type with binary type categorical data

df_test['f50'] = df_test['f50'].map({'Y': 1, 'N': 0})
#df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None
 
df_test['f50'] = df_test['f50'].replace({None: np.nan})


df_test['f52'] = df_test['f52'].map({'Y': 1, 'N': 0})
# df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None

df_test['f52'] = df_test['f52'].replace({None: np.nan})


df_test['f354'] = df_test['f354'].replace({
    'Phase_1': 0,
    'Rest': 1,
    None: np.nan  # Explicitly convert None to NaN
})


  df_test['f354'] = df_test['f354'].replace({


In [13]:
df_test['f53'] = df_test['f53'].map({'NY': 1, 'NN': 0})
# df['your_column'] = df['your_column'].fillna(0)  # or .fillna(-1) if you want to distinguish None

df_test['f53'] = df_test['f53'].replace({None: np.nan})

In [14]:
# drops the columns with only unique values 0 and NaN

def drop_too(df,low,high):
    cols = [f"f{i}" for i in range(low, high+1)]
    
    # Drop columns where the only unique values are [0.0, NaN] or [0, np.nan]
    cols_to_drop = [
        col for col in cols 
        if set(df[col].dropna().unique()) == {0} and df[col].isnull().any()
    ]
    
    # Now drop them from the DataFrame
    df= df.drop(columns=cols_to_drop, inplace=True),
    return df,print(f"Dropped columns: {cols_to_drop}")

In [15]:
drop_too(df_train,226,309)

Dropped columns: ['f226', 'f229', 'f236', 'f238', 'f240', 'f243', 'f245', 'f246', 'f248', 'f249', 'f258', 'f259', 'f260', 'f262', 'f266', 'f267', 'f268', 'f270', 'f271', 'f277', 'f279', 'f281', 'f286', 'f287', 'f290', 'f291', 'f294', 'f295', 'f298', 'f300', 'f301', 'f303', 'f304', 'f307', 'f308', 'f309']


((None,), None)

In [16]:
drop_too(df_test,226,309)

Dropped columns: ['f226', 'f229', 'f236', 'f238', 'f240', 'f243', 'f245', 'f246', 'f248', 'f249', 'f258', 'f259', 'f260', 'f262', 'f266', 'f267', 'f268', 'f270', 'f271', 'f277', 'f279', 'f281', 'f286', 'f287', 'f290', 'f291', 'f294', 'f295', 'f298', 'f300', 'f301', 'f303', 'f304', 'f307', 'f308', 'f309']


((None,), None)

In [17]:
def drop_zero_nan_columns(df):
    """
    Drops columns from the DataFrame where the only unique non-null value is 0
    and the rest are NaNs.
    
    Example: [0, NaN, NaN, 0] → drop
    """
    cols_to_drop = [
        col for col in df.columns
        if set(df[col].dropna().unique()) == {0} and df[col].isnull().any()
    ]
    
    df = df.drop(columns=cols_to_drop)
    print(f"✅ Dropped columns: {cols_to_drop}")
    return df


In [18]:
df_train=drop_zero_nan_columns(df_train)
df_test=drop_zero_nan_columns(df_test)

✅ Dropped columns: ['f14', 'f15', 'f16', 'f17', 'f19', 'f20', 'f21', 'f23', 'f24', 'f25', 'f62', 'f66', 'f80', 'f88', 'f102', 'f128', 'f129', 'f144', 'f145', 'f334', 'f335']
✅ Dropped columns: ['f14', 'f15', 'f16', 'f19', 'f20', 'f21', 'f23', 'f24', 'f25', 'f62', 'f66', 'f71', 'f88', 'f102', 'f128', 'f129', 'f144', 'f145', 'f334', 'f335']


In [19]:
def bool_con(df):
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)
    return df

In [20]:
df_train=bool_con(df_train)

In [21]:
df_test=bool_con(df_test)

In [22]:
cols_to_drop = ['f59', 'f76']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)

cols_to_drop = ['f59', 'f76']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f123', 'f124','f125','f126']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f123', 'f124','f125','f126']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)

cols_to_drop = ['f146']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f146']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)

cols_to_drop = ['f199','f200','f201']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)
cols_to_drop = ['f199','f200','f201']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f310']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)
cols_to_drop = ['f310']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f332']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)
cols_to_drop = ['f332']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)

cols_to_drop = ['f353']
df_train.drop(columns=[col for col in cols_to_drop if col in df_train.columns], inplace=True)
cols_to_drop = ['f353']
df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns], inplace=True)


In [23]:
df_train.shape

(770164, 325)

In [24]:
df_test.shape

(369301, 325)

In [25]:
# df_train['id4'] = pd.to_datetime(df_train['id4'], errors='coerce')

df_train['id4'] = pd.to_datetime(df_train['id4'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
df_test['id4'] = pd.to_datetime(df_test['id4'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
df_train['id5'] = pd.to_datetime(df_train['id5'], format='%Y-%m-%d', errors='coerce')
df_test['id5'] = pd.to_datetime(df_test['id5'], format='%Y-%m-%d', errors='coerce')

In [26]:
def find_binary_columns(df):
    binary_cols = []
    for col in df.columns:
        unique_vals = df[col].dropna().unique()
        if set(unique_vals).issubset({0, 1}):
            binary_cols.append(col)
    
    print(f"✅ Found {len(binary_cols)} binary columns:")
    for col in binary_cols:
        print(f"  - {col}")
        
    return binary_cols


In [27]:
binary_columns = find_binary_columns(df_train)

# Step 2: Select non-binary columns
non_binary_columns = [col for col in df_train.columns if col not in binary_columns]
extra_cols = [f"f{i}" for i in range(226, 310) if f"f{i}" in df_train.columns]

# Step 3: Combine both lists
final_cols_to_exclude = binary_columns + extra_cols

✅ Found 90 binary columns:
  - y
  - f18
  - f50
  - f52
  - f53
  - f112
  - f122
  - f135
  - f136
  - f205
  - f227
  - f228
  - f230
  - f231
  - f232
  - f233
  - f234
  - f235
  - f237
  - f239
  - f241
  - f242
  - f244
  - f247
  - f250
  - f251
  - f252
  - f253
  - f254
  - f255
  - f256
  - f257
  - f261
  - f263
  - f264
  - f265
  - f269
  - f272
  - f273
  - f274
  - f275
  - f276
  - f278
  - f280
  - f282
  - f283
  - f284
  - f285
  - f288
  - f289
  - f292
  - f293
  - f296
  - f297
  - f299
  - f302
  - f305
  - f306
  - f333
  - f354
  - f359
  - f360
  - f42_G
  - f42_P
  - f42_R
  - f42_S
  - f54_A
  - f54_B
  - f54_C
  - f54_D
  - f54_E
  - f54_F
  - f55_-
  - f55_A
  - f55_C
  - f55_D
  - f55_G
  - f55_H
  - f55_I
  - f55_M
  - f55_T
  - f55_W
  - f56_B
  - f56_D
  - f56_G
  - f56_S
  - f57_A
  - f57_H
  - f57_P
  - f57_Z


In [28]:
df_train['f292'].unique()

array([ 0.,  1., nan])

In [29]:
# # Identify columns to drop from train
# threshold = 0.995
# drop_cols = df_train.columns[df_train.isna().mean() > threshold]

# # Drop from both train and test
# df_train = df_train.drop(columns=drop_cols)
# df_test = df_test.drop(columns=drop_cols)









Feature Engineering

In [30]:
event_features=pd.read_csv('event_features.csv')
offer_features=pd.read_csv('offer_features.csv')


In [31]:
transaction_features=pd.read_csv('transaction_features.csv')
# Merge df_train with transaction_features using 'id3'



In [32]:
transaction_features.head()

Unnamed: 0,id2,f367_sum,f367_mean,f367_std,f367_max,f367_min,signed_amount_sum,signed_amount_mean,f368_nunique,f374_nunique,id8_nunique,datetime_count,datetime_<lambda_0>,afternoon,evening,morning,night,industry_code,id8,id3
0,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,11148.0
1,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,556241.0
2,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,96003.0
3,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,547394.0
4,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,731777.0


In [33]:
transaction_features['id8'] = transaction_features['id8'].astype(str)
offer_features['id8'] = offer_features['id8'].astype(str)


In [35]:
merged_df = pd.merge(transaction_features, offer_features, on='id8', how='left')
new_merge = pd.merge()

MemoryError: Unable to allocate 27.6 GiB for an array with shape (3704576896,) and data type int64

In [36]:
df_train['id3'] = df_train['id3'].astype(str)
offer_features['id3'] = offer_features['id3'].astype(str)


In [37]:
df_train = df_train.merge(offer_features, on='id3', how='left')


In [38]:
df_test=df_test.merge(offer_features, on='id3', how='left')

In [39]:
df_train['id3'] = df_train['id3'].astype(str)
event_features['id3'] = event_features['id3'].astype(str)


In [40]:
df_train = df_train.merge(event_features, on='id3', how='left')
df_test = df_test.merge(event_features, on='id3', how='left')


In [41]:
transaction_features.head()

Unnamed: 0,id2,f367_sum,f367_mean,f367_std,f367_max,f367_min,signed_amount_sum,signed_amount_mean,f368_nunique,f374_nunique,id8_nunique,datetime_count,datetime_<lambda_0>,afternoon,evening,morning,night,industry_code,id8,id3
0,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,11148.0
1,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,556241.0
2,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,96003.0
3,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,547394.0
4,2000010,1988.86,165.738333,229.815205,625.85,1.0,0.0,0.0,1,6,7,12,0,1,0,4,7,59420000.0,59420000.0,731777.0


In [43]:
df_train['id3'] = df_train['id3'].astype(str)
transaction_features['id3'] = transaction_features['id3'].astype(str)


In [44]:
df_train = df_train.merge(transaction_features, on='id3', how='left')
df_test = df_test.merge(transaction_features, on='id3', how='left')

In [45]:
df_train.head()

Unnamed: 0,id1,id2_x,id3,id4,id5,y,f1,f2,f3,f4,...,f374_nunique,id8_nunique,datetime_count,datetime_<lambda_0>,afternoon,evening,morning,night,industry_code,id8_y
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,,,,,,,,,,
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,,,,,,,,,,
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,,,,,,,,,,
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,,,,,,,,,,
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,,,,,,,,,,


In [48]:
df_train['afternoon'].unique()

array([nan])

In [30]:
for col in df_train.columns:
    if col not in binary_columns and df_train[col].isna().any():
        df_train[col].fillna(df_train[col].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].median(), inplace=True)


In [29]:
for col in df_test.columns:
    if col not in binary_columns and df_test[col].isna().any():
        if col in df_train.columns:
            median_val = df_train[col].median()
            df_test[col].fillna(median_val, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(median_val, inplace=True)


In [39]:
# df_train['id3'] = df_train['id3'].astype(str)
# transaction_features['id3'] = transaction_features['id3'].astype(str)

In [40]:
# df_train = df_train.merge(transaction_features, on='id3', how='left')

In [41]:
# df_test = df_test.merge(transaction_features, on='id3', how='left')


In [42]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770164 entries, 0 to 770163
Columns: 492 entries, id1 to clicks_short
dtypes: datetime64[ns](2), float64(448), int64(39), object(3)
memory usage: 2.8+ GB


In [43]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369301 entries, 0 to 369300
Columns: 492 entries, id1 to clicks_short
dtypes: datetime64[ns](2), float64(457), int64(30), object(3)
memory usage: 1.4+ GB


In [44]:
df_train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,num_unique_users,num_users_clicked,min_click_delay,max_click_delay,std_click_delay,clicks_immediate,clicks_long,clicks_medium,clicks_no_click,clicks_short
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,31.0,31.0,40.0,...,10912,1083.0,0.002516,2405.285562,134.504541,916,2,31,17146,143
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,31.0,31.0,40.0,...,11788,948.0,0.000413,3129.141752,180.042932,774,3,37,19814,152
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,31.0,31.0,40.0,...,10976,752.0,0.002397,2533.306814,169.449032,592,2,39,17537,126
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,31.0,31.0,40.0,...,10840,759.0,0.00587,2948.89671,194.759984,583,3,33,17241,152
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,31.0,31.0,40.0,...,11090,774.0,0.01264,2648.256892,185.241993,608,2,37,17644,137


In [45]:
diff_cols = [col for col in df_test.columns if col not in df_train.columns]

print(f"🧪 Columns in test but not in train ({len(diff_cols)}):")
for col in diff_cols:
    print(f"  - {col}")


🧪 Columns in test but not in train (2):
  - f17
  - f80


In [46]:
extra_train_cols = [col for col in df_train.columns if col not in df_test.columns]

print(f"🎓 Columns in train but not in test ({len(extra_train_cols)}):")
for col in extra_train_cols:
    print(f"  - {col}")


🎓 Columns in train but not in test (2):
  - y
  - f71


In [47]:
df_test=df_test.drop(columns=['f17','f80'])
df_train=df_train.drop(columns=['f71'])

In [48]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770164 entries, 0 to 770163
Columns: 491 entries, id1 to clicks_short
dtypes: datetime64[ns](2), float64(447), int64(39), object(3)
memory usage: 2.8+ GB


In [49]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369301 entries, 0 to 369300
Columns: 490 entries, id1 to clicks_short
dtypes: datetime64[ns](2), float64(455), int64(30), object(3)
memory usage: 1.3+ GB


In [50]:
# df_train=df_train.fillna(0)
# df_test=df_test.fillna(0)

In [51]:
# from sklearn.model_selection import train_test_split
# import lightgbm as lgb
# from lightgbm import early_stopping, log_evaluation
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif


In [52]:
# def clean_column_names(df):
#     df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
#     return df

# df_train = clean_column_names(df_train)
# df_test = clean_column_names(df_test)

In [53]:
# # Step 1: Prepare data
# exclude_cols = ['y', 'id3', 'id4', 'id5', 'id1']  # keep id2 for grouping
# features = [col for col in df_train.columns if col not in exclude_cols]

# X = df_train[features]
# y = df_train['y']
# groups = df_train['id2']  # for ranking group

In [54]:
# # Step 2: Split data
# X_train, X_val, y_train, y_val, groups_train, groups_val = train_test_split(
#     X, y, groups, test_size=0.2, random_state=42, stratify=y
# )


In [55]:
# X_train = X_train.copy()
# X_val = X_val.copy()
# X_train = X_train.drop(columns=['id2'], errors='ignore')
# X_val = X_val.drop(columns=['id2'], errors='ignore')

In [56]:
# group_train_sizes = groups_train.groupby(groups_train).size().tolist()
# group_val_sizes = groups_val.groupby(groups_val).size().tolist()

# lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train_sizes)
# lgb_val = lgb.Dataset(X_val, label=y_val, group=group_val_sizes, reference=lgb_train)


In [57]:
# from sklearn.model_selection import GroupKFold
# from sklearn.feature_selection import SelectKBest, f_classif
# import lightgbm as lgb
# import numpy as np

# params = {
#     'learning_rate': 0.03,
#     'num_leaves': 32,          # was 16
#     'max_depth': 6,            # was 4
#     'min_child_samples': 100,  # was 300
#     'feature_fraction': 0.8,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'lambda_l1': 1,
#     'lambda_l2': 1,
#     'n_estimators': 5000,
#     'objective': 'lambdarank',
#     'metric': 'auc',
#     'boosting_type': 'gbdt',
#     'verbosity': -1,
#     'n_jobs': -1
# }

# # ✅ Define selected features (manually, no SelectKBest)


# kf = GroupKFold(n_splits=5)
# fold = 1
# auc_scores = []

# for train_idx, val_idx in kf.split(X, y, groups):
#     print(f"\n📂 Fold {fold}")
    
#     X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     # Extract and drop group column
#     group_train = X_train.pop('id2')
#     group_val = X_val.pop('id2')

#     # # Feature Selection
#     # selector = SelectKBest(score_func=f_classif, k=250)
#     # selector.fit(X_train, y_train)

#     # X_train_selected = selector.transform(X_train)
#     # X_val_selected = selector.transform(X_val)
#     # selected_features = X_train.columns[selector.get_support()]

#     # Convert back to DataFrame
#     # X_train = pd.DataFrame(X_train_selected, columns=selected_features, index=X_train.index)
#     # X_val = pd.DataFrame(X_val_selected, columns=selected_features, index=X_val.index)

#     # Group sizes
#     group_train_sizes = group_train.groupby(group_train).size().tolist()
#     group_val_sizes = group_val.groupby(group_val).size().tolist()

#     lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train_sizes)
#     lgb_val = lgb.Dataset(X_val, label=y_val, group=group_val_sizes, reference=lgb_train)

#     model = lgb.train(
#         params,
#         lgb_train,
#         valid_sets=[lgb_train, lgb_val],
#         valid_names=['train', 'valid'],
#         num_boost_round=500,
#         callbacks=[
#             early_stopping(stopping_rounds=50),
#             log_evaluation(period=50)
#         ]
#     )

#     auc = model.best_score['valid']['auc']
#     auc_scores.append(auc)
#     print(f"✅ Fold {fold} AUC: {auc:.5f}")
#     fold += 1

# print(f"\n📊 Average AUC over folds: {np.mean(auc_scores):.5f}")


In [58]:
# # Retain ID columns separately for later use in submission
# id_cols = df_test[['id1', 'id2', 'id3', 'id5']].copy()

# # Prepare test features by dropping ID columns
# # Get list of features that exist in both df_test and selected_features

# # Remove non-numeric features from selected_features
# numeric_features = [f for f in selected_features if pd.api.types.is_numeric_dtype(df_test[f])]

# # Use only numeric features for prediction
# X_test = df_test[numeric_features]


# # Make predictions using the trained model
# y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# # Create a final submission dataframe



In [59]:
# from sklearn.preprocessing import MinMaxScaler

# # Original predictions
# # test_preds = model.predict(X_test)

# # Scale to [0, 1]
# scaler = MinMaxScaler()
# test_preds_scaled = scaler.fit_transform(y_pred.reshape(-1, 1)).flatten()

# # Add to submission
# submission = id_cols.copy()  # id1, id2, id3, id5 already stored
# submission['pred'] = test_preds_scaled

# # Save CSV
# submission.to_csv('submission.csv', index=False)
# print("✅ Scaled submission saved as 'submission.csv'")


In [68]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import GroupShuffleSplit

# Step 1: Clean column names
df_train.columns = df_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Step 2: Prepare data
X = df_train.drop(columns=['y', 'id1', 'id3', 'id4', 'id5'])
y = df_train['y']
groups = df_train['id2']

# Step 3: Group split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx].copy()
X_val = X.iloc[val_idx].copy()
y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]
group_train = groups.iloc[train_idx]
group_val = groups.iloc[val_idx]

group_train_sizes = group_train.value_counts().sort_index().values
group_val_sizes = group_val.value_counts().sort_index().values

X_train = X_train.drop(columns=['id2'])
X_val = X_val.drop(columns=['id2'])

# Step 4: LightGBM datasets
lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train_sizes)
lgb_val = lgb.Dataset(X_val, label=y_val, group=group_val_sizes)

# Step 5: Objective function
def objective(trial):
    param = {
        'objective': 'lambdarank',
        'metric': 'map',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
        'feature_pre_filter': False,  # ✅ Prevent pre-filtering
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 16, 64),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
    }

    gbm = lgb.train(
        param,
        lgb_train,
        valid_sets=[lgb_val],
        valid_names=['valid'],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )

    return gbm.best_score['valid'].get('map', 0.0)

    # score = gbm.best_score['valid'].get('map')
    if score is None:
        print("⚠️ 'map' metric missing. Found keys:", gbm.best_score['valid'].keys())
        return 0.0
    return score

# Step 6: Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Step 7: Best results
print("\n✅ Best MAP Score:", study.best_value)
print("🏆 Best Hyperparameters:")
for k, v in study.best_params.items():
    print(f"{k}: {v}")


[I 2025-07-20 11:51:41,556] A new study created in memory with name: no-name-284c1cca-2b43-454e-a03d-35385a9f1fbb


Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.928034	valid's map@2: 0.928357	valid's map@3: 0.928586	valid's map@4: 0.929252	valid's map@5: 0.930115
[200]	valid's map@1: 0.929108	valid's map@2: 0.929753	valid's map@3: 0.930198	valid's map@4: 0.930943	valid's map@5: 0.931463


[I 2025-07-20 11:52:02,739] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.022737040030900227, 'num_leaves': 52, 'max_depth': 4, 'min_child_samples': 66, 'lambda_l1': 3.1204870035499646, 'lambda_l2': 2.4504408908372417, 'feature_fraction': 0.8766530247173325, 'bagging_fraction': 0.9052181436606378, 'bagging_freq': 6}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[167]	valid's map@1: 0.929968	valid's map@2: 0.93029	valid's map@3: 0.930481	valid's map@4: 0.931166	valid's map@5: 0.931715
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930827	valid's map@2: 0.931391	valid's map@3: 0.931561	valid's map@4: 0.932451	valid's map@5: 0.932972


[I 2025-07-20 11:52:15,326] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 0.061895847989666464, 'num_leaves': 22, 'max_depth': 9, 'min_child_samples': 59, 'lambda_l1': 3.641850337475792, 'lambda_l2': 4.834610978801282, 'feature_fraction': 0.9093203753448413, 'bagging_fraction': 0.7386127190869898, 'bagging_freq': 10}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[113]	valid's map@1: 0.931901	valid's map@2: 0.931686	valid's map@3: 0.931979	valid's map@4: 0.932916	valid's map@5: 0.933429
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931257	valid's map@2: 0.930666	valid's map@3: 0.931245	valid's map@4: 0.932038	valid's map@5: 0.932507


[I 2025-07-20 11:52:22,540] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 0.08159880697562191, 'num_leaves': 18, 'max_depth': 12, 'min_child_samples': 45, 'lambda_l1': 2.597566485550483, 'lambda_l2': 3.2624004312225963, 'feature_fraction': 0.9478222871393936, 'bagging_fraction': 0.5146420313628557, 'bagging_freq': 8}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[61]	valid's map@1: 0.931472	valid's map@2: 0.930908	valid's map@3: 0.931457	valid's map@4: 0.932212	valid's map@5: 0.932736
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.93072	valid's map@2: 0.931203	valid's map@3: 0.931665	valid's map@4: 0.932578	valid's map@5: 0.933071


[I 2025-07-20 11:52:38,269] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.0825158177530294, 'num_leaves': 53, 'max_depth': 8, 'min_child_samples': 74, 'lambda_l1': 3.79268083719234, 'lambda_l2': 0.6985467732715039, 'feature_fraction': 0.6396780841365175, 'bagging_fraction': 0.7919838249783331, 'bagging_freq': 9}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[113]	valid's map@1: 0.932116	valid's map@2: 0.932197	valid's map@3: 0.932489	valid's map@4: 0.933128	valid's map@5: 0.933684
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931149	valid's map@2: 0.931686	valid's map@3: 0.931919	valid's map@4: 0.932572	valid's map@5: 0.932866


[I 2025-07-20 11:52:49,771] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 0.05393421178183967, 'num_leaves': 51, 'max_depth': 7, 'min_child_samples': 57, 'lambda_l1': 1.6603894855509993, 'lambda_l2': 3.408014168495405, 'feature_fraction': 0.8514212741891469, 'bagging_fraction': 0.6088156452201952, 'bagging_freq': 4}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[97]	valid's map@1: 0.931042	valid's map@2: 0.931713	valid's map@3: 0.931952	valid's map@4: 0.932597	valid's map@5: 0.932825
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.929001	valid's map@2: 0.929028	valid's map@3: 0.929652	valid's map@4: 0.930609	valid's map@5: 0.931068
[200]	valid's map@1: 0.93072	valid's map@2: 0.930317	valid's map@3: 0.931042	valid's map@4: 0.931972	valid's map@5: 0.932423


[I 2025-07-20 11:53:11,059] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 0.02752386279654539, 'num_leaves': 17, 'max_depth': 10, 'min_child_samples': 68, 'lambda_l1': 1.163062897592511, 'lambda_l2': 3.3645880401717694, 'feature_fraction': 0.6755851277262261, 'bagging_fraction': 0.9291300357889221, 'bagging_freq': 5}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[246]	valid's map@1: 0.931794	valid's map@2: 0.931284	valid's map@3: 0.931603	valid's map@4: 0.932403	valid's map@5: 0.93291
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931579	valid's map@2: 0.930747	valid's map@3: 0.931227	valid's map@4: 0.932224	valid's map@5: 0.932662


[I 2025-07-20 11:53:22,089] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 0.08306522087135637, 'num_leaves': 18, 'max_depth': 6, 'min_child_samples': 72, 'lambda_l1': 2.476873293436638, 'lambda_l2': 4.336259207764764, 'feature_fraction': 0.8893497763286417, 'bagging_fraction': 0.6194753862552239, 'bagging_freq': 10}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[123]	valid's map@1: 0.932331	valid's map@2: 0.931284	valid's map@3: 0.931842	valid's map@4: 0.93278	valid's map@5: 0.933185
Training until validation scores don't improve for 50 rounds


[I 2025-07-20 11:53:29,895] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 0.022620236825388267, 'num_leaves': 38, 'max_depth': 11, 'min_child_samples': 86, 'lambda_l1': 3.6982910892653234, 'lambda_l2': 0.057155022378611586, 'feature_fraction': 0.9612760596521275, 'bagging_fraction': 0.9362740667209568, 'bagging_freq': 4}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[15]	valid's map@1: 0.929001	valid's map@2: 0.928034	valid's map@3: 0.9277	valid's map@4: 0.928739	valid's map@5: 0.929509
Training until validation scores don't improve for 50 rounds


[I 2025-07-20 11:53:35,994] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 0.05299720234249806, 'num_leaves': 28, 'max_depth': 10, 'min_child_samples': 38, 'lambda_l1': 4.450783007141889, 'lambda_l2': 3.584476261828573, 'feature_fraction': 0.8113120598952432, 'bagging_fraction': 0.5616199424174214, 'bagging_freq': 8}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[15]	valid's map@1: 0.929753	valid's map@2: 0.929404	valid's map@3: 0.929207	valid's map@4: 0.929849	valid's map@5: 0.930777
Training until validation scores don't improve for 50 rounds


[I 2025-07-20 11:53:47,577] Trial 9 finished with value: 0.0 and parameters: {'learning_rate': 0.01806080322838683, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 25, 'lambda_l1': 2.5226648211406855, 'lambda_l2': 2.8556773000451807, 'feature_fraction': 0.9766203563411586, 'bagging_fraction': 0.8075553061884712, 'bagging_freq': 2}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[48]	valid's map@1: 0.928464	valid's map@2: 0.929538	valid's map@3: 0.929702	valid's map@4: 0.930544	valid's map@5: 0.931226
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.929108	valid's map@2: 0.92935	valid's map@3: 0.929538	valid's map@4: 0.930479	valid's map@5: 0.931168


[I 2025-07-20 11:54:01,504] Trial 10 finished with value: 0.0 and parameters: {'learning_rate': 0.03788429596237933, 'num_leaves': 46, 'max_depth': 4, 'min_child_samples': 100, 'lambda_l1': 0.05120551295241693, 'lambda_l2': 1.7123274463927682, 'feature_fraction': 0.5076262279683896, 'bagging_fraction': 0.975913533504481, 'bagging_freq': 1}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[126]	valid's map@1: 0.92986	valid's map@2: 0.929699	valid's map@3: 0.930275	valid's map@4: 0.931081	valid's map@5: 0.931742
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930934	valid's map@2: 0.931122	valid's map@3: 0.931958	valid's map@4: 0.932492	valid's map@5: 0.933131


[I 2025-07-20 11:54:11,674] Trial 11 finished with value: 0.0 and parameters: {'learning_rate': 0.0632267413133311, 'num_leaves': 31, 'max_depth': 5, 'min_child_samples': 53, 'lambda_l1': 4.846957471550565, 'lambda_l2': 1.8123538177505036, 'feature_fraction': 0.7771337380508383, 'bagging_fraction': 0.7103372001672841, 'bagging_freq': 7}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[98]	valid's map@1: 0.931257	valid's map@2: 0.93123	valid's map@3: 0.931916	valid's map@4: 0.932499	valid's map@5: 0.933111
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.932009	valid's map@2: 0.932089	valid's map@3: 0.932185	valid's map@4: 0.932989	valid's map@5: 0.933466


[I 2025-07-20 11:54:30,494] Trial 12 finished with value: 0.0 and parameters: {'learning_rate': 0.06748696959050098, 'num_leaves': 62, 'max_depth': 9, 'min_child_samples': 83, 'lambda_l1': 3.405338547606521, 'lambda_l2': 4.330759601431013, 'feature_fraction': 0.8926838223694704, 'bagging_fraction': 0.8671968726306574, 'bagging_freq': 6}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[119]	valid's map@1: 0.932438	valid's map@2: 0.932573	valid's map@3: 0.932746	valid's map@4: 0.933121	valid's map@5: 0.933709
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931042	valid's map@2: 0.931015	valid's map@3: 0.931066	valid's map@4: 0.931991	valid's map@5: 0.932465


[I 2025-07-20 11:54:45,314] Trial 13 finished with value: 0.0 and parameters: {'learning_rate': 0.03878967088854995, 'num_leaves': 40, 'max_depth': 7, 'min_child_samples': 61, 'lambda_l1': 3.1451841620273653, 'lambda_l2': 4.9046589344805, 'feature_fraction': 0.7382095952585417, 'bagging_fraction': 0.7157591398988548, 'bagging_freq': 10}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[135]	valid's map@1: 0.931472	valid's map@2: 0.93166	valid's map@3: 0.932056	valid's map@4: 0.932887	valid's map@5: 0.933225
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.92739	valid's map@2: 0.927202	valid's map@3: 0.927485	valid's map@4: 0.92819	valid's map@5: 0.928867


[I 2025-07-20 11:54:55,983] Trial 14 finished with value: 0.0 and parameters: {'learning_rate': 0.010114260151341911, 'num_leaves': 28, 'max_depth': 4, 'min_child_samples': 44, 'lambda_l1': 4.17264603417855, 'lambda_l2': 2.1801312631763077, 'feature_fraction': 0.9010272809702272, 'bagging_fraction': 0.8662054407467378, 'bagging_freq': 3}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[99]	valid's map@1: 0.927927	valid's map@2: 0.927444	valid's map@3: 0.927629	valid's map@4: 0.928322	valid's map@5: 0.929006
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930075	valid's map@2: 0.930424	valid's map@3: 0.931376	valid's map@4: 0.932317	valid's map@5: 0.932772


[I 2025-07-20 11:55:10,540] Trial 15 finished with value: 0.0 and parameters: {'learning_rate': 0.09959290694092343, 'num_leaves': 55, 'max_depth': 8, 'min_child_samples': 29, 'lambda_l1': 1.9125102651387562, 'lambda_l2': 1.195563576595457, 'feature_fraction': 0.8247068769022046, 'bagging_fraction': 0.6646996487633792, 'bagging_freq': 7}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[63]	valid's map@1: 0.930505	valid's map@2: 0.93123	valid's map@3: 0.931856	valid's map@4: 0.932454	valid's map@5: 0.932714
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.92986	valid's map@2: 0.9308	valid's map@3: 0.930854	valid's map@4: 0.931744	valid's map@5: 0.932303


[I 2025-07-20 11:55:27,911] Trial 16 finished with value: 0.0 and parameters: {'learning_rate': 0.04026553529476752, 'num_leaves': 43, 'max_depth': 6, 'min_child_samples': 81, 'lambda_l1': 3.2159080187101385, 'lambda_l2': 2.6375261976797857, 'feature_fraction': 0.9966717957445617, 'bagging_fraction': 0.7845271063995771, 'bagging_freq': 6}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[123]	valid's map@1: 0.929968	valid's map@2: 0.931391	valid's map@3: 0.931164	valid's map@4: 0.93201	valid's map@5: 0.932564
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.932223	valid's map@2: 0.932062	valid's map@3: 0.931943	valid's map@4: 0.933051	valid's map@5: 0.933455


[I 2025-07-20 11:55:40,790] Trial 17 finished with value: 0.0 and parameters: {'learning_rate': 0.06405275054033686, 'num_leaves': 34, 'max_depth': 9, 'min_child_samples': 63, 'lambda_l1': 4.970811431200168, 'lambda_l2': 4.084314711036803, 'feature_fraction': 0.7232158449794286, 'bagging_fraction': 0.8557407627214497, 'bagging_freq': 9}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[94]	valid's map@1: 0.932653	valid's map@2: 0.932438	valid's map@3: 0.932125	valid's map@4: 0.933224	valid's map@5: 0.933599
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.929538	valid's map@2: 0.931015	valid's map@3: 0.931418	valid's map@4: 0.932112	valid's map@5: 0.932424


[I 2025-07-20 11:55:55,123] Trial 18 finished with value: 0.0 and parameters: {'learning_rate': 0.04716320669356166, 'num_leaves': 48, 'max_depth': 6, 'min_child_samples': 50, 'lambda_l1': 2.8085030416101517, 'lambda_l2': 4.957504337065923, 'feature_fraction': 0.9096901815937533, 'bagging_fraction': 0.7374262762779225, 'bagging_freq': 5}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[128]	valid's map@1: 0.931042	valid's map@2: 0.932143	valid's map@3: 0.93168	valid's map@4: 0.932556	valid's map@5: 0.933146
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931794	valid's map@2: 0.931955	valid's map@3: 0.932176	valid's map@4: 0.932634	valid's map@5: 0.933258


[I 2025-07-20 11:56:05,979] Trial 19 finished with value: 0.0 and parameters: {'learning_rate': 0.07149033873460064, 'num_leaves': 23, 'max_depth': 9, 'min_child_samples': 96, 'lambda_l1': 4.070777318408869, 'lambda_l2': 2.1906543431518486, 'feature_fraction': 0.8532407603441001, 'bagging_fraction': 0.6698439692714593, 'bagging_freq': 8}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[101]	valid's map@1: 0.932009	valid's map@2: 0.932035	valid's map@3: 0.932149	valid's map@4: 0.932879	valid's map@5: 0.933377
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.928571	valid's map@2: 0.92978	valid's map@3: 0.929657	valid's map@4: 0.930707	valid's map@5: 0.931307


[I 2025-07-20 11:56:15,184] Trial 20 finished with value: 0.0 and parameters: {'learning_rate': 0.029746172565786284, 'num_leaves': 58, 'max_depth': 5, 'min_child_samples': 34, 'lambda_l1': 0.86523569422903, 'lambda_l2': 1.1446219919553409, 'feature_fraction': 0.5902584019584615, 'bagging_fraction': 0.9853946850565366, 'bagging_freq': 7}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[55]	valid's map@1: 0.929108	valid's map@2: 0.929216	valid's map@3: 0.929422	valid's map@4: 0.930062	valid's map@5: 0.930866
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930183	valid's map@2: 0.931203	valid's map@3: 0.931158	valid's map@4: 0.93188	valid's map@5: 0.93228


[I 2025-07-20 11:56:26,663] Trial 21 finished with value: 0.0 and parameters: {'learning_rate': 0.08035028291216477, 'num_leaves': 23, 'max_depth': 12, 'min_child_samples': 44, 'lambda_l1': 2.1881232819606304, 'lambda_l2': 2.88325882033042, 'feature_fraction': 0.937789860206591, 'bagging_fraction': 0.533323711898182, 'bagging_freq': 9}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[134]	valid's map@1: 0.931042	valid's map@2: 0.931364	valid's map@3: 0.931603	valid's map@4: 0.932437	valid's map@5: 0.93288
Training until validation scores don't improve for 50 rounds


[I 2025-07-20 11:56:34,033] Trial 22 finished with value: 0.0 and parameters: {'learning_rate': 0.09761011450063305, 'num_leaves': 20, 'max_depth': 11, 'min_child_samples': 48, 'lambda_l1': 2.8254536830495107, 'lambda_l2': 3.9464277435904362, 'feature_fraction': 0.9324469800648865, 'bagging_fraction': 0.5142925828568201, 'bagging_freq': 8}. Best is trial 0 with value: 0.0.


[100]	valid's map@1: 0.93072	valid's map@2: 0.931257	valid's map@3: 0.93094	valid's map@4: 0.931956	valid's map@5: 0.932235
Early stopping, best iteration is:
[52]	valid's map@1: 0.931579	valid's map@2: 0.930934	valid's map@3: 0.931167	valid's map@4: 0.931964	valid's map@5: 0.932285
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931149	valid's map@2: 0.931069	valid's map@3: 0.931239	valid's map@4: 0.932321	valid's map@5: 0.932909


[I 2025-07-20 11:56:44,859] Trial 23 finished with value: 0.0 and parameters: {'learning_rate': 0.07503071538466108, 'num_leaves': 24, 'max_depth': 11, 'min_child_samples': 65, 'lambda_l1': 3.5943924993676495, 'lambda_l2': 3.1036989667892922, 'feature_fraction': 0.8556192314625586, 'bagging_fraction': 0.5761166386880586, 'bagging_freq': 10}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[109]	valid's map@1: 0.931257	valid's map@2: 0.93166	valid's map@3: 0.93129	valid's map@4: 0.932602	valid's map@5: 0.933108
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.931257	valid's map@2: 0.930532	valid's map@3: 0.930964	valid's map@4: 0.931789	valid's map@5: 0.932283


[I 2025-07-20 11:56:54,269] Trial 24 finished with value: 0.0 and parameters: {'learning_rate': 0.08845831085103383, 'num_leaves': 16, 'max_depth': 10, 'min_child_samples': 58, 'lambda_l1': 2.8298073685046403, 'lambda_l2': 2.255728594474978, 'feature_fraction': 0.7974615313462294, 'bagging_fraction': 0.670407176111369, 'bagging_freq': 9}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[99]	valid's map@1: 0.931364	valid's map@2: 0.930424	valid's map@3: 0.93106	valid's map@4: 0.93183	valid's map@5: 0.932325
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.932653	valid's map@2: 0.932062	valid's map@3: 0.932179	valid's map@4: 0.932821	valid's map@5: 0.933488


[I 2025-07-20 11:57:04,518] Trial 25 finished with value: 0.0 and parameters: {'learning_rate': 0.09019291723251008, 'num_leaves': 35, 'max_depth': 8, 'min_child_samples': 42, 'lambda_l1': 1.5119927099053827, 'lambda_l2': 1.6178824902432778, 'feature_fraction': 0.9502049040462256, 'bagging_fraction': 0.9113301465348269, 'bagging_freq': 8}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[69]	valid's map@1: 0.931794	valid's map@2: 0.931955	valid's map@3: 0.932173	valid's map@4: 0.933015	valid's map@5: 0.933483
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930934	valid's map@2: 0.931552	valid's map@3: 0.93163	valid's map@4: 0.93226	valid's map@5: 0.932837


[I 2025-07-20 11:57:18,348] Trial 26 finished with value: 0.0 and parameters: {'learning_rate': 0.05784720168877098, 'num_leaves': 26, 'max_depth': 12, 'min_child_samples': 76, 'lambda_l1': 2.1730643102477583, 'lambda_l2': 3.795095215201849, 'feature_fraction': 0.8739027839036343, 'bagging_fraction': 0.7612636996208504, 'bagging_freq': 6}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[136]	valid's map@1: 0.932009	valid's map@2: 0.93166	valid's map@3: 0.932065	valid's map@4: 0.932567	valid's map@5: 0.933186
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.93029	valid's map@2: 0.930827	valid's map@3: 0.931152	valid's map@4: 0.9318	valid's map@5: 0.932143
[200]	valid's map@1: 0.931472	valid's map@2: 0.931713	valid's map@3: 0.931904	valid's map@4: 0.932621	valid's map@5: 0.933027


[I 2025-07-20 11:57:34,187] Trial 27 finished with value: 0.0 and parameters: {'learning_rate': 0.044826390368266494, 'num_leaves': 20, 'max_depth': 7, 'min_child_samples': 20, 'lambda_l1': 3.0678273414508346, 'lambda_l2': 4.616143043381388, 'feature_fraction': 0.993271858200303, 'bagging_fraction': 0.8257945775469722, 'bagging_freq': 7}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[179]	valid's map@1: 0.931364	valid's map@2: 0.932035	valid's map@3: 0.932071	valid's map@4: 0.932612	valid's map@5: 0.933098
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.930934	valid's map@2: 0.930317	valid's map@3: 0.930961	valid's map@4: 0.93192	valid's map@5: 0.932481


[I 2025-07-20 11:57:45,586] Trial 28 finished with value: 0.0 and parameters: {'learning_rate': 0.07363302999071468, 'num_leaves': 44, 'max_depth': 5, 'min_child_samples': 55, 'lambda_l1': 4.364487353293372, 'lambda_l2': 2.5539415142973443, 'feature_fraction': 0.9303281012149861, 'bagging_fraction': 0.6225852818607901, 'bagging_freq': 10}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[142]	valid's map@1: 0.932116	valid's map@2: 0.931525	valid's map@3: 0.931743	valid's map@4: 0.932708	valid's map@5: 0.933257
Training until validation scores don't improve for 50 rounds
[100]	valid's map@1: 0.93029	valid's map@2: 0.931445	valid's map@3: 0.931883	valid's map@4: 0.932188	valid's map@5: 0.932813


[I 2025-07-20 11:57:58,458] Trial 29 finished with value: 0.0 and parameters: {'learning_rate': 0.06088714097554036, 'num_leaves': 54, 'max_depth': 9, 'min_child_samples': 71, 'lambda_l1': 3.848189615857711, 'lambda_l2': 3.1414837692757795, 'feature_fraction': 0.7679069868351465, 'bagging_fraction': 0.8931321528480576, 'bagging_freq': 9}. Best is trial 0 with value: 0.0.


Early stopping, best iteration is:
[76]	valid's map@1: 0.931579	valid's map@2: 0.93131	valid's map@3: 0.931651	valid's map@4: 0.932325	valid's map@5: 0.932854

✅ Best MAP Score: 0.0
🏆 Best Hyperparameters:
learning_rate: 0.022737040030900227
num_leaves: 52
max_depth: 4
min_child_samples: 66
lambda_l1: 3.1204870035499646
lambda_l2: 2.4504408908372417
feature_fraction: 0.8766530247173325
bagging_fraction: 0.9052181436606378
bagging_freq: 6


In [69]:
# Clean feature names again (in case new columns are added later)
df_train.columns = df_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Drop problematic object/datetime columns
df_full = df_train.drop(columns=['id1', 'id3', 'id4', 'id5'])

# Define features and labels
X_full = df_full.drop(columns=['y', 'id2'])
y_full = df_full['y']
groups_full = df_full['id2']

# Compute group sizes
group_full_sizes = groups_full.value_counts().sort_index().values


In [70]:
lgb_full = lgb.Dataset(X_full, label=y_full, group=group_full_sizes)


In [71]:
best_params = study.best_params

# Add required fixed parameters
best_params.update({
    'objective': 'lambdarank',
    'metric': 'map',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'feature_pre_filter': False  # important!
})


In [72]:
final_model = lgb.train(
    best_params,
    lgb_full,
    num_boost_round=1000
)


In [73]:
from sklearn.metrics import label_ranking_average_precision_score

# Step 1: Predict scores
y_pred = final_model.predict(X_val)

# Step 2: Prepare data for MAP calculation
# Group the predictions and true labels back into their original sessions/queries
X_val_with_ids = X_val.copy()
X_val_with_ids['y_true'] = y_val
X_val_with_ids['y_pred'] = y_pred
X_val_with_ids['id2'] = df_train['id2'].iloc[X_val.index]  # reattach group ids

# Step 3: Compute MAP@K manually
average_precisions = []

for session_id, group in X_val_with_ids.groupby('id2'):
    if group['y_true'].sum() == 0:
        continue  # skip groups with no positive labels
    sorted_group = group.sort_values('y_pred', ascending=False)
    y_true = sorted_group['y_true'].values
    average_precision = label_ranking_average_precision_score([y_true], [y_true])
    average_precisions.append(average_precision)

# Final MAP score
final_map_score = np.mean(average_precisions)
print(f"✅ Final MAP Score: {final_map_score:.5f}")


✅ Final MAP Score: 1.00000


In [76]:
df_test.columns = df_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)


In [77]:
X_test = df_test.drop(columns=['id1', 'id2', 'id3', 'id4', 'id5'], errors='ignore')


In [84]:
# df_test['pred'] = final_model.predict(X_test)


In [83]:
# 1. Make sure the test features match the training features
X_test = df_test[X_train.columns]  # X_train is the feature matrix used during training

# 2. Predict using the final trained model
raw_preds = final_model.predict(X_test)

# 3. Add raw predictions to df_test
df_test['raw_pred'] = raw_preds

# 4. Normalize predictions within each group (based on 'id2')
#    So values are between 0 and 1 within each group
df_test['pred'] = df_test.groupby('id2')['raw_pred'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min() + 1e-8)
)

# 5. Save to CSV with required columns
df_test[['id1', 'id2', 'id3', 'id5', 'pred']].to_csv("predictions.csv", index=False)

print("✅ Saved predictions with 'pred' column to predictions.csv")


✅ Saved predictions with 'pred' column to predictions.csv
