In [1]:
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the ads tables and the feeds tables (train + test each) from the
# working directory.
train_user, test_user = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data_ads.csv', 'test_data_ads.csv')
)
train_adv, test_adv = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data_feeds.csv', 'test_data_feeds.csv')
)

In [3]:
# Preview the first five rows of the test feeds table.
test_adv.head(n=5)

Unnamed: 0,u_userId,u_phonePrice,u_browserLifeCycle,u_browserMode,u_feedLifeCycle,u_refreshTimes,u_newsCatInterests,u_newsCatDislike,u_newsCatInterestsST,u_click_ca2_news,...,e_ch,e_m,e_po,e_pl,e_rn,e_section,e_et,label,cillabel,pro
0,215116,16,17,14,11,0,65^72^42^65^104,0,219^50^151^21^140,219^50^151^114^21,...,19,998,9,2449,13,0,202206100922,-1,-1,0
1,215116,16,17,14,11,0,65^72^42^65^104,0,219^50^151^21^140,219^50^151^114^21,...,19,998,7,2449,14,0,202206100924,-1,-1,0
2,215116,16,17,14,11,0,65^72^42^65^104,0,0,65^104^98^168^0,...,19,998,14,2449,1,0,202206100907,1,-1,40
3,215116,16,17,14,11,0,65^72^42^65^104,0,219^50^151^21^140,219^50^151^114^21,...,19,998,5,2449,11,0,202206100921,-1,-1,0
4,215116,16,17,14,11,0,65^72^42^65^104,0,219^50^151^21^140,219^50^151^114^21,...,19,998,3,2449,14,0,202206100922,-1,-1,0


In [4]:
# The feeds tables call the user key `u_userId`; rename it to `user_id`, the
# key used when the feeds aggregates are merged onto the ads table later on.
train_adv = train_adv.rename(columns={'u_userId': 'user_id'})
test_adv = test_adv.rename(columns={'u_userId': 'user_id'})

In [5]:
# Columns of interest in the ads (user) tables.
user_var = [
    'user_id', 'log_id',
    'age', 'gender', 'residence',
    'device_name', 'device_size', 'net_type',
    'task_id', 'adv_id', 'creat_type_cd',
]

# Columns of interest in the feeds (adv) tables.
adv_var = ['user_id', 'label']

In [6]:
def safe_keep(df, cols, name):
    """Return the entries of `cols` that are columns of `df`, in the order given.

    Requested columns that `df` does not have are reported with a single
    printed line tagged with `name`; nothing is printed when none are missing.
    """
    present, absent = [], []
    for col in cols:
        (present if col in df.columns else absent).append(col)
    if absent:
        print(f"[{name}] missing in the dataset: {absent}")
    return present

# Resolve which of the requested columns each table actually contains
# (ads tables first, then feeds tables).
train_user_cols, test_user_cols = (
    safe_keep(frame, user_var, tag)
    for frame, tag in ((train_user, "train_user"), (test_user, "test_user"))
)
train_adv_cols, test_adv_cols = (
    safe_keep(frame, adv_var, tag)
    for frame, tag in ((train_adv, "train_adv"), (test_adv, "test_adv"))
)

In [7]:
# Flag each row's origin (0 = train, 1 = test) so the split can be recovered
# after the two ads tables are stacked into one frame.
train_user.loc[:, 'istest'] = 0
test_user.loc[:, 'istest'] = 1
data_user = pd.concat((train_user, test_user), axis=0, ignore_index=True)

# The separate frames are no longer needed; release them.
del train_user, test_user
gc.collect()

0

In [8]:
# Flag each row's origin (0 = train, 1 = test) so the split can be recovered
# after the two feeds tables are stacked into one frame.
train_adv.loc[:, 'istest'] = 0
test_adv.loc[:, 'istest'] = 1
data_adv = pd.concat((train_adv, test_adv), axis=0, ignore_index=True)

# The separate frames are no longer needed; release them.
del train_adv, test_adv
gc.collect()

0

In [9]:
# Preview the first five rows of the stacked feeds table.
data_adv.head(n=5)

Unnamed: 0,user_id,u_phonePrice,u_browserLifeCycle,u_browserMode,u_feedLifeCycle,u_refreshTimes,u_newsCatInterests,u_newsCatDislike,u_newsCatInterestsST,u_click_ca2_news,...,e_m,e_po,e_pl,e_rn,e_section,e_et,label,cillabel,pro,istest
0,135880,16,17,10,17,0,195^168^109^98^108,0,195^44^168^112^21,195^168^44^112^21,...,1217,1,561,2,0,202206081521,-1,-1,0,0
1,135880,16,17,10,17,0,195^168^109^98^108,0,195^44^168^112^21,195^168^44^112^21,...,1217,9,561,1,0,202206081521,-1,-1,0,0
2,135880,16,17,10,17,0,195^168^109^98^108,0,195^44^168^112^21,195^168^44^112^21,...,1217,18,561,1,0,202206081521,-1,-1,0,0
3,135880,16,17,10,17,0,195^168^109^98^108,0,195^44^168^112^21,195^168^44^112^21,...,1217,7,561,1,1,202206081521,-1,-1,0,0
4,135880,16,17,10,17,0,195^168^109^98^108,0,195^44^168^112^21,195^168^44^112^21,...,1217,7,561,2,0,202206081522,-1,-1,0,0


In [10]:
# Keep only the training rows of the feeds table, as an independent copy.
adv_train_only = data_adv.loc[data_adv['istest'].eq(0)].copy()

In [11]:
def to01(s):
    """Return a new integer Series with -1 mapped to 0 and 1 kept as 1.

    Any other value is passed through unchanged before the integer cast.
    The input Series is not modified.
    """
    return s.replace({-1: 0, 1: 1}).astype(int)

# Binary (0/1) version of the feeds label, used for the click aggregates.
adv_train_only['label01'] = adv_train_only['label'].pipe(to01)

In [12]:
# Per-user feeds statistics from the training feeds rows:
# number of rows, number of positive labels, and their ratio.
feeds_by_user = adv_train_only.groupby('user_id', as_index=False)
user_agg = feeds_by_user.agg(
    feeds_imps=pd.NamedAgg(column='label01', aggfunc='count'),
    feeds_clicks=pd.NamedAgg(column='label01', aggfunc='sum'),
    feeds_ctr=pd.NamedAgg(column='label01', aggfunc='mean'),
)

In [13]:
# Attach the per-user feeds aggregates to every ads row.
#
# Aggregate columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would produce suffixed duplicates
# (feeds_imps_x / feeds_imps_y) and the fill below would no longer apply.
# validate= makes pandas raise if user_agg ever holds more than one row per
# user_id, which would silently duplicate ads rows.
feeds_cols = ['feeds_imps', 'feeds_clicks', 'feeds_ctr']
stale_cols = [c for c in feeds_cols if c in data_user.columns]
if stale_cols:
    data_user = data_user.drop(columns=stale_cols)
data_user = data_user.merge(user_agg, on='user_id', how='left', validate='many_to_one')

# missing data (user never appeared in feeds/adv):
# counts become 0, the CTR falls back to the mean of the observed values
data_user[['feeds_imps', 'feeds_clicks']] = data_user[['feeds_imps', 'feeds_clicks']].fillna(0)
data_user['feeds_ctr'] = data_user['feeds_ctr'].fillna(data_user['feeds_ctr'].mean())

# merge final: split back into train / test using the origin flag
train_merged = data_user[data_user['istest']==0].drop(columns=['istest']).reset_index(drop=True)
test_merged  = data_user[data_user['istest']==1].drop(columns=['istest']).reset_index(drop=True)

print("Train rows:", len(train_merged), " | Test rows:", len(test_merged))
print("Train columns:", len(train_merged.columns), " | Test columns:", len(test_merged.columns))

Train rows: 7675517  | Test rows: 976058
Train columns: 38  | Test columns: 38


In [14]:
# Column/dtype overview of both merged frames, then a preview of the train rows.
for merged_frame in (train_merged, test_merged):
    merged_frame.info()
train_merged.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7675517 entries, 0 to 7675516
Data columns (total 38 columns):
 #   Column                Dtype  
---  ------                -----  
 0   log_id                int64  
 1   label                 float64
 2   user_id               int64  
 3   age                   int64  
 4   gender                int64  
 5   residence             int64  
 6   city                  int64  
 7   city_rank             int64  
 8   series_dev            int64  
 9   series_group          int64  
 10  emui_dev              int64  
 11  device_name           int64  
 12  device_size           int64  
 13  net_type              int64  
 14  task_id               int64  
 15  adv_id                int64  
 16  creat_type_cd         int64  
 17  adv_prim_id           int64  
 18  inter_type_cd         int64  
 19  slot_id               int64  
 20  site_id               int64  
 21  spread_app_id         int64  
 22  hispace_app_tags      int64  
 23  app_sec

Unnamed: 0,log_id,label,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,ad_close_list_v001,ad_close_list_v002,ad_close_list_v003,pt_d,u_newsCatInterestsST,u_refreshTimes,u_feedLifeCycle,feeds_imps,feeds_clicks,feeds_ctr
0,373250,0.0,100005,3,2,16,147,2,32,6,...,24107,1218,173,202206030326,39^220^16,0,15,5.0,0.0,0.0
1,373253,1.0,100005,3,2,16,147,2,32,6,...,24107,1218,173,202206030326,39^220^16,0,15,5.0,0.0,0.0
2,373252,1.0,100005,3,2,16,147,2,32,6,...,24107,1218,173,202206030326,39^220^16,0,15,5.0,0.0,0.0
3,373251,0.0,100005,3,2,16,147,2,32,6,...,24107,1218,173,202206030326,39^220^16,0,15,5.0,0.0,0.0
4,373255,0.0,100005,3,2,16,147,2,32,6,...,24107,1218,173,202206030328,39^220^16,0,15,5.0,0.0,0.0


### Variables Summary

The following table summarizes all variables used in the CTR prediction model, their meanings, and data sources:

| Variable | Description | Data Source |
|-----------|--------------|--------------|
| *label* | Response variable: whether the user clicked the ad (1 = click, 0 = no click) | Target domain |
| *user_id* | Unique user identifier | Target domain |
| *log_id* | Unique sample identifier | Target domain |
| *age* | User age group | Target domain |
| *gender* | User gender | Target domain |
| *residence* | User residence province ID | Target domain |
| *device_name* | Device model name | Target domain |
| *device_size* | Screen size category | Target domain |
| *net_type* | Network type (e.g., WiFi, 4G, 5G) | Target domain |
| *task_id* | Ad campaign ID | Target domain |
| *adv_id* | Ad material ID | Target domain |
| *creat_type_cd* | Creative type ID | Target domain |
| *feeds_imps* | Total number of content impressions per user | Source domain (aggregated) |
| *feeds_clicks* | Total number of content clicks per user | Source domain (aggregated) |
| *feeds_ctr* | Average content CTR per user | Source domain (aggregated) |

**Model goal:**
Predict `label` from user, ad, device, and source-domain behavior features.

**Input features (X):**  
All columns except *label*, *user_id*, and *log_id*.  

**Response variable (Y):**  
*label* (0/1)


### Baseline Models: Logistic regression

## Feature Encoding Analysis & Implementation

### One-Hot Encoding for Categorical Variables

First, let's examine the cardinality (number of unique values) of our categorical features to determine which are suitable for one-hot encoding.


In [15]:
# Check cardinality of categorical features
categorical_features = ['gender', 'age', 'residence', 'city', 'city_rank', 
                       'series_dev', 'series_group', 'emui_dev', 'device_name', 
                       'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
                       'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 
                       'spread_app_id', 'hispace_app_tags', 'app_second_class']

print("Categorical Feature Cardinality Analysis")
print("=" * 50)

for col in categorical_features:
    # Report (rather than silently skip) names that are not in the frame, so a
    # typo in the list above does not go unnoticed.
    if col not in train_merged.columns:
        print(f"{col:25} | not found in train_merged - skipped")
        continue

    n_unique = train_merged[col].nunique()

    # Categorize recommendation by number of distinct values
    if n_unique <= 10:
        recommendation = "✓ One-hot encode (low cardinality)"
    elif n_unique <= 50:
        recommendation = "⚠ Consider target encoding or grouping"
    else:
        recommendation = "✗ Too high cardinality - use target/frequency encoding"

    print(f"{col:25} | {n_unique:6} unique | {recommendation}")

print("\n" + "=" * 50)


Categorical Feature Cardinality Analysis
gender                    |      3 unique | ✓ One-hot encode (low cardinality)
age                       |      8 unique | ✓ One-hot encode (low cardinality)
residence                 |     35 unique | ⚠ Consider target encoding or grouping
city                      |    341 unique | ✗ Too high cardinality - use target/frequency encoding
city_rank                 |      4 unique | ✓ One-hot encode (low cardinality)
series_dev                |     27 unique | ⚠ Consider target encoding or grouping
series_group              |      7 unique | ✓ One-hot encode (low cardinality)
emui_dev                  |     27 unique | ⚠ Consider target encoding or grouping
device_name               |    256 unique | ✗ Too high cardinality - use target/frequency encoding
device_size               |   1547 unique | ✗ Too high cardinality - use target/frequency encoding
net_type                  |      6 unique | ✓ One-hot encode (low cardinality)
task_id           

### Implementation: One-Hot Encoding

Based on the cardinality analysis, we'll:
1. **One-hot encode**: Features with ≤10 unique values (gender, net_type, etc.)
2. **Keep as numeric**: Features that are truly ordinal (age, city_rank) or have natural ordering
3. **Handle separately**: High cardinality features (device_name, adv_id, etc.)


In [16]:
# One-hot encode gender (confirmed low cardinality from above analysis)
print("Before encoding:")
print(f"Gender unique values: {train_merged['gender'].unique()}")
print(f"Gender value counts:\n{train_merged['gender'].value_counts().sort_index()}\n")

# Fix the category set from the training data so train and test always get the
# same dummy columns and the same dropped reference level. Calling get_dummies
# on each frame independently derives the columns from whatever values that
# frame happens to contain, which can silently misalign the two matrices.
# A test value never seen in training becomes NaN under this dtype and is
# encoded as all zeros, i.e. it is treated like the reference level.
gender_dtype = pd.CategoricalDtype(categories=sorted(train_merged['gender'].unique()))
gender_dummies_train = pd.get_dummies(train_merged['gender'].astype(gender_dtype), prefix='gender', drop_first=True)
gender_dummies_test = pd.get_dummies(test_merged['gender'].astype(gender_dtype), prefix='gender', drop_first=True)

# Drop original gender column and add one-hot encoded columns
train_encoded = train_merged.drop(columns=['gender']).reset_index(drop=True)
test_encoded = test_merged.drop(columns=['gender']).reset_index(drop=True)

# Concatenate one-hot encoded features
train_encoded = pd.concat([train_encoded, gender_dummies_train], axis=1)
test_encoded = pd.concat([test_encoded, gender_dummies_test], axis=1)

print("After one-hot encoding gender:")
print(f"New columns: {gender_dummies_train.columns.tolist()}")
print(f"Train shape: {train_encoded.shape}")
print(f"Test shape: {test_encoded.shape}")
print(f"\nGender column details:\n{gender_dummies_train.sum()}")


Before encoding:
Gender unique values: [2 4 3]
Gender value counts:
gender
2    5946172
3     550437
4    1178908
Name: count, dtype: int64

After one-hot encoding gender:
New columns: ['gender_3', 'gender_4']
Train shape: (7675517, 39)
Test shape: (976058, 39)

Gender column details:
gender_3     550437
gender_4    1178908
dtype: int64


### Discussion: Other Variables to Consider for One-Hot Encoding

**✓ Good candidates for one-hot encoding:**
- **net_type**: Network type (WiFi, 4G, 5G) - nominal, 6 categories
- **inter_type_cd**: Interaction type - likely few categories
- **series_group**: Device series group - nominal feature

**⚠ Consider but might need grouping:**
- **city_rank**:
  - If truly ordinal (1=Tier 1, 2=Tier 2, etc.) → keep numeric
  - If nominal → one-hot encode
- **creat_type_cd**: Creative type - depends on number of unique values

**✗ NOT suitable for one-hot (high cardinality):**
- **device_name**: 256 unique device models (see the cardinality analysis above)
- **adv_id**: Unique ad IDs
- **residence, city**: Geographic IDs with many unique values
- **task_id**: Unique campaign IDs

**📊 Treatment by Feature Type:**

| Feature | Encoding Strategy | Reason |
|---------|------------------|--------|
| gender | **One-hot** | Nominal (male/female/other), low cardinality |
| age | **Numeric** | Ordinal (ordered age groups) |
| city_rank | **Numeric** | Ordinal (Tier 1 > Tier 2 > Tier 3) |
| net_type | **One-hot** | Nominal (WiFi ≠ 4G ≠ 5G), low cardinality |
| residence | **Target encoding** | Medium cardinality (35 unique values), nominal |
| device_name | **Target encoding** | High cardinality (256 unique devices) |
| adv_id | **Target encoding** | Unique ad IDs, no meaningful order |

**Why drop_first=True?**
- Prevents multicollinearity (dummy variable trap)
- For gender with k categories, we create k-1 dummy variables
- One category becomes the reference (baseline)


In [17]:
# One-hot encode net_type
print("Before encoding:")
print(f"net_type unique values: {train_encoded['net_type'].unique()}")
print(f"net_type value counts:\n{train_encoded['net_type'].value_counts().sort_index()}\n")

# Fix the category set from the training data so train and test always get the
# same dummy columns and the same dropped reference level, even when a rare
# level is absent from one of the frames. A test value never seen in training
# becomes NaN under this dtype and is encoded as all zeros (treated like the
# reference level).
net_type_dtype = pd.CategoricalDtype(categories=sorted(train_encoded['net_type'].unique()))
net_type_dummies_train = pd.get_dummies(train_encoded['net_type'].astype(net_type_dtype), prefix='net_type', drop_first=True)
net_type_dummies_test = pd.get_dummies(test_encoded['net_type'].astype(net_type_dtype), prefix='net_type', drop_first=True)

# Drop original net_type column and add one-hot encoded columns
train_encoded = train_encoded.drop(columns=['net_type']).reset_index(drop=True)
test_encoded = test_encoded.drop(columns=['net_type']).reset_index(drop=True)

# Concatenate one-hot encoded features
train_encoded = pd.concat([train_encoded, net_type_dummies_train], axis=1)
test_encoded = pd.concat([test_encoded, net_type_dummies_test], axis=1)

print("After one-hot encoding net_type:")
print(f"New columns: {net_type_dummies_train.columns.tolist()}")
print(f"Train shape: {train_encoded.shape}")
print(f"Test shape: {test_encoded.shape}")
print(f"\nNet_type column details:\n{net_type_dummies_train.sum()}")


Before encoding:
net_type unique values: [7 6 4 3 2 5]
net_type value counts:
net_type
2       1278
3     302668
4     872000
5       2043
6     924118
7    5573410
Name: count, dtype: int64

After one-hot encoding net_type:
New columns: ['net_type_3', 'net_type_4', 'net_type_5', 'net_type_6', 'net_type_7']
Train shape: (7675517, 43)
Test shape: (976058, 43)

Net_type column details:
net_type_3     302668
net_type_4     872000
net_type_5       2043
net_type_6     924118
net_type_7    5573410
dtype: int64


In [18]:
# Inspect slot_id: distinct values, how many there are, and the most frequent ones.
slot_ids = train_merged['slot_id']

print("slot_id unique values:")
print(sorted(slot_ids.unique()))
print(f"\nTotal unique slot_id values: {slot_ids.nunique()}")
print("\nslot_id value counts (top 10):")
print(slot_ids.value_counts().head(10))


slot_id unique values:
[np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(70), np.int64(71)]

Total unique slot_id values: 60

slot_id value counts (top 10):
slot_id
16    1794442
17     555059
54     438012
38     388710
50     

### Target Encoding for High Cardinality Features



In [19]:
# Target encoding for slot_id (60 unique values)
# Each slot_id is replaced by the mean training label observed for that slot;
# values without a training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
slot_encoding = train_merged.groupby('slot_id')['label'].mean()

# pop() removes the raw id column and hands it over for mapping in one step,
# so the encoded column is appended and the original dropped.
train_encoded['slot_id_encoded'] = train_encoded.pop('slot_id').map(slot_encoding).fillna(global_ctr)
test_encoded['slot_id_encoded'] = test_encoded.pop('slot_id').map(slot_encoding).fillna(global_ctr)

print("After target encoding slot_id:")
print(f"Train shape: {train_encoded.shape}")
print(f"Test shape: {test_encoded.shape}")
print("\nSlot_id encoding stats:")
print(f"Min encoded value: {slot_encoding.min():.4f}")
print(f"Max encoded value: {slot_encoding.max():.4f}")
print(f"Mean encoded value: {slot_encoding.mean():.4f}")


After target encoding slot_id:
Train shape: (7675517, 43)
Test shape: (976058, 43)

Slot_id encoding stats:
Min encoded value: 0.0000
Max encoded value: 0.4444
Mean encoded value: 0.0306


In [20]:
# Target encoding for device_name (256 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
device_encoding = train_merged.groupby('device_name')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['device_name_encoded'] = train_encoded.pop('device_name').map(device_encoding).fillna(global_ctr)
test_encoded['device_name_encoded'] = test_encoded.pop('device_name').map(device_encoding).fillna(global_ctr)
print("Device_name encoding complete")


Device_name encoding complete


In [21]:
# Target encoding for task_id (11,209 unique values)
#
# With this many levels, encoding a training row with a mean that includes the
# row's own label leaks the target into the feature. Training rows therefore
# get an out-of-fold encoding: each row is encoded with means computed from
# the other folds only. Test rows use the mapping fitted on all training rows.
TE_FOLDS = 5
task_prior = train_merged['label'].mean()          # fallback for unseen task_ids
task_keys = train_merged['task_id']
task_labels = train_merged['label']
task_fold = np.arange(len(train_merged)) % TE_FOLDS  # deterministic fold assignment

task_oof = np.full(len(train_merged), task_prior, dtype='float64')
for fold in range(TE_FOLDS):
    held_out = task_fold == fold
    fold_means = task_labels[~held_out].groupby(task_keys[~held_out]).mean()
    task_oof[held_out] = task_keys[held_out].map(fold_means).fillna(task_prior).to_numpy()

# train_encoded is derived from train_merged by column operations only, so the
# rows line up positionally; the assert guards that assumption.
assert len(train_encoded) == len(train_merged)
train_encoded['task_id_encoded'] = task_oof

task_encoding = train_merged.groupby('task_id')['label'].mean()
test_encoded['task_id_encoded'] = test_encoded['task_id'].map(task_encoding).fillna(task_prior)

train_encoded = train_encoded.drop(columns=['task_id'])
test_encoded = test_encoded.drop(columns=['task_id'])
print("Task_id encoding complete")


Task_id encoding complete


In [22]:
# Target encoding for adv_id (12,615 unique values)
#
# With this many levels, encoding a training row with a mean that includes the
# row's own label leaks the target into the feature. Training rows therefore
# get an out-of-fold encoding: each row is encoded with means computed from
# the other folds only. Test rows use the mapping fitted on all training rows.
TE_FOLDS = 5
adv_prior = train_merged['label'].mean()           # fallback for unseen adv_ids
adv_keys = train_merged['adv_id']
adv_labels = train_merged['label']
adv_fold = np.arange(len(train_merged)) % TE_FOLDS   # deterministic fold assignment

adv_oof = np.full(len(train_merged), adv_prior, dtype='float64')
for fold in range(TE_FOLDS):
    held_out = adv_fold == fold
    fold_means = adv_labels[~held_out].groupby(adv_keys[~held_out]).mean()
    adv_oof[held_out] = adv_keys[held_out].map(fold_means).fillna(adv_prior).to_numpy()

# train_encoded is derived from train_merged by column operations only, so the
# rows line up positionally; the assert guards that assumption.
assert len(train_encoded) == len(train_merged)
train_encoded['adv_id_encoded'] = adv_oof

adv_encoding = train_merged.groupby('adv_id')['label'].mean()
test_encoded['adv_id_encoded'] = test_encoded['adv_id'].map(adv_encoding).fillna(adv_prior)

train_encoded = train_encoded.drop(columns=['adv_id'])
test_encoded = test_encoded.drop(columns=['adv_id'])
print("Adv_id encoding complete")


Adv_id encoding complete


In [23]:
# Target encoding for city (341 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
city_encoding = train_merged.groupby('city')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['city_encoded'] = train_encoded.pop('city').map(city_encoding).fillna(global_ctr)
test_encoded['city_encoded'] = test_encoded.pop('city').map(city_encoding).fillna(global_ctr)
print("City encoding complete")


City encoding complete


In [24]:
# Target encoding for adv_prim_id (545 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
adv_prim_encoding = train_merged.groupby('adv_prim_id')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['adv_prim_id_encoded'] = train_encoded.pop('adv_prim_id').map(adv_prim_encoding).fillna(global_ctr)
test_encoded['adv_prim_id_encoded'] = test_encoded.pop('adv_prim_id').map(adv_prim_encoding).fillna(global_ctr)
print("Adv_prim_id encoding complete")


Adv_prim_id encoding complete


In [25]:
# Summary so far: frame shapes and the target-encoded columns created above.
encoded_cols = [col for col in train_encoded.columns if '_encoded' in col]

print("Encoding complete!")
print(f"Final train shape: {train_encoded.shape}")
print(f"Final test shape: {test_encoded.shape}")
print(f"\nEncoded columns: {encoded_cols}")


Encoding complete!
Final train shape: (7675517, 43)
Final test shape: (976058, 43)

Encoded columns: ['slot_id_encoded', 'device_name_encoded', 'task_id_encoded', 'adv_id_encoded', 'city_encoded', 'adv_prim_id_encoded']


In [26]:
# site_id holds a single value for every row, so it carries no signal; remove
# it from both frames.
train_encoded, test_encoded = (
    frame.drop(columns='site_id') for frame in (train_encoded, test_encoded)
)
print(f"After dropping site_id - Train shape: {train_encoded.shape}, Test shape: {test_encoded.shape}")


After dropping site_id - Train shape: (7675517, 42), Test shape: (976058, 42)


### Encoding Remaining Categorical Features

One-hot encoding for low cardinality features (creat_type_cd, inter_type_cd, series_group) and target encoding for high cardinality feature (device_size).


In [27]:
# One-hot encode creat_type_cd (9 unique values)
#
# The category set is fixed from the training data so train and test always
# get the same dummy columns and the same dropped reference level; running
# get_dummies on each frame independently would derive them from whatever
# values each frame contains. A test value never seen in training becomes NaN
# under this dtype and is encoded as all zeros (treated like the reference level).
creat_type_dtype = pd.CategoricalDtype(categories=sorted(train_encoded['creat_type_cd'].unique()))
creat_type_dummies_train = pd.get_dummies(train_encoded['creat_type_cd'].astype(creat_type_dtype), prefix='creat_type', drop_first=True)
creat_type_dummies_test = pd.get_dummies(test_encoded['creat_type_cd'].astype(creat_type_dtype), prefix='creat_type', drop_first=True)
train_encoded = train_encoded.drop(columns=['creat_type_cd'])
test_encoded = test_encoded.drop(columns=['creat_type_cd'])
train_encoded = pd.concat([train_encoded, creat_type_dummies_train], axis=1)
test_encoded = pd.concat([test_encoded, creat_type_dummies_test], axis=1)
print(f"creat_type_cd one-hot encoded - New columns: {creat_type_dummies_train.columns.tolist()}")


creat_type_cd one-hot encoded - New columns: ['creat_type_3', 'creat_type_4', 'creat_type_5', 'creat_type_6', 'creat_type_7', 'creat_type_8', 'creat_type_9', 'creat_type_10']


In [28]:
# One-hot encode inter_type_cd (4 unique values)
#
# The category set is fixed from the training data so train and test always
# get the same dummy columns and the same dropped reference level; running
# get_dummies on each frame independently would derive them from whatever
# values each frame contains. A test value never seen in training becomes NaN
# under this dtype and is encoded as all zeros (treated like the reference level).
inter_type_dtype = pd.CategoricalDtype(categories=sorted(train_encoded['inter_type_cd'].unique()))
inter_type_dummies_train = pd.get_dummies(train_encoded['inter_type_cd'].astype(inter_type_dtype), prefix='inter_type', drop_first=True)
inter_type_dummies_test = pd.get_dummies(test_encoded['inter_type_cd'].astype(inter_type_dtype), prefix='inter_type', drop_first=True)
train_encoded = train_encoded.drop(columns=['inter_type_cd'])
test_encoded = test_encoded.drop(columns=['inter_type_cd'])
train_encoded = pd.concat([train_encoded, inter_type_dummies_train], axis=1)
test_encoded = pd.concat([test_encoded, inter_type_dummies_test], axis=1)
print(f"inter_type_cd one-hot encoded - New columns: {inter_type_dummies_train.columns.tolist()}")


inter_type_cd one-hot encoded - New columns: ['inter_type_3', 'inter_type_4', 'inter_type_5']


In [29]:
# One-hot encode series_group (7 unique values)
#
# The category set is fixed from the training data so train and test always
# get the same dummy columns and the same dropped reference level; running
# get_dummies on each frame independently would derive them from whatever
# values each frame contains. A test value never seen in training becomes NaN
# under this dtype and is encoded as all zeros (treated like the reference level).
series_group_dtype = pd.CategoricalDtype(categories=sorted(train_encoded['series_group'].unique()))
series_group_dummies_train = pd.get_dummies(train_encoded['series_group'].astype(series_group_dtype), prefix='series_group', drop_first=True)
series_group_dummies_test = pd.get_dummies(test_encoded['series_group'].astype(series_group_dtype), prefix='series_group', drop_first=True)
train_encoded = train_encoded.drop(columns=['series_group'])
test_encoded = test_encoded.drop(columns=['series_group'])
train_encoded = pd.concat([train_encoded, series_group_dummies_train], axis=1)
test_encoded = pd.concat([test_encoded, series_group_dummies_test], axis=1)
print(f"series_group one-hot encoded - New columns: {series_group_dummies_train.columns.tolist()}")


series_group one-hot encoded - New columns: ['series_group_3', 'series_group_4', 'series_group_5', 'series_group_6', 'series_group_7', 'series_group_8']


In [30]:
# Target encoding for device_size (1,547 unique values)
#
# With this many levels, encoding a training row with a mean that includes the
# row's own label leaks the target into the feature. Training rows therefore
# get an out-of-fold encoding: each row is encoded with means computed from
# the other folds only. Test rows use the mapping fitted on all training rows.
TE_FOLDS = 5
device_size_prior = train_merged['label'].mean()   # fallback for unseen device sizes
device_size_keys = train_merged['device_size']
device_size_labels = train_merged['label']
device_size_fold = np.arange(len(train_merged)) % TE_FOLDS  # deterministic fold assignment

device_size_oof = np.full(len(train_merged), device_size_prior, dtype='float64')
for fold in range(TE_FOLDS):
    held_out = device_size_fold == fold
    fold_means = device_size_labels[~held_out].groupby(device_size_keys[~held_out]).mean()
    device_size_oof[held_out] = (
        device_size_keys[held_out].map(fold_means).fillna(device_size_prior).to_numpy()
    )

# train_encoded is derived from train_merged by column operations only, so the
# rows line up positionally; the assert guards that assumption.
assert len(train_encoded) == len(train_merged)
train_encoded['device_size_encoded'] = device_size_oof

device_size_encoding = train_merged.groupby('device_size')['label'].mean()
test_encoded['device_size_encoded'] = test_encoded['device_size'].map(device_size_encoding).fillna(device_size_prior)

train_encoded = train_encoded.drop(columns=['device_size'])
test_encoded = test_encoded.drop(columns=['device_size'])
print(f"device_size target encoding complete - Train shape: {train_encoded.shape}, Test shape: {test_encoded.shape}")


device_size target encoding complete - Train shape: (7675517, 56), Test shape: (976058, 56)


### Target Encoding Remaining Medium-Cardinality Features

Encoding residence, series_dev, emui_dev, hispace_app_tags, app_second_class, and spread_app_id.


In [None]:
# Target encoding for residence (35 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
residence_prior = train_merged['label'].mean()
residence_encoding = train_merged.groupby('residence')['label'].mean()
train_encoded['residence_encoded'] = train_encoded['residence'].map(residence_encoding).fillna(residence_prior)
test_encoded['residence_encoded'] = test_encoded['residence'].map(residence_encoding).fillna(residence_prior)
train_encoded = train_encoded.drop(columns=['residence'])
test_encoded = test_encoded.drop(columns=['residence'])
# Completion message restored: the recorded output of this cell shows it, but
# the statement producing it had been removed from the code.
print("residence encoding complete")


residence encoding complete


In [32]:
# Target encoding for series_dev (27 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
series_dev_encoding = train_merged.groupby('series_dev')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['series_dev_encoded'] = train_encoded.pop('series_dev').map(series_dev_encoding).fillna(global_ctr)
test_encoded['series_dev_encoded'] = test_encoded.pop('series_dev').map(series_dev_encoding).fillna(global_ctr)


In [33]:
# Target encoding for emui_dev (27 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
emui_dev_encoding = train_merged.groupby('emui_dev')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['emui_dev_encoded'] = train_encoded.pop('emui_dev').map(emui_dev_encoding).fillna(global_ctr)
test_encoded['emui_dev_encoded'] = test_encoded.pop('emui_dev').map(emui_dev_encoding).fillna(global_ctr)


In [34]:
# Target encoding for hispace_app_tags (43 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
hispace_encoding = train_merged.groupby('hispace_app_tags')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['hispace_app_tags_encoded'] = train_encoded.pop('hispace_app_tags').map(hispace_encoding).fillna(global_ctr)
test_encoded['hispace_app_tags_encoded'] = test_encoded.pop('hispace_app_tags').map(hispace_encoding).fillna(global_ctr)


In [35]:
# Target encoding for app_second_class (20 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
app_second_encoding = train_merged.groupby('app_second_class')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['app_second_class_encoded'] = train_encoded.pop('app_second_class').map(app_second_encoding).fillna(global_ctr)
test_encoded['app_second_class_encoded'] = test_encoded.pop('app_second_class').map(app_second_encoding).fillna(global_ctr)


In [36]:
# Target encoding for spread_app_id (116 unique values)
# Each value is replaced by its mean training label; values without a
# training mean fall back to the overall training mean.
global_ctr = train_merged['label'].mean()
spread_app_encoding = train_merged.groupby('spread_app_id')['label'].mean()

# pop() removes the raw column and returns it for mapping in one step.
train_encoded['spread_app_id_encoded'] = train_encoded.pop('spread_app_id').map(spread_app_encoding).fillna(global_ctr)
test_encoded['spread_app_id_encoded'] = test_encoded.pop('spread_app_id').map(spread_app_encoding).fillna(global_ctr)
print("spread_app_id encoding complete")
print(f"\nFinal train shape: {train_encoded.shape}, Test shape: {test_encoded.shape}")


spread_app_id encoding complete

Final train shape: (7675517, 56), Test shape: (976058, 56)
