In [1]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [2]:
from scipy.sparse import csr_matrix
import polars as pl
import implicit
# from lightfm import LightFM

In [3]:
# Lazily scan the training and test parquet files.
train = pl.scan_parquet("/kaggle/input/avitotech-data/user_ads_clicks_meta_collect.parquet")
test = pl.scan_parquet("/kaggle/input/avitotech-data/test.parquet")

In [4]:
# Show the column names and dtypes of the lazily scanned training frame.
train.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('target', Int32),
        ('start_date', Date),
        ('end_date', Date),
        ('goal_cost', Float64),
        ('goal_budget', Float64),
        ('location_id', Int64),
        ('logcat_id', Int64)])

In [5]:
# Materialize the lazy training scan.
# NOTE(review): `train` is rebound from the lazy scan to its collected result,
# so re-running this cell presumably requires re-running the scan cell first — confirm.
train = train.collect()

In [6]:
# Derived campaign features:
#   campaign_duration - end_date minus start_date
#   avg_cost          - goal_budget divided by goal_cost
train_w_features = train.with_columns(
    campaign_duration=pl.col("end_date") - pl.col("start_date"),
    avg_cost=pl.col("goal_budget") / pl.col("goal_cost"),
)

In [7]:
# Check that the two derived columns were added and inspect their dtypes.
train_w_features.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('target', Int32),
        ('start_date', Date),
        ('end_date', Date),
        ('goal_cost', Float64),
        ('goal_budget', Float64),
        ('location_id', Int64),
        ('logcat_id', Int64),
        ('campaign_duration', Duration(time_unit='ms')),
        ('avg_cost', Float64)])

In [8]:
# Preview the first rows, including the derived columns.
train_w_features.head()

user_id,adv_campaign_id,target,start_date,end_date,goal_cost,goal_budget,location_id,logcat_id,campaign_duration,avg_cost
i64,i64,i32,date,date,f64,f64,i64,i64,duration[ms],f64
2197959,2184,0,2024-09-03,2024-09-16,2.447497,7424.750349,46,40,13d,3033.609918
166290,288,0,2024-09-06,2024-09-30,8.1499,837.688758,61,21,24d,102.785161
900510,4041,0,2024-09-18,2024-09-24,2.668961,7032.445149,58,40,6d,2634.899454
2844712,181,0,2024-08-22,2024-09-19,6.566288,28610.462228,63,18,28d,4357.174315
483345,3939,0,2024-09-11,2024-09-30,3.952216,1074.103988,42,50,19d,271.772573


In [9]:
# Season encoding
def determine_season(month):
    """Map a month number (expression) to a season code.

    Months 3-5 map to 1, 6-8 to 2, 9-11 to 3, and every other month to 0.
    """
    in_months_3_to_5 = (month >= 3) & (month <= 5)
    in_months_6_to_8 = (month >= 6) & (month <= 8)
    in_months_9_to_11 = (month >= 9) & (month <= 11)

    season = pl.when(in_months_3_to_5).then(1)
    season = season.when(in_months_6_to_8).then(2)
    season = season.when(in_months_9_to_11).then(3)
    return season.otherwise(0)

# Add the season code of the campaign's start month and end month.
train_w_features = train_w_features.with_columns(
    start_season=determine_season(pl.col("start_date").dt.month()),
    end_season=determine_season(pl.col("end_date").dt.month()),
)

# train_w_features

In [10]:
# Distribution of start-season codes across the training rows.
train_w_features['start_season'].value_counts()

start_season,count
i32,u32
3,65079941
2,7453740
1,5


In [11]:
# Distribution of end-season codes across the training rows.
train_w_features['end_season'].value_counts()

end_season,count
i32,u32
2,4763
1,4
3,72528919


In [12]:
# Schema after adding the season columns.
train_w_features.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('target', Int32),
        ('start_date', Date),
        ('end_date', Date),
        ('goal_cost', Float64),
        ('goal_budget', Float64),
        ('location_id', Int64),
        ('logcat_id', Int64),
        ('campaign_duration', Duration(time_unit='ms')),
        ('avg_cost', Float64),
        ('start_season', Int32),
        ('end_season', Int32)])

In [13]:
# Persist the feature-augmented training frame to the working directory.
train_w_features.write_parquet("train_w_features.parquet")

In [14]:
# Load the test set into memory (scan followed immediately by collect).
# NOTE(review): this reads the same file that the lazy `test` scan points at.
test_data = pl.scan_parquet("/kaggle/input/avitotech-data/test.parquet").collect()

In [15]:
# Display the collected test frame.
test_data

user_id,adv_campaign_id,platform_id,adv_creative_id,event_date,banner_code,is_main
i64,i64,i64,i64,date,i64,bool
2714742,3026,2,4056,2024-09-23,8,true
2714742,2994,2,3954,2024-09-23,8,true
2714742,97,2,1752,2024-09-23,8,true
2714742,3539,2,1244,2024-09-23,8,true
2714742,2756,2,2003,2024-09-23,8,true
…,…,…,…,…,…,…
2398626,1099,2,3030,2024-09-23,8,true
2398626,488,2,1324,2024-09-23,8,true
2398626,49,2,1514,2024-09-23,8,true
2398626,1053,2,4211,2024-09-23,5,false


In [16]:
# Labels keyed by (user_id, adv_campaign_id), taken from the training frame.
train_targets = train_w_features.select("user_id", "adv_campaign_id", "target")

# Left join keeps every test row; pairs that never occur in train get a null target.
test_with_target = test_data.join(
    train_targets,
    how="left",
    on=["user_id", "adv_campaign_id"],
)

In [17]:
# Count how many test rows received a 0, a 1, or no (null) target from the join.
test_with_target['target'].value_counts()

target,count
i32,u32
,1016440
0.0,944092
1.0,22755


In [18]:
# Display the test frame with the joined target column.
test_with_target

user_id,adv_campaign_id,platform_id,adv_creative_id,event_date,banner_code,is_main,target
i64,i64,i64,i64,date,i64,bool,i32
2714742,3026,2,4056,2024-09-23,8,true,
2714742,2994,2,3954,2024-09-23,8,true,
2714742,97,2,1752,2024-09-23,8,true,
2714742,3539,2,1244,2024-09-23,8,true,
2714742,2756,2,2003,2024-09-23,8,true,
…,…,…,…,…,…,…,…
2398626,1099,2,3030,2024-09-23,8,true,
2398626,488,2,1324,2024-09-23,8,true,0
2398626,49,2,1514,2024-09-23,8,true,
2398626,1053,2,4211,2024-09-23,5,false,


In [19]:
# Keep only the test rows whose target is still null after the join with train.
test_unknown = test_with_target.filter(pl.col("target").is_null())

In [20]:
# Shapes of the full test frame and of the subset without a target, in that order.
for frame in (test_with_target, test_unknown):
    print(frame.shape)

(1983287, 8)
(1016440, 8)


In [21]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when running on CUDA.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() replaces the deprecated memory_cached(); the old call
    # is what produced the warning line in this cell's recorded output.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


  print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')


In [22]:
# Preview the training frame again, now including the season columns.
train_w_features.head()

user_id,adv_campaign_id,target,start_date,end_date,goal_cost,goal_budget,location_id,logcat_id,campaign_duration,avg_cost,start_season,end_season
i64,i64,i32,date,date,f64,f64,i64,i64,duration[ms],f64,i32,i32
2197959,2184,0,2024-09-03,2024-09-16,2.447497,7424.750349,46,40,13d,3033.609918,3,3
166290,288,0,2024-09-06,2024-09-30,8.1499,837.688758,61,21,24d,102.785161,3,3
900510,4041,0,2024-09-18,2024-09-24,2.668961,7032.445149,58,40,6d,2634.899454,3,3
2844712,181,0,2024-08-22,2024-09-19,6.566288,28610.462228,63,18,28d,4357.174315,2,3
483345,3939,0,2024-09-11,2024-09-30,3.952216,1074.103988,42,50,19d,271.772573,3,3


In [23]:
# Re-check the training schema before selecting model inputs.
train_w_features.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('target', Int32),
        ('start_date', Date),
        ('end_date', Date),
        ('goal_cost', Float64),
        ('goal_budget', Float64),
        ('location_id', Int64),
        ('logcat_id', Int64),
        ('campaign_duration', Duration(time_unit='ms')),
        ('avg_cost', Float64),
        ('start_season', Int32),
        ('end_season', Int32)])

In [24]:
import cuml
from cuml.ensemble import RandomForestClassifier
import cupy as cp  # Для работы с данными на GPU
import cudf  # Аналог pandas на GPU

In [25]:
# # Convert Polars to Pandas, then to cuDF
# df_pandas = train_w_features.collect().to_pandas()
# df_cudf = cudf.DataFrame.from_pandas(df_pandas)

# # Split features and target
# X = df_cudf.drop("target", axis=1)
# y = df_cudf["target"]

# Feature frame: every column except the identifiers, the label and the raw date columns.
X = train_w_features.drop(['user_id', 'adv_campaign_id', 'target', 'start_date', 'end_date'])
# Label column.
y = train_w_features['target']

print(X.shape)
print(y.shape)

(72533686, 8)
(72533686,)


In [26]:
%%time

# Hold out 20% of the rows, fit an XGBoost classifier on the remainder and
# predict labels for the hold-out part. accuracy_score and
# classification_report are imported here and used in the next cell.
import xgboost as xgb
from sklearn.datasets import load_iris  # NOTE(review): not referenced in any visible cell
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on the target — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier with default hyperparameters, configured for the CUDA device.
model = xgb.XGBClassifier(device="cuda")

# Train the model on the training part.
model.fit(X_train, y_train)

# Predict class labels for the hold-out part.
predictions = model.predict(X_test)

CPU times: user 1min 35s, sys: 21.8 s, total: 1min 57s
Wall time: 1min 13s


In [27]:
# Hold-out accuracy, followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

print(
    "\nClassification Report:",
    classification_report(y_test, predictions),
    sep="\n",
)

Accuracy: 0.9916685611886008

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00  14385876
           1       0.00      0.00      0.00    120862

    accuracy                           0.99  14506738
   macro avg       0.50      0.50      0.50  14506738
weighted avg       0.98      0.99      0.99  14506738



  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Shape of the test subset that still needs predictions.
print(test_unknown.shape)

(1016440, 8)


In [29]:
# Load the campaign metadata table (the scan is collected immediately).
campaigns_meta = pl.scan_parquet("/kaggle/input/avitotech-data/campaigns_meta.parquet").collect()

# Attach the campaign metadata columns to the rows without a target, keyed by adv_campaign_id.
test_unknown = test_unknown.join(campaigns_meta, on="adv_campaign_id", how="left")

print(test_unknown.shape)

(1016440, 14)


In [30]:
# Same derived features as for the training frame:
#   campaign_duration - end_date minus start_date
#   avg_cost          - goal_budget divided by goal_cost
test_unknown = test_unknown.with_columns(
    campaign_duration=pl.col("end_date") - pl.col("start_date"),
    avg_cost=pl.col("goal_budget") / pl.col("goal_cost"),
)

In [31]:
# Season encoding
def determine_season(month):
    """Map a month number (expression) to a season code.

    Months 3-5 map to 1, 6-8 to 2, 9-11 to 3, and every other month to 0.
    """
    in_months_3_to_5 = (month >= 3) & (month <= 5)
    in_months_6_to_8 = (month >= 6) & (month <= 8)
    in_months_9_to_11 = (month >= 9) & (month <= 11)

    season = pl.when(in_months_3_to_5).then(1)
    season = season.when(in_months_6_to_8).then(2)
    season = season.when(in_months_9_to_11).then(3)
    return season.otherwise(0)

# Add the season code of the campaign's start month and end month.
test_unknown = test_unknown.with_columns(
    start_season=determine_season(pl.col("start_date").dt.month()),
    end_season=determine_season(pl.col("end_date").dt.month()),
)

In [32]:
# Keep the identifying columns aside so predictions can be attached to them later.
ids_test_unknown = test_unknown.select(
    ['user_id', 'adv_campaign_id', 'platform_id', 'adv_creative_id']
)

# Model input: drop the user/campaign ids, the (all-null) target, the raw dates
# and the event_date / banner_code / is_main columns.
X_test_unknown = test_unknown.drop(
    ['user_id', 'adv_campaign_id', 'target', 'start_date', 'end_date',
     'event_date', 'banner_code', 'is_main']
)
X_test_unknown.shape

(1016440, 10)

In [33]:
# Remove the two remaining id columns, which are not among the columns in the training feature frame X.
X_test_unknown = X_test_unknown.drop(['platform_id', 'adv_creative_id'])
X_test_unknown.head()

goal_cost,goal_budget,location_id,logcat_id,campaign_duration,avg_cost,start_season,end_season
f64,f64,i64,i64,duration[ms],f64,i32,i32
4.754986,5080.923372,1,65,6d,1068.546422,3,3
7.819801,55768.516996,30,65,14d,7131.705019,3,3
5.138341,10024.341068,1,65,6d,1950.890659,3,3
5.058388,9925.29232,1,65,6d,1962.145426,3,3
4.827791,19293.322118,30,65,9d,3996.304353,3,3


In [34]:
# Predict labels for the test rows without a known target.
# NOTE(review): the recorded hold-out report shows 0.00 recall for class 1 —
# confirm that hard labels from predict() are what the submission needs.
preds_X_test_unknown = model.predict(X_test_unknown)

In [35]:
# Attach the predictions to the saved identifier columns as a "target" column.
ids_test_unknown = ids_test_unknown.with_columns(
    pl.Series(name="target", values=preds_X_test_unknown)
)

In [36]:
# Display the identifiers together with their predicted target.
ids_test_unknown

user_id,adv_campaign_id,platform_id,adv_creative_id,target
i64,i64,i64,i64,i64
2714742,3026,2,4056,0
2714742,2994,2,3954,0
2714742,97,2,1752,0
2714742,3539,2,1244,0
2714742,2756,2,2003,0
…,…,…,…,…
2398626,1714,2,3734,0
2398626,1099,2,3030,0
2398626,49,2,1514,0
2398626,1053,2,4211,0


In [37]:
# NOTE(review): same display as the previous cell — presumably a leftover duplicate.
ids_test_unknown

user_id,adv_campaign_id,platform_id,adv_creative_id,target
i64,i64,i64,i64,i64
2714742,3026,2,4056,0
2714742,2994,2,3954,0
2714742,97,2,1752,0
2714742,3539,2,1244,0
2714742,2756,2,2003,0
…,…,…,…,…
2398626,1714,2,3734,0
2398626,1099,2,3030,0
2398626,49,2,1514,0
2398626,1053,2,4211,0


In [38]:
# Inspect the dtypes of the joined test frame (its target column is Int32).
test_with_target.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('platform_id', Int64),
        ('adv_creative_id', Int64),
        ('event_date', Date),
        ('banner_code', Int64),
        ('is_main', Boolean),
        ('target', Int32)])

In [39]:
# Cast target to Int64, the dtype of the target column in ids_test_unknown.
test_with_target = test_with_target.with_columns(pl.col("target").cast(pl.Int64))


In [40]:
# Inspect the dtypes of the prediction frame (its target column is Int64).
ids_test_unknown.collect_schema()

Schema([('user_id', Int64),
        ('adv_campaign_id', Int64),
        ('platform_id', Int64),
        ('adv_creative_id', Int64),
        ('target', Int64)])

In [41]:
# test_unknown = test_with_target.filter(pl.col("target").is_null())
test_with_target_ans1 = test_with_target.select("user_id", "adv_campaign_id", "target")
test_with_target_ans2 = ids_test_unknown.select("user_id", "adv_campaign_id", "target")

# Rows whose target was found in train come first; the model's predictions
# for the rows with an unknown target are stacked underneath.
test_final = test_with_target_ans1.filter(~pl.col("target").is_null()).vstack(
    test_with_target_ans2
)

In [42]:
# Display the combined known and predicted targets.
test_final

user_id,adv_campaign_id,target
i64,i64,i64
1201115,958,0
854529,3700,0
854529,829,0
854529,4100,0
854529,3846,0
…,…,…
2398626,1714,0
2398626,1099,0
2398626,49,0
2398626,1053,0


In [43]:
# Materialize the lazy test scan and show its shape.
# NOTE(review): test_data already holds a collected copy of the same parquet
# file — presumably one of the two is redundant.
test = test.collect()
test.shape

(1983287, 7)

In [44]:
# Attach the final target to every test row, in the original test order.
# test_final holds one row per test row, so a (user_id, adv_campaign_id) pair
# that occurs k times in test would otherwise match k rows and yield k*k rows.
# Deduplicating the right side on the join keys makes the join many-to-one.
test_with_target_final = test.join(
    test_final.select(["user_id", "adv_campaign_id", "target"]).unique(
        subset=["user_id", "adv_campaign_id"],
        keep="first",
        maintain_order=True,
    ),
    on=["user_id", "adv_campaign_id"],
    how="left"  # only the target column is added
)

# The submission must contain exactly one row per test row.
assert test_with_target_final.height == test.height, "join changed the number of test rows"

In [45]:
# Write the three submission columns to a comma-separated file.
submission = test_with_target_final.select("user_id", "adv_campaign_id", "target")
submission.write_csv("submition_v1.csv", separator=",")