In [1]:
%%capture
!gdown 1ABZCRp4uZxxMe3DcLGVqqg9HR8WK27q9
!unzip data.zip
!pip install -U deepctr-torch torchmetrics

In [None]:
import pandas as pd
import torch
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

# Load the raw training table from the unpacked archive.
train = pd.read_csv("/content/Ad_click_prediction_train(1).csv")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [76]:
# Inspect, for every column, the number of distinct values (row 0, NaN counts
# as a value) next to the total number of rows (row 1).
dict_ = {
    column: [len(train[column].unique()), train[column].shape[0]]
    for column in train.columns
}
df = pd.DataFrame(dict_)
df

In [None]:
# Per-column missing-value summary: count of missing cells, count of present
# cells, and the fraction (0..1) of rows that are missing.
# Columns are named explicitly; concatenating unnamed Series produced the
# duplicate labels 0, 1, 0, which made column selection ambiguous.
df_na = pd.DataFrame(
    {
        "n_missing": train.isna().sum(),
        "n_present": train.count(),
        "missing_fraction": train.isna().mean(),
    }
)
# Display the summary; previously it was computed but never shown.
df_na

In [237]:
import numpy as np

# Drop the columns that only add noise.
train = train.drop(columns=["product_category_2", "session_id"])

# Reduce the timestamp to the hour of day.
train["DateTime"] = pd.to_datetime(train["DateTime"]).dt.hour

# Fill missing city_development_index values with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and
# does not modify the frame once Copy-on-Write is active.
train["city_development_index"] = train["city_development_index"].fillna(
    train["city_development_index"].mean()
)

# Drop the rows that still contain missing values.
train = train.dropna()

# Balance the classes by undersampling non-clicks to the number of clicks.
# NOTE(review): assumes there are at least as many is_click == 0 rows as
# is_click == 1 rows; otherwise sample() raises.
train_1 = train[train.is_click == 1]
# Fixed seed so the same rows are drawn on every run.
train_0 = train[train.is_click == 0].sample(n=train_1.shape[0], random_state=42)
train = pd.concat([train_1, train_0], axis=0)

In [242]:
# Split the columns into categorical (sparse) and numeric (dense) features.
target = ["is_click"]

# Every non-target column is currently treated as categorical.
sparse_features = list(train.drop(columns=target).columns)
dense_features = []

# Label-encode each categorical feature into consecutive integer ids.
for feature in sparse_features:
  lbe = LabelEncoder()
  train[feature] = lbe.fit_transform(train[feature])

# Scale numeric features into [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
# Skip scaling when there are no dense features: fitting the scaler on a
# frame with zero columns raises ValueError.
if dense_features:
  train[dense_features] = mms.fit_transform(train[dense_features])

In [244]:
# Describe every input column for deepctr-torch: each sparse feature is
# declared with its number of distinct values, each dense feature with
# dimension 1.
sparse_columns = [SparseFeat(name, train[name].nunique()) for name in sparse_features]
dense_columns = [DenseFeat(name, 1) for name in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear and the deep parts of the models share the same feature set.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [246]:
# Hold out 5% of the rows for the final evaluation.
# A fixed random_state keeps the split, and therefore the reported test
# metrics, identical between runs.
train_ds, test_ds = train_test_split(train, test_size=0.05, random_state=42)

# The models take their inputs as a mapping of feature name -> column.
train_model_input = {name: train_ds[name] for name in feature_names}
test_model_input = {name: test_ds[name] for name in feature_names}

In [272]:
# Train a neural network with a factorization-machine component (DeepFM).
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1,
    device=device,
)

# Binary cross-entropy loss optimised with AdamW; accuracy is tracked.
model.compile(
    torch.optim.AdamW(model.parameters(), lr=3e-5),
    "binary_crossentropy",
    metrics=["acc"],
)

# The target column is flattened to a 1-D array of labels; 8% of the
# training rows are held out for validation.
model.fit(
    x=train_model_input,
    y=train_ds[target].values.ravel(),
    batch_size=16,
    epochs=8,
    shuffle=True,
    validation_split=0.08,
    verbose=2,
)

In [299]:
# Train the convolutional click-prediction model (CCPM).
# The class is referenced directly as CCPM rather than through the `ccpm`
# submodule, which was only reachable as a side effect of the wildcard import.
model = CCPM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=2e-4,
    device=device,
    dnn_dropout=0.1,
)

# Binary cross-entropy loss optimised with Adagrad; accuracy is tracked.
model.compile(
    torch.optim.Adagrad(model.parameters(), lr=3e-3),
    "binary_crossentropy",
    metrics=["acc"],
)

# Labels are passed as a flat 1-D array, matching the DeepFM training cell;
# 8% of the training rows are held out for validation.
model.fit(
    x=train_model_input,
    y=train_ds[target].values.reshape(-1),
    shuffle=True,
    batch_size=128,
    epochs=15,
    verbose=2,
    validation_split=0.08,
)

In [280]:
# Compute predictions on the held-out test set and evaluate them
from torchmetrics.classification import BinaryF1Score
from torchmetrics import Precision, Recall
from sklearn.metrics import accuracy_score

# Predicted click probabilities for the held-out rows (batch size 64).
pred_ans = model.predict(test_model_input, 64)

# Flatten predictions and ground truth once, then turn the probabilities into
# hard 0/1 labels. Every metric below is computed on the same labels.
y_prob = torch.as_tensor(pred_ans).reshape(-1).cpu()
y_true = torch.as_tensor(test_ds[target].values).reshape(-1).cpu()
y_pred = (y_prob > 0.5).long()

f1 = BinaryF1Score()
# Binary task: the multiclass variant rejects 1-D floating-point predictions.
precision = Precision(task="binary")
recall = Recall(task="binary")

print("Precision: ", precision(y_pred, y_true).item())
print("Recall:", recall(y_pred, y_true).item())
print("test F1:", f1(y_pred, y_true).item())
# accuracy_score needs class labels; raw probabilities raise ValueError.
print("test ACC:", accuracy_score(y_true.numpy(), y_pred.numpy()))