In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss

загружаем датасет

In [2]:
# Colab-only step: attach Google Drive so the CSV under MyDrive can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# First, raw load of the dataset with pandas' default dtypes.
df = pd.read_csv(filepath_or_buffer='drive/MyDrive/data.csv')

проверяем, что всё хорошо

In [None]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

удаляем лишние колонки

In [7]:
# Columns that are not used anywhere in the rest of this analysis.
unused_columns = [
    'oaid_hash',
    'banner_id0', 'banner_id1',
    'rate0', 'rate1',
    'g0', 'g1',
    'coeff_sum0', 'coeff_sum1',
]
df.drop(columns=unused_columns, inplace=True)

In [8]:
# Confirm which columns remain after the drop.
df.head(n=5)

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,0,0,0,1,1
1,2021-09-26 22:54:49.000000,1,1,0,0,1,1,1
2,2021-09-26 23:57:20.000000,2,2,3,0,0,1,1
3,2021-09-27 00:04:30.000000,3,3,0,1,1,1,1
4,2021-09-27 00:06:21.000000,4,4,0,1,0,1,1


проверяем уникальные значения impressions и clicks

In [18]:
# Distinct values present in the impressions column.
np.unique(df['impressions'])

array([1])

In [19]:
# Distinct values present in the clicks (target) column.
np.unique(df['clicks'])

array([0, 1])

проверяем на пропуски

In [21]:
# Per-column count of missing values (isnull is the pandas alias of isna).
df.isnull().sum()

date_time          0
zone_id            0
banner_id          0
campaign_clicks    0
os_id              0
country_id         0
impressions        0
clicks             0
dtype: int64

In [None]:
# удаляем impressions, потому что там одни единицы; следовательно, этот столбец никак не повлияет на качество модели

In [22]:
# impressions holds a single constant value (see the np.unique check), so drop it.
df.drop(columns=['impressions'], inplace=True)

In [23]:
# Frame after removing the constant impressions column.
df.head(n=5)

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
0,2021-09-27 00:01:30.000000,0,0,0,0,0,1
1,2021-09-26 22:54:49.000000,1,1,0,0,1,1
2,2021-09-26 23:57:20.000000,2,2,3,0,0,1
3,2021-09-27 00:04:30.000000,3,3,0,1,1,1
4,2021-09-27 00:06:21.000000,4,4,0,1,0,1


загрузим данные в нормальном виде, чтобы дата была датой, а клики числом

In [4]:
# Reload with explicit typing: date_time parsed as datetime, clicks as int.
df1 = pd.read_csv(
    'drive/MyDrive/data.csv',
    dtype={'clicks': int},
    parse_dates=['date_time'],
)

In [5]:
# Same auxiliary columns as before, plus the constant impressions column.
columns_to_remove = [
    'oaid_hash',
    'banner_id0', 'banner_id1',
    'rate0', 'rate1',
    'g0', 'g1',
    'coeff_sum0', 'coeff_sum1',
    'impressions',
]
df1.drop(columns=columns_to_remove, inplace=True)

In [6]:
# First five rows of the typed, trimmed frame.
df1.head(n=5)

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1
1,2021-09-26 22:54:49,1,1,0,0,1,1
2,2021-09-26 23:57:20,2,2,3,0,0,1
3,2021-09-27 00:04:30,3,3,0,1,1,1
4,2021-09-27 00:06:21,4,4,0,1,0,1


посмотрим на уникальные дни

In [7]:
# Use the timestamp as the index and order rows chronologically,
# then list the distinct calendar days present in the data.
df1 = df1.set_index('date_time')
df1 = df1.sort_index()
np.unique(df1.index.date)

array([datetime.date(2021, 9, 1), datetime.date(2021, 9, 26),
       datetime.date(2021, 9, 27), datetime.date(2021, 9, 28),
       datetime.date(2021, 9, 29), datetime.date(2021, 9, 30),
       datetime.date(2021, 10, 1), datetime.date(2021, 10, 2)],
      dtype=object)

день 2021-09-01 слишком далеко от остальных; возможно, эти данные уже неактуальны, и не стоит их учитывать

In [8]:
# Preview of the rows dated 2021-09-02 or later.
# NOTE: the result is only displayed, not assigned; df1 itself is unchanged.
df1.loc[slice('2021-09-02', None)].copy()

Unnamed: 0_level_0,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-26 00:00:00,41,29,1,3,0,0
2021-09-26 00:00:00,1,188,2,2,15,0
2021-09-26 00:00:00,17,52,2,2,5,0
2021-09-26 00:00:00,47,73,1,4,13,0
2021-09-26 00:00:00,48,266,1,0,1,0
...,...,...,...,...,...,...
2021-10-02 23:59:59,24,180,0,2,6,0
2021-10-02 23:59:59,73,92,0,1,0,0
2021-10-02 23:59:59,17,1235,0,4,0,0
2021-10-02 23:59:59,1,2,0,0,0,0


как видим, после 2021-09-01 идёт сразу 2021-09-26

In [9]:
# First five rows of the date-indexed, sorted frame.
df1.head(n=5)

Unnamed: 0_level_0,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-01 00:02:49,30,596,0,0,7,0
2021-09-26 00:00:00,41,29,1,3,0,0
2021-09-26 00:00:00,1,188,2,2,15,0
2021-09-26 00:00:00,17,52,2,2,5,0
2021-09-26 00:00:00,47,73,1,4,13,0


разделяем на X и y и нормализуем campaign_clicks




In [10]:
# Separate the target (clicks) from the feature matrix.
y = df1['clicks']
X = df1.drop(['clicks'], axis=1)

# Scale campaign_clicks as a feature, i.e. column-wise.
# sklearn's `normalize` defaults to axis=1, which rescales every *row* to unit
# norm; on a single-column matrix that turns every non-zero value into 1.0 and
# discards the magnitude. axis=0 normalises the column as a whole instead.
to_normalize = X[["campaign_clicks"]].values.astype(float)
normalized_X = normalize(to_normalize, axis=0)
# NOTE(review): normalized_X is not written back into X in this cell, and the
# cells visible in this notebook train on X as-is; confirm whether the scaled
# column was meant to replace X['campaign_clicks'].



In [11]:
# First five target values.
y.head(n=5)

date_time
2021-09-01 00:02:49    0
2021-09-26 00:00:00    0
2021-09-26 00:00:00    0
2021-09-26 00:00:00    0
2021-09-26 00:00:00    0
Name: clicks, dtype: int64

In [12]:
# First five feature rows.
X.head(n=5)

Unnamed: 0_level_0,zone_id,banner_id,campaign_clicks,os_id,country_id
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-09-01 00:02:49,30,596,0,0,7
2021-09-26 00:00:00,41,29,1,3,0
2021-09-26 00:00:00,1,188,2,2,15
2021-09-26 00:00:00,17,52,2,2,5
2021-09-26 00:00:00,47,73,1,4,13


нам не подходит стандартный train_test_split, так как мы проверяем только последний день. Если открыть csv, то можно найти нужную нам строку для разделения данных

In [17]:
# Positional slice: rows 1 .. 13692493 of the sorted frame.
df1.iloc[1:13692494]

Unnamed: 0_level_0,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-26 00:00:00,41,29,1,3,0,0
2021-09-26 00:00:00,1,188,2,2,15,0
2021-09-26 00:00:00,17,52,2,2,5,0
2021-09-26 00:00:00,47,73,1,4,13,0
2021-09-26 00:00:00,48,266,1,0,1,0
...,...,...,...,...,...,...
2021-10-01 23:59:59,254,584,0,2,10,0
2021-10-01 23:59:59,34,47,1,2,5,0
2021-10-01 23:59:59,3,1239,0,2,0,0
2021-10-01 23:59:59,139,49,0,0,0,0


In [14]:
# Position of the first row that belongs to the last calendar day in the data:
# rows before it are used for training, rows from it onward for testing.
# df1 is indexed by date_time and sorted, so a binary search on the index
# replaces the row number that was previously looked up by hand (13692494).
last_day_start = df1.index.max().normalize()
split = int(df1.index.searchsorted(last_day_start, side='left'))

выкидываем первую строку, т.к. она относится к 2021-09-01

In [15]:
# Row 0 is the lone 2021-09-01 record, so training starts at position 1;
# everything from `split` onward is held out for testing.
train_rows = slice(1, split)
test_rows = slice(split, None)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

обучаем модель логистической регрессии

In [16]:
# Cross-validated logistic regression, selecting the regularisation strength
# by negative log loss.
# NOTE(review): zone_id / banner_id / os_id / country_id look like categorical
# identifiers but are passed to the linear model as plain numbers; the
# OneHotEncoder / ColumnTransformer imports are unused — confirm whether
# one-hot encoding was intended.
cv_settings = {'scoring': 'neg_log_loss', 'solver': 'liblinear'}
model = LogisticRegressionCV(**cv_settings)
model.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True,
                     scoring='neg_log_loss', solver='liblinear', tol=0.0001,
                     verbose=0)

делаем предсказание

In [18]:
# Class probabilities for the held-out rows, scored with log loss.
prediction = model.predict_proba(X_test)
logloss = log_loss(y_true=y_test, y_pred=prediction)

In [19]:
# Display the held-out log loss computed in the previous cell.
logloss

0.1485542749253761

сравниваем с предсказанием по среднему

In [20]:
# Baseline: predict the training-set click rate for every held-out row.
base_rate = y_train.mean()
baseline_proba = [(1 - base_rate, base_rate)] * len(y_test)
log_loss(y_test, baseline_proba)

0.15486197934508383

logloss модели вышел лучше, чем у предсказания по среднему (0.1486 против 0.1549)