In [1]:
import pandas as pd
from scipy.stats import entropy
from sklearn.metrics import log_loss
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data = pd.read_csv('/content/drive/MyDrive/data.csv')
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [4]:
drop_columns = ['oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
data = data.drop(columns=drop_columns)
data = data.drop(columns=['impressions'])

In [5]:
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
0,2021-09-27 00:01:30.000000,0,0,0,0,0,1
1,2021-09-26 22:54:49.000000,1,1,0,0,1,1
2,2021-09-26 23:57:20.000000,2,2,3,0,0,1
3,2021-09-27 00:04:30.000000,3,3,0,1,1,1
4,2021-09-27 00:06:21.000000,4,4,0,1,0,1


In [6]:
data.describe()

Unnamed: 0,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
count,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0
mean,81.52679,381.6483,0.623854,1.840605,4.346986,0.02668835
std,163.2448,395.9386,9.249152,1.530005,4.317701,0.161171
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,52.0,0.0,1.0,0.0,0.0
50%,19.0,217.0,0.0,2.0,4.0,0.0
75%,60.0,611.0,0.0,3.0,7.0,0.0
max,3443.0,1632.0,829.0,10.0,16.0,1.0


# Select last day data for grading

In [7]:
def find_last_day(df, datetime_column):
    max_date = datetime.fromisoformat(df[datetime_column][0]).date()
    for datetime_iso_str in df[datetime_column]:
        max_date = max(max_date, datetime.fromisoformat(datetime_iso_str).date())
    return max_date

In [8]:
last_day = find_last_day(data, 'date_time')
data_train = data[data.apply(lambda x: datetime.fromisoformat(x['date_time']).date() < last_day, axis=1)]
data_grade = data[data.apply(lambda x: datetime.fromisoformat(x['date_time']).date() >= last_day, axis=1)]

In [9]:
data_train = data_train.drop(columns=['date_time'])
data_grade = data_grade.drop(columns=['date_time'])

In [10]:
print(data_train.shape)
print(data_grade.shape)

(13692494, 6)
(2128978, 6)


In [11]:
data_train.head()

Unnamed: 0,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
0,0,0,0,0,0,1
1,1,1,0,0,1,1
2,2,2,3,0,0,1
3,3,3,0,1,1,1
4,4,4,0,1,0,1


# Compress campaign_clicks

Уменьшим число различных значений campaign_clicks

In [12]:
def simplify_campaign_clicks(campaign_clicks=None, **kwargs):
    if campaign_clicks <= 0:
        return 0
    elif campaign_clicks <= 1:
        return 1
    elif campaign_clicks <= 2:
        return 2
    elif campaign_clicks <= 4:
        return 3
    elif campaign_clicks <= 8:
        return 4
    elif campaign_clicks <= 16:
        return 5
    else:
        return 6

In [13]:
def apply_function_to_df(df, func):
    columns_list = list(df.columns.values)
    return df.apply(lambda x: func(**{column_name: x[column_name] for column_name in columns_list}), axis=1)

In [14]:
data_train['campaign_clicks'] = apply_function_to_df(data_train, simplify_campaign_clicks)

data_grade['campaign_clicks'] = apply_function_to_df(data_grade, simplify_campaign_clicks)

# Apply OneHotEncoder

In [15]:
X = data_train.drop('clicks', axis=1)
y = data_train['clicks']

X_grade = data_grade.drop('clicks', axis=1)
y_grade = data_grade['clicks']

In [16]:
enc = OneHotEncoder(handle_unknown='ignore')

In [17]:
enc.fit(X)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [18]:
X = enc.transform(X)

X_grade = enc.transform(X_grade)

In [19]:
X

<13692494x4951 sparse matrix of type '<class 'numpy.float64'>'
	with 68462470 stored elements in Compressed Sparse Row format>

In [20]:
X_grade

<2128978x4951 sparse matrix of type '<class 'numpy.float64'>'
	with 10503761 stored elements in Compressed Sparse Row format>

# Split into test and train

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression

In [22]:
logreg = LogisticRegression(class_weight='balanced', max_iter=1000000000)
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000000000, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

### Check train

In [23]:
y_pred = logreg.predict(X_train)
acc_result = round(logreg.score(X_train, y_train) * 100, 2)
print(f'Accuracy for train {acc_result}%')

y_proba = logreg.predict_proba(X_train)
log_loss_result = log_loss(y_true=y_train, y_pred=y_proba, labels=[0, 1])
print(f'Log-loss for train {log_loss_result}')

Accuracy for train 68.41%
Log-loss for train 0.5771101285078447


### Check test

In [24]:
y_pred = logreg.predict(X_test)
acc_result = round(logreg.score(X_test, y_test) * 100, 2)
print(f'Accuracy for test {acc_result}%')

y_proba = logreg.predict_proba(X_test)
log_loss_result = log_loss(y_true=y_test, y_pred=y_proba, labels=[0, 1])
print(f'Log-loss for test {log_loss_result}')

Accuracy for test 68.38%
Log-loss for test 0.577816935680614


### Check final day

In [25]:
y_pred = logreg.predict(X_grade)
acc_result = round(logreg.score(X_grade, y_grade) * 100, 2)
print(f'Accuracy for last day {acc_result}%')

y_proba = logreg.predict_proba(X_grade)
log_loss_result = log_loss(y_true=y_grade, y_pred=y_proba, labels=[0, 1])
print(f'Log-loss for last day {log_loss_result}')

Accuracy for last day 62.06%
Log-loss for last day 0.6598058786300385
