In [1]:
!cat ../../task_1_recsys/README.md

# Домашняя работа 1 (4 балла)
## Дедлайн
Работы принимаются до **06.11 12:00**.

## Требования
Работа должна быть выполнена на языке Python (версия не менее 3.7). Код должен быть написан аккуратно и читаемо, названия переменных должны отражать свою суть. Так же следует оставлять пояснения к своим подходам к решению.

Результат работы - ноутбук с вызываемым рабочим кодом и комментариями-пояснениями. Весь код должен быть закоммичен и создан Pull Request. На сайте курса, в разделе домашнии работы, **нужно отписаться**, что Ваш PR готов к проверке и прислать ссылку на PR. 

Примерный шаблон ноутбука приложен в папке task_1_recsys.

## Отправка ДЗ
Репозиторий: https://github.com/sergeyivanovgit/Recsys-course-homework

Правила создания директорий: 
- Создайте папку в формате фамилия.первая_буква_имени (например, ivanov.s). В ней создайте папку hw1.
- В папке hw1 должен быть результирующий нотбук, названный фамилия_первая_буква_имени.ipynb (например, ivanov_s.ipynb), и необход

# Read the data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# unnecessary columns for this task

task_quote = r'oaid_hash, banner_id0, banner_id1, rate0, rate1, g0, g1, coeff_sum0, coeff_sum1'
cols_for_removal = list(map(str,task_quote.replace(',', '').split()))

In [5]:
from functools import wraps
from time import time

def logger(func):
    """\
    Decorator for timing and logging 
    Pandas data conversion pipelines"""
    @wraps(func)
    def inner(df, *a, **kwa):
        shape_before = df.shape
        ts = time()
        result = func(df, *a, **kwa)
        te = time()
        print(f'{func.__name__} took {te-ts:2.4f} sec')
        print(f'    shape {shape_before} -> {result.shape}')
        return result
    return inner

In [6]:
def load_data(path='./data-001.csv', *a, **kwa):
    return pd.read_csv(path, *a, **kwa)

@logger
def start_pipeline(df):
    return df.copy(deep=True)

@logger
def remove_cols_for_removal(df):
    return df.drop(cols_for_removal, axis=1)

def print_unique_vals(dataf):
    for col in dataf.columns:
        if col != 'date_time':
            print(f'{col}: {len(dataf[col].unique())}')

In [7]:
%%time

raw_data = (
#     load_data(nrows=100)
#     load_data(nrows=10_000)
    load_data(nrows=None)
    .pipe(remove_cols_for_removal)
)

remove_cols_for_removal took 0.2696 sec
    shape (15821472, 17) -> (15821472, 8)
CPU times: user 18.5 s, sys: 1.81 s, total: 20.3 s
Wall time: 23 s


In [8]:
raw_data.dtypes

date_time          object
zone_id             int64
banner_id           int64
campaign_clicks     int64
os_id               int64
country_id          int64
impressions         int64
clicks              int64
dtype: object

Let's first convert data to more appropriate data types. Namely:

    - date_time: `pd.to_datetime()`
    - zone_id, banner_id, campaign_clicks, os_id, country_id: int16
    - impressions: bool
    - clicks: bool

In [9]:
@logger
def manual_dtypes(df):
    df['date_time'] = pd.to_datetime(df['date_time'])
    
    bool_cols = ['clicks', 'impressions']
    for col in bool_cols:
        df[col] = df[col].astype(bool)
    
    int16_cols = ['zone_id','banner_id','campaign_clicks','os_id','country_id']
    for col in int16_cols:
        df[col] = df[col].astype(np.int16)
    
    return df

In [10]:
@logger
def drop_impressions(df):
    return df.drop('impressions', axis=1)

In [11]:
df = (
    raw_data
    .pipe(start_pipeline)
    .pipe(manual_dtypes)
    .pipe(drop_impressions)
)

start_pipeline took 0.2384 sec
    shape (15821472, 8) -> (15821472, 8)
manual_dtypes took 2.1864 sec
    shape (15821472, 8) -> (15821472, 8)
drop_impressions took 0.0817 sec
    shape (15821472, 8) -> (15821472, 7)


In [12]:
# see that our choice of data type was justified
print_unique_vals(df)

zone_id: 3444
banner_id: 1633
campaign_clicks: 822
os_id: 11
country_id: 17
clicks: 2


- date_time - время показа рекламы
- zone_id - id зоны, где зона - место на сайте для размещения рекламы
- banner_id - id баннера
- campaign_clicks - общее количество показов данной кампании (которой соотвествует баннер) данному юзеру, произошедшие до текущего показа
- os_id - id операционной системы
- country_id - id страны
- clicks - был ли клик

Let's split features by user and banner ones

In [13]:
user_features = ['os_id','country_id','campaign_clicks']
banner_features = ['banner_id','zone_id']

Let's add some trivial features with the datetime

Since we don't know the local timezone, we'll use day and month features only

In [14]:
@logger
def add_datetime_features(df):
    props = ['dayofweek',
             'week',  # eight weeks
             'month', # only two months in dataset
            ]
    
    dt_object = df['date_time'].dt
    for prop in props:
        df[prop] = getattr(dt_object, prop).astype(np.int16)
    
    df['is_weekend'] = dt_object.weekday > 5
    return df

In [15]:
%%time

df = (
    raw_data
    .pipe(start_pipeline)
    .pipe(manual_dtypes)
    .pipe(drop_impressions)
    .pipe(add_datetime_features)
    .drop('date_time', axis=1)
)

start_pipeline took 0.2420 sec
    shape (15821472, 8) -> (15821472, 8)
manual_dtypes took 1.8547 sec
    shape (15821472, 8) -> (15821472, 8)
drop_impressions took 0.0801 sec
    shape (15821472, 8) -> (15821472, 7)


  df[prop] = getattr(dt_object, prop).astype(np.int16)


add_datetime_features took 4.8468 sec
    shape (15821472, 7) -> (15821472, 11)
CPU times: user 7.04 s, sys: 159 ms, total: 7.2 s
Wall time: 7.16 s


Let's calculate how much variables we'll have once we OHE the categorical features

In [16]:
print_unique_vals(df)

zone_id: 3444
banner_id: 1633
campaign_clicks: 822
os_id: 11
country_id: 17
clicks: 2
dayofweek: 7
week: 3
month: 2
is_weekend: 2


In [17]:
df.shape

(15821472, 10)

It seems that, given the dataset size, OHE of all the features would be reasonable. However, the output would be now a sparse matrix, since it's enormous in size.

In [18]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(df):
    enc = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
    X = enc.fit_transform(df)
    return X, enc


In [19]:
%%time

tmp = \
(
    raw_data
    .pipe(start_pipeline)
    .pipe(manual_dtypes)
    .pipe(drop_impressions)
    .pipe(add_datetime_features)
    .drop('date_time', axis=1)
)

X, enc = one_hot_encode(tmp)
del tmp

start_pipeline took 0.2406 sec
    shape (15821472, 8) -> (15821472, 8)
manual_dtypes took 1.8444 sec
    shape (15821472, 8) -> (15821472, 8)
drop_impressions took 0.0804 sec
    shape (15821472, 8) -> (15821472, 7)


  df[prop] = getattr(dt_object, prop).astype(np.int16)


add_datetime_features took 4.8410 sec
    shape (15821472, 7) -> (15821472, 11)
CPU times: user 22.6 s, sys: 585 ms, total: 23.2 s
Wall time: 23.1 s


In [20]:
X.shape

(15821472, 5940)

Let's also produce interaction features: namely, the multiplier of these ones:

```python
user_features = ['os_id','country_id','campaign_clicks','dayofweek','week','month']
banner_features = ['banner_id','zone_id']
```

In [21]:
n_user_features = sum([1 for f in enc.get_feature_names_out() if '_'.join(f.split('_')[:-1]) in user_features])
n_banner_features = sum([1 for f in enc.get_feature_names_out() if '_'.join(f.split('_')[:-1]) in banner_features])

n_interactions = n_user_features*n_banner_features
print(f'{n_user_features=}, {n_banner_features=}, {n_interactions=}')

n_user_features=850, n_banner_features=5077, n_interactions=4315450


So total `850*5077 = 4315450` features might get added to the dataset, which way to much.

Let's for now exclude the `campaign_clicks` with ~800 possible values from the user_features, and indeed add the interaction features to the dataset

In [22]:
n_user_features = sum([1 for f in enc.get_feature_names_out() 
                       if '_'.join(f.split('_')[:-1]) in 
                       [name for name in user_features if name != 'campaign_clicks']])
n_banner_features = sum([1 for f in enc.get_feature_names_out() 
                         if '_'.join(f.split('_')[:-1]) in banner_features])

n_interactions = n_user_features*n_banner_features
print(f'{n_user_features=}, {n_banner_features=}, {n_interactions=}')

n_user_features=28, n_banner_features=5077, n_interactions=142156


In [182]:
user_features_full = \
[f for f in enc.get_feature_names_out() 
 if '_'.join(f.split('_')[:-1]) in 
 [name for name in user_features if name != 'campaign_clicks']]

banner_features_full = \
[f for f in enc.get_feature_names_out() 
 if '_'.join(f.split('_')[:-1]) in banner_features]

In [183]:
groupA = user_features_full
groupB = banner_features_full

names = list(enc.get_feature_names_out())

iA = np.array([names[i] in groupA for i in range(X.shape[1])])
iB = np.array([names[i] in groupB for i in range(X.shape[1])])
iR = np.ones(X.shape[1]).astype(bool) ^ (iA | iB)

xA = X[:,iA]
xB = X[:,iB]
xR = X[:, iR]

xA_names = np.array(names)[iA]
xB_names = np.array(names)[iB]
xR_names = np.array(names)[iR]

In [184]:
xA = X[:,iA]
xB = X[:,iB]
xR = X[:, iR]

In [185]:
xA.shape, xB.shape, xR.shape

((15821472, 28), (15821472, 5077), (15821472, 835))

In [158]:
from scipy import sparse

In [189]:
%%time
rv = []
xAB_names = []
for nA, row in zip(xA_names, xA.T):
    for nB, col in zip(xB_names, xB.T):
        xAB_names.append(f'{nA}_x_{nB}')
        rv.append(row.multiply(col).T)
print('--')
xAB = sparse.hstack(rv)

0
1


KeyboardInterrupt: 

In [178]:
np.unique()

In [284]:
sparse.hstack(X_int)

ValueError: blocks must be 2-D

In [280]:
X_int[0].shape

(1, 1000)