# Feature engineering

In [1]:
import os

# Root directory that holds the competition CSV files read by this notebook.
# The location is machine-specific, so it can be overridden through the
# TALKING_DATA_PATH environment variable without editing the notebook;
# when the variable is not set, the original path is used unchanged.
data_path = os.environ.get('TALKING_DATA_PATH', '/home/pavel/P/kaggle_data/talking_data')

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [3]:
import os
import joblib

### Формируем общий датафрейм

In [4]:
# Load the train devices; device_id is read as int64 and age as int8.
train_csv = os.path.join(data_path, 'gender_age_train.csv')
gender_age_train = pd.read_csv(train_csv, dtype={'device_id': np.int64, 'age': np.int8})

# Keep the three target columns as separate Series and remove them from the
# train frame so that it can be stacked with the test frame.
gender_train = gender_age_train['gender']
age_train = gender_age_train['age']
group_train = gender_age_train['group']
gender_age_train = gender_age_train.drop(['gender', 'age', 'group'], axis='columns')

# Load the test devices with the same dtype mapping.
test_csv = os.path.join(data_path, 'gender_age_test.csv')
gender_age_test = pd.read_csv(test_csv, dtype={'device_id': np.int64, 'age': np.int8})

# Number of train rows: the first train_len rows of the stacked frame are train.
train_len = len(gender_age_train)

# Stack train on top of test row-wise; each part keeps its own index labels.
gender_age = pd.concat([gender_age_train, gender_age_test])

# Sanity check: the stacked frame has exactly train + test rows.
assert len(gender_age) == train_len + len(gender_age_test)

In [5]:
# Preview the first rows of the stacked train + test device table.
gender_age.head()

Unnamed: 0,device_id
0,-8076087639492063270
1,-2897161552818060146
2,-8260683887967679142
3,-4938849341048082022
4,245133531816851882


## Поехали!

### Общее количество событий для каждого устройства

In [6]:
# Load the raw event log.
# Coordinates are read as float32 instead of float16: float16 has only an
# 11-bit significand, so any coordinate with magnitude >= 64 would be rounded
# to a 1/16-degree grid (or coarser), which distorts statistics computed from
# the longitude/latitude columns.
events = pd.read_csv(
    os.path.join(data_path, 'events.csv'),
    dtype={'event_id': np.int64, 'device_id': np.int64,
           'longitude': np.float32, 'latitude': np.float32},
)

In [7]:
# Number of logged events per device.
# groupby(...).count() always yields a 1-D Series. pivot_table returns a
# single-column DataFrame on pandas >= 0.20, and zipping that frame's 2-D
# `.values` with the index stores one-element arrays in the feature column.
device_event_count = events.groupby('device_id')['event_id'].count()

# Two-column frame (device_id, device_event_count) ready to be merged on device_id.
device_event_count_df = device_event_count.reset_index(name='device_event_count')

In [8]:
# Start the feature table: every train/test device is kept (left join), and
# devices without a row in device_event_count_df get NaN for the count.
all_data = gender_age.merge(device_event_count_df, how='left', on='device_id')

In [9]:
# Preview the feature table after adding device_event_count.
all_data.head()

Unnamed: 0,device_id,device_event_count
0,-8076087639492063270,
1,-2897161552818060146,
2,-8260683887967679142,1.0
3,-4938849341048082022,
4,245133531816851882,


### Среднее количество установленных приложений на событие для каждого устройства

In [10]:
# Load the event -> app records, restricted to the four columns listed in
# usecols, with compact integer dtypes for the two flag columns.
app_events = pd.read_csv(
    os.path.join(data_path, 'app_events.csv'),
    usecols=['is_installed', 'is_active', 'event_id', 'app_id'],
    dtype={
        'is_installed': np.int8,
        'is_active': np.int8,
        'event_id': np.int64,
        'app_id': np.int64,
    },
)

In [11]:
# Number of app records per event.
# groupby(...).count() always returns a 1-D Series indexed by event_id, so
# `.index` / `.values` line up element by element. pivot_table returns a
# single-column DataFrame on pandas >= 0.20, whose `.values` is 2-D.
event_app_count = app_events.groupby('event_id')['app_id'].count()

In [12]:
# One row per event with its app-record count.
# np.ravel flattens the counts whether event_app_count is a Series (1-D values)
# or a single-column DataFrame (2-D values), so the column always holds scalars
# rather than one-element arrays.
event_app_count_df = pd.DataFrame(
    {'event_id': event_app_count.index,
     'event_app_count': np.ravel(event_app_count.values)},
    columns=['event_id', 'event_app_count'],
)

# Attach the per-event counts to the event log (inner join: only events that
# have app records remain) and average them per device.
device_app_mean = pd.merge(events, event_app_count_df, on='event_id', how='inner')\
    .groupby('device_id')['event_app_count'].mean()

# Two-column frame (device_id, device_app_mean) ready to be merged on device_id.
device_app_mean_df = device_app_mean.reset_index(name='device_app_mean')

In [13]:
# Add device_app_mean to the feature table; the left join keeps every device.
# Note: all_data is rebound from itself, so this cell is meant to run once per
# rebuild of all_data.
all_data = all_data.merge(device_app_mean_df, how='left', on='device_id')

In [14]:
# Preview the feature table after adding device_app_mean.
all_data.head()

Unnamed: 0,device_id,device_event_count,device_app_mean
0,-8076087639492063270,,
1,-2897161552818060146,,
2,-8260683887967679142,1.0,53.0
3,-4938849341048082022,,
4,245133531816851882,,


### Среднее количество активных приложений на событие для каждого устройства

In [15]:
# Number of active app records (is_active == 1) per event.
# groupby(...).count() yields a 1-D Series. pivot_table returns a single-column
# DataFrame on pandas >= 0.20, and zipping its 2-D `.values` with the index
# stores one-element arrays in the feature column.
event_active_app_count = app_events[app_events['is_active'] == 1]\
    .groupby('event_id')['app_id'].count()
event_active_app_count_df = event_active_app_count.reset_index(name='event_active_app_count')

# Attach the per-event active counts to the event log (inner join: only events
# with at least one active app record remain) and average them per device.
device_active_app_mean = pd.merge(events, event_active_app_count_df, on='event_id', how='inner')\
    .groupby('device_id')['event_active_app_count'].mean()

# Two-column frame (device_id, device_active_app_mean) ready to be merged on device_id.
device_active_app_mean_df = device_active_app_mean.reset_index(name='device_active_app_mean')

In [16]:
# Add device_active_app_mean to the feature table; the left join keeps every
# device. all_data is rebound from itself, so run this cell once per rebuild.
all_data = all_data.merge(device_active_app_mean_df, how='left', on='device_id')

In [17]:
# Preview the feature table after adding device_active_app_mean.
all_data.head()

Unnamed: 0,device_id,device_event_count,device_app_mean,device_active_app_mean
0,-8076087639492063270,,,
1,-2897161552818060146,,,
2,-8260683887967679142,1.0,53.0,6.0
3,-4938849341048082022,,,
4,245133531816851882,,,


### Разброс координат (стандартное отклонение долготы и широты по устройству)

In [18]:
# Per-device spread of the event coordinates.
# groupby(...).std() yields a 1-D Series. pivot_table returns a single-column
# DataFrame on pandas >= 0.20, and zipping its 2-D `.values` with the index
# stores one-element arrays in the feature column.
# std() is the sample standard deviation, which is NaN for a device with a
# single event; those rows are dropped so that only devices with a defined
# spread are listed.
device_lon_std = events.groupby('device_id')['longitude'].std().dropna()
device_lon_std_df = device_lon_std.reset_index(name='lon_std')

device_lat_std = events.groupby('device_id')['latitude'].std().dropna()
device_lat_std_df = device_lat_std.reset_index(name='lat_std')

In [19]:
# Add both coordinate-spread columns to the feature table. Each left join
# keeps every device; longitude is merged first, then latitude. all_data is
# rebound from itself, so run this cell once per rebuild.
all_data = (
    all_data
    .merge(device_lon_std_df, how='left', on='device_id')
    .merge(device_lat_std_df, how='left', on='device_id')
)

In [20]:
# Preview the feature table after adding lon_std and lat_std.
all_data.head()

Unnamed: 0,device_id,device_event_count,device_app_mean,device_active_app_mean,lon_std,lat_std
0,-8076087639492063270,,,,,
1,-2897161552818060146,,,,,
2,-8260683887967679142,1.0,53.0,6.0,,
3,-4938849341048082022,,,,,
4,245133531816851882,,,,,
