### Dealing with imports...

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing

### Loading data...

In [2]:
# Directory holding the competition CSV files. Set the TROCAFONE_DATA_DIR environment
# variable to point somewhere else; the fallback is the path the notebook was written with.
DATA_DIR = Path(os.environ.get(
    'TROCAFONE_DATA_DIR',
    '/home/miki_mustard/Desktop/Facultad/Datos/TP2/fiuba-trocafone-tp2-final-set'))

# low_memory=False makes pandas read each file in one pass instead of in chunks.
events_df = pd.read_csv(DATA_DIR / 'events_up_to_01062018.csv', low_memory=False)
labels_df = pd.read_csv(DATA_DIR / 'labels_training_set.csv', low_memory=False)

## Data partition

In [3]:
# armo df con registros completos clasificados
# train_df = events_df.merge(labels_df, on='person', how='right')

In [4]:
# armo df con registros a predecir unicamente
# to_predict = events_df[~events_df['person'].isin(labels_df.person)].copy()

In [5]:
# train_df.shape

In [6]:
# labels_df.shape

In [7]:
# to_predict.shape

In [8]:
# train_df.person.nunique()

In [9]:
# to_predict.person.nunique()

In [10]:
events_df.person.nunique()

38829

## Processing

In [11]:
# first save original features to know which have been processed
init_features = list(events_df.columns)

### Dates (timestamp)

In [12]:
# some date processing
def date_proc(df):
    """Parse df['timestamp'] and add calendar columns to df in place.

    The 'timestamp' column is converted to datetime, then 'year', 'month',
    'day', 'weekday' (English day name) and 'hour' are appended in that order.
    Returns None.
    """
    parsed = pd.to_datetime(df['timestamp'])
    df['timestamp'] = parsed
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed.dt, part)
    df['weekday'] = parsed.dt.day_name()
    df['hour'] = parsed.dt.hour
    
date_proc(events_df)

Primero hacemos un label encoding con el weekday; luego aplicamos una transformación que contemple la naturaleza cíclica de la semana. Esto último lo aplicaremos también al resto de los features cíclicos (como se explica, por ejemplo, acá: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/).

In [13]:
# Encode day names as chronological integers (Monday=0 ... Sunday=6).
# A LabelEncoder numbers its classes in sorted (alphabetical) order -- Friday=0,
# Monday=1, Saturday=2, ... -- which scrambles the week and makes the sin/cos
# transform applied afterwards place non-adjacent days next to each other.
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_to_index = {day_name: position for position, day_name in enumerate(WEEKDAY_ORDER)}

events_df['weekday'] = events_df['weekday'].map(weekday_to_index)

In [14]:
def month_to_cyclic(df):
    """Replace df['month'] in place with its cyclic encoding over a 12-month period.

    Adds 'month_sin' and 'month_cos' and drops 'month'. Returns None.
    """
    # One vectorized ufunc call on the whole column instead of a Python lambda per row.
    angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(angle)
    df['month_cos'] = np.cos(angle)
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    """Replace df['day'] in place with its cyclic encoding over a 31-day period.

    Adds 'day_sin' and 'day_cos' and drops 'day'. Returns None.
    """
    # One vectorized ufunc call on the whole column instead of a Python lambda per row.
    angle = 2 * np.pi * df['day'] / 31
    df['day_sin'] = np.sin(angle)
    df['day_cos'] = np.cos(angle)
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    """Replace the numeric df['weekday'] in place with its cyclic encoding over 7 days.

    Adds 'weekday_sin' and 'weekday_cos' and drops 'weekday'. Returns None.
    """
    # One vectorized ufunc call on the whole column instead of a Python lambda per row.
    angle = 2 * np.pi * df['weekday'] / 7
    df['weekday_sin'] = np.sin(angle)
    df['weekday_cos'] = np.cos(angle)
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    """Replace df['hour'] in place with its cyclic encoding over a 24-hour period.

    Adds 'hour_sin' and 'hour_cos' and drops 'hour'. Returns None.
    """
    # One vectorized ufunc call on the whole column instead of a Python lambda per row.
    angle = 2 * np.pi * df['hour'] / 24
    df['hour_sin'] = np.sin(angle)
    df['hour_cos'] = np.cos(angle)
    df.drop('hour', axis=1, inplace=True)

In [15]:
month_to_cyclic(events_df)
day_to_cyclic(events_df)
weekday_to_cyclic(events_df)
hour_to_cyclic(events_df)

In [16]:
events_df.drop('timestamp', axis=1, inplace=True)

In [17]:
init_features.remove('timestamp')

In [18]:
events_df.person.nunique()

38829

## screen_resolution

In [19]:
def get_screen_width(x):
    """Return the width of a "WIDTHxHEIGHT" resolution string as an int.

    An empty string or any non-string value (NaN, None) counts as missing
    and yields 0 instead of raising.
    """
    if not isinstance(x, str) or x == "":
        return 0
    return int(x.split("x")[0])
    
def get_screen_height(x):
    """Return the height of a "WIDTHxHEIGHT" resolution string as an int.

    An empty string or any non-string value (NaN, None) counts as missing
    and yields 0 instead of raising.
    """
    if not isinstance(x, str) or x == "":
        return 0
    return int(x.split("x")[1])

def process_screen_res(df):
    """Split df['screen_resolution'] into numeric 'screen_width' / 'screen_height' in place.

    Missing resolutions become width 0 and height 0. The original
    'screen_resolution' column is dropped. Returns None.
    """
    # Work on the filled Series directly: df[col].fillna(..., inplace=True) acts on a
    # temporary object under pandas Copy-on-Write, so the NaNs would stay in df.
    resolution = df['screen_resolution'].fillna("")
    df['screen_width'] = resolution.apply(get_screen_width)
    df['screen_height'] = resolution.apply(get_screen_height)
    df.drop('screen_resolution', axis=1, inplace=True)

In [20]:
process_screen_res(events_df)

In [21]:
init_features.remove('screen_resolution')

## storage

In [22]:
def process_storage_string(x):
    """Convert a storage label such as "16GB" or "512MB" into gigabytes.

    Missing values give 0, GB labels give an int, MB labels give a float
    (megabytes divided by 1024).
    """
    if pd.isna(x):
        return 0
    gb_parts = x.split("GB")
    # A single "GB" marker splits the label in two; anything else is read as megabytes.
    if len(gb_parts) != 2:
        return int(x.split("MB")[0]) / 1024
    return int(gb_parts[0])

def storage_process(df):
    """Overwrite df.storage in place with the numeric value from process_storage_string."""
    converted = df.storage.apply(lambda label: process_storage_string(label))
    df.storage = converted

In [23]:
storage_process(events_df)

In [24]:
init_features.remove('storage')

## Browser

In [25]:
# look at the distinct values left when only the first word of browser_version is kept
new_list = {
    version.split()[0]
    for version in events_df.browser_version.unique()
    if isinstance(version, str)  # skip NaN
}

new_list

{'Android',
 'BingPreview',
 'BlackBerry',
 'Chrome',
 'Chromium',
 'Edge',
 'Facebook',
 'Firefox',
 'IE',
 'K-Meleon',
 'Maxthon',
 'Mobile',
 'Opera',
 'Other',
 'Pinterest',
 'Puffin',
 'Safari',
 'Samsung',
 'UC',
 'Vivaldi',
 'WebKit',
 'Yandex'}

Que seran...?
* Android (no es un browser en si)
* K-Meleon
* Maxthon
* Mobile
* Pinterest
* Puffin
* Samsung
* UC
* WebKit
* Yandex

Se confirma que son browsers los siguientes:
* Android (no es un browser en si)
* K-Meleon (http://kmeleonbrowser.org/)
* Maxthon (http://www.maxthon.com/)
* Mobile
* Pinterest
* Puffin (Puffin Browser is a web browser released by CloudMosa for mobile operating systems Android, iOS, Windows and Android TV.)
* Samsung (Samsung Internet for Android is a mobile web browser for smartphones and tablets developed by Samsung. It is based on the open-source Chromium project. It is pre-installed on Samsung Galaxy devices.)
* UC (UC Browser is a web browser developed by the Chinese mobile Internet company UCWeb, which is in turn owned by the Alibaba Group. As of August 2018 it is the third most popular mobile browser in the world by market share, after Google Chrome and Safari.)
* WebKit (https://webkit.org/)
* Yandex: un browser (https://browser.yandex.com/)

Nos queda revisar los siguientes:
* Android (no es un browser en si)
* Mobile
* Pinterest

In [26]:
# print every browser_version whose first word is one of the prefixes still under review
for version in events_df.browser_version.unique():
    if not isinstance(version, str):
        continue
    if version.lower().split()[0] in ['android', 'mobile', 'pinterest']:
        print(version)

Mobile Safari 11
Mobile Safari 9
Android 5.1
Mobile Safari 10
Android 4.4
Android 4.1
Android 4.3
Mobile Safari 8
Android 2.3
Mobile Safari 7
Android 4.0
Android 4.2
Mobile Safari UI/WKWebView 10.2
Mobile Safari UI/WKWebView 10.3
Mobile Safari 10.3
Mobile Safari 11.2
Mobile Safari UI/WKWebView 7.1
Mobile Safari 7.1
Mobile Safari 9.3
Mobile Safari UI/WKWebView 11.2
Mobile Safari 6
Pinterest
Mobile Safari 9.2
Mobile Safari 4.0
Mobile Safari UI/WKWebView 9.3
Mobile Safari 11.1
Mobile Safari 10.1
Android 7
Mobile Safari 11.0
Mobile Safari 7.0
Android 3.2
Mobile Safari UI/WKWebView 11.3
Mobile Safari 5.1


***
O sea que todos los _Mobile_ son `Safari`, todos los _Android_ son meramente eso, Android, y _Pinterest_ aparentemente es un browser también (?).

Conclusión: puede quedarse solamente el primer string del campo para representar el browser, ya que a nadie le interesa la versión del mismo.
***

In [27]:
def proc_browser(x):
    """Return the lower-cased first whitespace-separated word of x, or None if x is not a string."""
    if not isinstance(x, str):
        return None
    first_word = x.lower().split()[0]
    return first_word

In [28]:
# Reduce each browser_version string to its first word; non-string values (NaN) pass through unchanged.
browser_family = events_df['browser_version'].apply(
    lambda raw: raw if not isinstance(raw, str) else proc_browser(raw)
)
events_df['browser_version'] = browser_family

## operating_system_version
Se procede analogamente a browser_version

In [29]:
events_df.operating_system_version.unique()

array([nan, 'Android 5.0.2', 'Ubuntu ', 'Android 7', 'Android 6.0.1',
       'Windows 7 ', 'Windows 10 ', 'iOS 11.0.3', 'Android 6',
       'Android 4.4.4', 'Android 7.1.1', 'Mac OS X 10.12.6',
       'Android 5.1', 'Windows 8.1 ', 'Android 5.1.1', 'Android 8.1',
       'Windows 8 ', 'iOS 9.3.5', 'Android 4.2.2', 'Android 5',
       'iOS 11.3', 'Android 4.1.2', 'Android 4.4.2', 'Android 5.0.1',
       'iOS 11.1.1', 'Windows XP ', 'iOS 10.3.3', 'Windows Phone 8.1',
       'Chrome OS 10452.85', 'Android 8', 'Mac OS X 10.10.4',
       'iOS 11.2.6', 'Android ', 'Android 4.3', 'Mac OS X 10.11.6',
       'Windows Vista ', 'iOS 11.1.2', 'Fedora ', 'Windows Phone 10',
       'Linux ', 'Mac OS X 10.13.4', 'Android 7.1.2', 'iOS 8.1.3',
       'iOS 11.2.1', 'Android 4.0.3', 'FreeBSD ', 'iOS 11.2.2',
       'Android 2.3.6', 'iOS 10.2.1', 'iOS 7.1.2', 'Android 4.0.4',
       'Mac OS X 10.7.5', 'Chrome OS 9901.77', 'Chrome OS 10323.67',
       'Chrome OS 10452.96', 'Other ', 'iOS 8.1.1', 'iOS 11.0.2

In [30]:
# look at the distinct values left when only the first word of operating_system_version is kept
new_list = {
    version.split()[0]
    for version in events_df.operating_system_version.unique()
    if isinstance(version, str)  # filter out the NaN
}

new_list

{'Android',
 'BlackBerry',
 'Chrome',
 'Fedora',
 'FreeBSD',
 'Linux',
 'Mac',
 'Other',
 'Symbian',
 'Tizen',
 'Ubuntu',
 'Windows',
 'iOS'}

In [31]:
def proc_os(x):
    """Return the lower-cased first whitespace-separated word of x, or None if x is not a string."""
    if not isinstance(x, str):
        return None
    words = x.lower().split()
    return words[0]

In [32]:
# Reduce each operating_system_version string to its first word; non-string values (NaN) pass through unchanged.
os_family = events_df['operating_system_version'].apply(
    lambda raw: raw if not isinstance(raw, str) else proc_os(raw)
)
events_df['operating_system_version'] = os_family

## device_type
Se procede analogamente a browser_version

In [33]:
events_df.device_type.unique()

array([nan, 'Smartphone', 'Computer', 'Tablet', 'Unknown'], dtype=object)

Como son campos de una sola palabra, no tiene sentido hacer lo que se hizo con los anteriores casos.

***

## Limited value categorical features
Lets process features which receive a limited number of values.

### event

In [34]:
# Per-person share of each `event` value (one `event_<value>` column per value),
# merged back onto every row of that person.
event_shares = (
    events_df.groupby('person')['event']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('event_')
    .reset_index()
)
events_df = events_df.merge(event_shares, on='person', how='left')

In [35]:
events_df.drop('event',axis=1,inplace=True)

In [36]:
init_features.remove('event')

### staticpage

In [37]:
# Per-person share of each `staticpage` value (one `staticpage_<value>` column per value),
# merged back onto every row of that person.
staticpage_shares = (
    events_df.groupby('person')['staticpage']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('staticpage_')
    .reset_index()
)
events_df = events_df.merge(staticpage_shares, on='person', how='left')

In [38]:
events_df.drop('staticpage',axis=1,inplace=True)

In [39]:
init_features.remove('staticpage')

### campaign_source

In [40]:
# Per-person share of each `campaign_source` value (one `campaign_source_<value>` column
# per value), merged back onto every row of that person.
campaign_source_shares = (
    events_df.groupby('person')['campaign_source']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('campaign_source_')
    .reset_index()
)
events_df = events_df.merge(campaign_source_shares, on='person', how='left')

In [41]:
events_df.drop('campaign_source',axis=1,inplace=True)

In [42]:
init_features.remove('campaign_source')

### search_engine

In [43]:
# Per-person share of each `search_engine` value (one `search_engine_<value>` column
# per value), merged back onto every row of that person.
search_engine_shares = (
    events_df.groupby('person')['search_engine']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('search_engine_')
    .reset_index()
)
events_df = events_df.merge(search_engine_shares, on='person', how='left')

In [44]:
events_df.drop('search_engine',axis=1,inplace=True)

In [45]:
init_features.remove('search_engine')

### channel

In [46]:
# Per-person share of each `channel` value (one `channel_<value>` column per value),
# merged back onto every row of that person.
channel_shares = (
    events_df.groupby('person')['channel']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('channel_')
    .reset_index()
)
events_df = events_df.merge(channel_shares, on='person', how='left')

In [47]:
events_df.drop('channel',axis=1,inplace=True)

In [48]:
init_features.remove('channel')

### new_vs_returning

In [49]:
# Per-person share of each `new_vs_returning` value (one `new_vs_returning_<value>` column
# per value), merged back onto every row of that person.
new_vs_returning_shares = (
    events_df.groupby('person')['new_vs_returning']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('new_vs_returning_')
    .reset_index()
)
events_df = events_df.merge(new_vs_returning_shares, on='person', how='left')

In [50]:
events_df.drop('new_vs_returning',axis=1,inplace=True)

In [51]:
init_features.remove('new_vs_returning')

### device_type

In [52]:
# Per-person share of each `device_type` value (one `device_type_<value>` column per value),
# merged back onto every row of that person.
device_type_shares = (
    events_df.groupby('person')['device_type']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('device_type_')
    .reset_index()
)
events_df = events_df.merge(device_type_shares, on='person', how='left')

In [53]:
events_df.drop('device_type',axis=1,inplace=True)

In [54]:
init_features.remove('device_type')

### operating_system_version

In [55]:
# Per-person share of each `operating_system_version` value (one
# `operating_system_version_<value>` column per value), merged back onto every row of that person.
os_shares = (
    events_df.groupby('person')['operating_system_version']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('operating_system_version_')
    .reset_index()
)
events_df = events_df.merge(os_shares, on='person', how='left')

In [56]:
events_df.drop('operating_system_version',axis=1,inplace=True)

In [57]:
init_features.remove('operating_system_version')

### browser_version

In [58]:
# Per-person share of each `browser_version` value (one `browser_version_<value>` column
# per value), merged back onto every row of that person.
browser_shares = (
    events_df.groupby('person')['browser_version']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('browser_version_')
    .reset_index()
)
events_df = events_df.merge(browser_shares, on='person', how='left')

In [59]:
events_df.drop('browser_version',axis=1,inplace=True)

In [60]:
init_features.remove('browser_version')

### condition

In [61]:
# Per-person share of each `condition` value (one `condition_<value>` column per value),
# merged back onto every row of that person.
condition_shares = (
    events_df.groupby('person')['condition']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .add_prefix('condition_')
    .reset_index()
)
events_df = events_df.merge(condition_shares, on='person', how='left')

In [62]:
events_df.drop('condition',axis=1,inplace=True)

In [63]:
init_features.remove('condition')

In [64]:
# Read the raw events file a second time into `events`: events_df has been transformed
# above (timestamp, event, new_vs_returning, ... were dropped), and the feature-generation
# section below needs those original columns.
events = pd.read_csv('/home/miki_mustard/Desktop/Facultad/Datos/TP2/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv', low_memory=False)

## Generacion de features

In [65]:
features_df = pd.DataFrame()

In [66]:
features_df['person'] = events.person.unique()

In [67]:
features_df.head()

Unnamed: 0,person
0,4886f805
1,ad93850f
2,0297fc1e
3,2d681dd8
4,cccea85e


## Cantidad de eventos totales por usuario

In [68]:
df = events.groupby('person').event.count()

In [69]:
features_df = features_df.join(df, on='person')

In [70]:
features_df.rename(columns={'event':'cant_eventos_totales'}, inplace=True)

## Cantidad de cada evento por usuario

In [71]:
df = events.groupby('person').event.value_counts()

In [72]:
df = df.unstack()
df.head()

event,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0008ed71,,,3.0,,1.0,,,,,,2.0
00091926,15.0,25.0,2.0,,,,,,,372.0,34.0
00091a7a,1.0,5.0,,,,,,,,3.0,1.0
000ba417,1.0,24.0,6.0,1.0,14.0,,1.0,,,153.0,6.0
000c79fe,1.0,,1.0,,1.0,,1.0,9.0,,3.0,1.0


In [73]:
df = df.drop(columns=['lead'])

In [74]:
# Interaction features: per-person 'ad campaign hit' count multiplied by each other event count.
ad_hits = df['ad campaign hit']
interaction_columns = [
    ('ach*bl', 'brand listing'),
    ('ach*gl', 'generic listing'),
    ('ach*seh', 'search engine hit'),
    ('ach*sp', 'searched products'),
    ('ach*vp', 'viewed product'),
    ('ach*vs', 'visited site'),
]
for new_column, other_event in interaction_columns:
    df[new_column] = ad_hits * df[other_event]

In [75]:
features_df = features_df.join(df, on='person')

In [76]:
# Short names for the per-event count columns. 'lead' is listed as well, although that
# column was dropped from df before the join.
count_column_names = {
    'ad campaign hit': 'cant_ach',
    'brand listing': 'cant_bl',
    'checkout': 'cant_ch',
    'conversion': 'cant_cv',
    'generic listing': 'cant_gl',
    'lead': 'cant_l',
    'search engine hit': 'cant_seh',
    'searched products': 'cant_s_p',
    'staticpage': 'cant_sp',
    'viewed product': 'cant_vp',
    'visited site': 'cant_vs',
}
features_df.rename(columns=count_column_names, inplace=True)

## cantidad de returnings

In [77]:
# Count 'New' / 'Returning' events per person and keep only the 'Returning' column.
visit_kind_counts = events.groupby('person').new_vs_returning.value_counts().unstack()
df = visit_kind_counts.drop(columns=['New'])

In [78]:
features_df = features_df.join(df, on='person')

## Quincenas

In [79]:
# Parse the timestamps and expose month ('mes') and day of month ('dia') as columns.
events['timestamp'] = pd.to_datetime(events['timestamp'])
event_dates = events['timestamp'].dt
events['mes'] = event_dates.month
events['dia'] = event_dates.day

In [80]:
# Flag the fortnight each event falls in: q1/q2 are the two halves of January,
# q3/q4 of February, ... q9/q10 of May.
# The first half covers days 1-15 (the earlier `dia>1` bound left every event from
# the 1st of a month outside all fortnights); the second half is every later day,
# so no month length has to be hard-coded.
first_half = events['dia'] <= 15
second_half = events['dia'] > 15
for month in range(1, 6):
    in_month = events['mes'] == month
    events['q{}'.format(2 * month - 1)] = in_month & first_half
    events['q{}'.format(2 * month)] = in_month & second_half

In [81]:
# One row per person: True for every fortnight in which the person has at least one event.
fortnight_columns = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10']
df = events[['person'] + fortnight_columns].groupby('person').any()
df.head()

Unnamed: 0_level_0,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0008ed71,False,False,False,False,False,False,False,False,False,True
00091926,False,False,False,False,False,False,False,False,True,True
00091a7a,False,False,False,False,False,True,False,False,False,False
000ba417,False,False,False,False,False,False,False,False,False,True
000c79fe,False,False,False,False,False,False,False,False,False,True


In [82]:
features_df = features_df.join(df, on='person')

## Tiempo total

In [83]:
# Repeats the datetime conversion already applied to events['timestamp'] in the
# "Quincenas" section above.
events['timestamp'] = pd.to_datetime(events['timestamp'])

In [84]:
# Time span between each person's first and last event.
person_timestamps = events.groupby('person').timestamp
df = person_timestamps.max() - person_timestamps.min()
df.head()

person
0008ed71    0 days 04:00:50
00091926   27 days 21:43:34
00091a7a    0 days 00:05:47
000ba417    9 days 01:57:37
000c79fe    0 days 00:10:20
Name: timestamp, dtype: timedelta64[ns]

In [85]:
features_df = features_df.join(df, on='person')

In [86]:
features_df = features_df.rename(columns={'timestamp':'tiempo_total'})

In [87]:
features_df[['person', 'tiempo_total']].head()

Unnamed: 0,person,tiempo_total
0,4886f805,0 days 00:23:08
1,ad93850f,7 days 23:00:32
2,0297fc1e,138 days 12:53:38
3,2d681dd8,9 days 19:34:20
4,cccea85e,23 days 16:05:31


In [88]:
# Turn the timedelta column into a plain number. The displayed values are in nanoseconds
# (0 days 00:23:08 shows up as 1.388e+12) and the downcast stores them as float32.
features_df.tiempo_total = pd.to_numeric(features_df.tiempo_total, downcast='float')
features_df.tiempo_total.head()

0    1.388000e+12
1    6.876320e+14
2    1.196962e+16
3    8.480600e+14
4    2.045131e+15
Name: tiempo_total, dtype: float32

## Tiempo entre new y returning

In [90]:
df_news = events.loc[events.new_vs_returning == 'New']
df_news[['person', 'timestamp', 'new_vs_returning']].head()

Unnamed: 0,person,timestamp,new_vs_returning
2136660,f35dddc8,2018-04-11 14:46:36,New
2136661,c82ecb12,2018-04-23 14:56:42,New
2136662,102f402d,2018-05-14 15:56:35,New
2136668,1a6498ed,2018-05-15 18:13:22,New
2136669,f11aace6,2018-05-18 14:44:53,New


In [91]:
df_returnings = events.loc[events.new_vs_returning == 'Returning']
df_returnings[['person', 'timestamp', 'new_vs_returning']].head()

Unnamed: 0,person,timestamp,new_vs_returning
2136629,4640420b,2018-05-10 22:34:50,Returning
2136630,4640420b,2018-05-15 02:39:45,Returning
2136631,4640420b,2018-05-18 01:15:26,Returning
2136632,4640420b,2018-05-18 19:03:37,Returning
2136633,4640420b,2018-05-18 19:35:12,Returning


In [92]:
# Gap between each person's first 'New' event and first 'Returning' event.
first_returning = df_returnings.groupby('person').timestamp.min()
first_new = df_news.groupby('person').timestamp.min()
df = first_returning - first_new
# People missing one of the two kinds of event get NaT; replace it with a zero-length
# Timedelta. Filling a timedelta64 series with the integer 0 is deprecated in pandas and
# can leave a mixed object column that pd.to_numeric cannot convert.
df = df.fillna(pd.Timedelta(0))
df.head()

person
0008ed71   0 days 02:36:55
00091926   1 days 05:34:55
00091a7a   0 days 00:00:00
000ba417   0 days 01:45:31
000c79fe   0 days 00:00:00
Name: timestamp, dtype: timedelta64[ns]

In [93]:
features_df = features_df.join(df, on='person')

In [94]:
features_df = features_df.rename(columns={'timestamp':'tiempo_new_returning'})

In [95]:
features_df[['person', 'tiempo_new_returning']].head()

Unnamed: 0,person,tiempo_new_returning
0,4886f805,0 days 00:00:00
1,ad93850f,1 days 02:57:51
2,0297fc1e,3 days 00:00:24
3,2d681dd8,9 days 19:28:03
4,cccea85e,2 days 21:21:19


In [96]:
# Turn the timedelta column into a plain number. The displayed values are in nanoseconds
# (1 days 02:57:51 shows up as 9.7071e+13) and the downcast stores them as float32.
features_df.tiempo_new_returning = pd.to_numeric(features_df.tiempo_new_returning, downcast='float')
features_df.tiempo_new_returning.head()

0    0.000000e+00
1    9.707100e+13
2    2.592240e+14
3    8.476830e+14
4    2.496790e+14
Name: tiempo_new_returning, dtype: float32

## Agregar a los cosos de Mati

In [97]:
features_df.columns

Index(['person', 'cant_eventos_totales', 'cant_ach', 'cant_bl', 'cant_ch',
       'cant_cv', 'cant_gl', 'cant_seh', 'cant_s_p', 'cant_sp', 'cant_vp',
       'cant_vs', 'ach*bl', 'ach*gl', 'ach*seh', 'ach*sp', 'ach*vp', 'ach*vs',
       'Returning', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9',
       'q10', 'tiempo_total', 'tiempo_new_returning'],
      dtype='object')

In [98]:
del(df)

In [99]:
events_df = events_df.merge(features_df, on='person', how='left')

In [100]:
events_df.columns

Index(['person', 'url', 'sku', 'model', 'storage', 'color', 'skus',
       'search_term', 'city', 'region',
       ...
       'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'tiempo_total',
       'tiempo_new_returning'],
      dtype='object', length=157)

In [101]:
events_df.head()

Unnamed: 0,person,url,sku,model,storage,color,skus,search_term,city,region,...,q3,q4,q5,q6,q7,q8,q9,q10,tiempo_total,tiempo_new_returning
0,4886f805,,9288.0,Samsung Galaxy J7 Prime,32.0,Dourado,,,,,...,False,False,False,False,False,False,False,True,1388000000000.0,0.0
1,ad93850f,,304.0,iPhone 5s,32.0,Cinza espacial,,,,,...,False,False,False,False,False,False,True,True,687632000000000.0,97071000000000.0
2,0297fc1e,,6888.0,iPhone 6S,64.0,Prateado,,,,,...,True,True,True,True,True,True,True,True,1.196962e+16,259224000000000.0
3,2d681dd8,,11890.0,iPhone 7,128.0,Vermelho,,,,,...,False,False,False,False,False,False,False,True,848060000000000.0,847683000000000.0
4,cccea85e,,7517.0,LG G4 H818P,32.0,Branco,,,,,...,False,False,False,False,False,False,True,True,2045131000000000.0,249679000000000.0


***

In [None]:
# largest number of distinct models seen for a single person
# (author's note: at most 94 distinct models; the cell output is not saved)
events_df.groupby('person')['model'].nunique().max()

In [None]:
# largest number of distinct colors seen for a single person
# (author's note: at most 43; the original comment said "models" but this cell counts colors)
events_df.groupby('person')['color'].nunique().max()

In [None]:
# largest number of distinct search terms used by a single person
# (author's note: at most 100 distinct terms; the cell output is not saved)
events_df.groupby('person')['search_term'].nunique().max()

In [None]:
# largest number of distinct cities a single person browsed from
# (author's note: at most 19 distinct cities; the cell output is not saved)
events_df.groupby('person')['city'].nunique().max()

In [None]:
# largest number of distinct regions a single person browsed from
# (author's note: at most 7 distinct regions; the cell output is not saved)
events_df.groupby('person')['region'].nunique().max()

In [None]:
# largest number of distinct countries a single person browsed from
# (author's note: at most 3 distinct countries; the cell output is not saved)
events_df.groupby('person')['country'].nunique().max()

In [None]:
# largest number of distinct urls recorded for a single person
events_df.groupby('person')['url'].nunique().max()

***

In [None]:
events_df.shape

In [None]:
labels_df.shape

In [None]:
events_df.person.nunique()

In [None]:
labels_df.person.nunique()

## Data partition

In [102]:
# Build the frame of labelled records: the right join keeps every person present in
# labels_df and attaches their processed event rows.
train_df = events_df.merge(labels_df, on='person', how='right')

In [103]:
# Build the frame of records to predict: only rows whose person does not appear in labels_df.
# .copy() makes to_predict an independent frame rather than a slice of events_df.
to_predict = events_df[~events_df['person'].isin(labels_df.person)].copy()

In [104]:
train_df.shape

(1171886, 158)

In [105]:
to_predict.shape

(1169795, 157)

In [None]:
to_predict.person.nunique()

In [None]:
labels_df.person.nunique()

In [None]:
events_df.person.nunique()

In [106]:
# Number of distinct labels per person, collapsed to its unique values. A single value of 1
# means every person carries one label that is the same on all of that person's rows.
train_df.groupby('person')['label'].nunique().unique()

array([1])

***

In [None]:
to_predict.person.nunique()

In [None]:
labels_df.person.nunique()

In [107]:
# One row per person: every object (string) column becomes the space-joined
# concatenation of that person's values, with NaN treated as an empty string.
train_text_columns = list(train_df.select_dtypes('object').columns)
gr1 = (
    train_df[train_text_columns]
    .fillna("")
    .groupby('person', as_index=False)
    .agg(lambda values: ' '.join(values))
)

In [108]:
# One row per person with the largest sku value among that person's rows.
train_sku = train_df[['person', 'sku']]
gr2 = train_sku.groupby('person', as_index=False).max()

In [109]:
# One row per person with the maximum of every non-object column except sku
# (sku is aggregated separately in gr2).
l = [column for column in train_df.select_dtypes(exclude='object').columns if column != 'sku']
l = l + ['person']
gr3 = train_df[l].groupby('person', as_index=False).max()

In [110]:
# Combine the three per-person aggregates (text, sku, other numeric) on their shared columns.
train_numeric = gr2.merge(gr3)
train_df_final = gr1.merge(train_numeric)

In [111]:
# One row per person: every object (string) column becomes the space-joined
# concatenation of that person's values, with NaN treated as an empty string.
predict_text_columns = list(to_predict.select_dtypes('object').columns)
gr1 = (
    to_predict[predict_text_columns]
    .fillna("")
    .groupby('person', as_index=False)
    .agg(lambda values: ' '.join(values))
)

In [112]:
# One row per person with the largest sku value among that person's rows.
predict_sku = to_predict[['person', 'sku']]
gr2 = predict_sku.groupby('person', as_index=False).max()

In [113]:
# One row per person with the maximum of every non-object column except sku
# (sku is aggregated separately in gr2).
l = [column for column in to_predict.select_dtypes(exclude='object').columns if column != 'sku']
l = l + ['person']
gr3 = to_predict[l].groupby('person', as_index=False).max()

In [114]:
# Combine the three per-person aggregates (text, sku, other numeric) on their shared columns.
predict_numeric = gr2.merge(gr3)
to_predict_final = gr1.merge(predict_numeric)

In [115]:
to_predict_final.shape

(19415, 157)

In [None]:
train_df_final.shape

In [None]:
to_predict_final.shape

In [None]:
train_df_final.shape

# Final export
The cell below saves the processed train and prediction sets to `.csv` files (comment it out to skip the export).

In [116]:
# Write the per-person train and to-predict frames to CSV files in the current working
# directory; index=False leaves the positional index out of the files.
train_df_final.to_csv('train_df_processed_screenResol_storage_dates.csv', index=False)
to_predict_final.to_csv('to_predict_processed_screenResol_storage_dates.csv', index=False)

In [117]:
# Drop the references to the large frames now that the export is done.
del train_df_final, to_predict_final
del events, events_df, features_df