### Dealing with imports...

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

### Loading data...

In [2]:
# Read the raw event log and the training labels, in that order, with the same
# parser option for both files.
events_df, labels_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'data/events_up_to_01062018.csv',
        'data/labels_training_set.csv',
    )
)

In [3]:
# Labelled training frame: events joined to the label of their person.
# The right join keeps every row of labels_df, so a labelled person with no
# matching event still appears (with NaN in the event columns).
train_df = pd.merge(events_df, labels_df, how='right', on='person')

In [4]:
# Frame to predict on: only the events whose person has no entry in labels_df.
# .copy() gives an independent frame that later cells can modify in place.
to_predict = events_df.loc[~events_df['person'].isin(labels_df['person'])].copy()

In [5]:
# some date processing
def date_proc(df):
    """Add calendar features derived from ``df['timestamp']``, modifying ``df`` in place.

    The ``timestamp`` column is parsed with ``pd.to_datetime`` and overwritten
    with the parsed values. These columns are then added:

    * ``year``, ``month``, ``day``, ``hour``: the matching ``.dt`` fields.
    * ``weekday``: the English day name returned by ``.dt.day_name()``.
    * ``year_month_day``: the timestamp truncated to midnight.

    Args:
        df: DataFrame with a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns:
        None; ``df`` is mutated.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamps = df['timestamp'].dt
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['weekday'] = stamps.day_name()
    df['hour'] = stamps.hour
    # normalize() drops the time of day in one vectorised step. Formatting each
    # row as a "Y/M/D" string and re-parsing it is much slower and turns a
    # missing timestamp into the text "nan/nan/nan"; here it simply stays NaT.
    df['year_month_day'] = stamps.normalize()
    
# Add the calendar columns to both frames (training frame first).
for _frame in (train_df, to_predict):
    date_proc(_frame)

Primero hacemos un label encoding con el weekday y luego aplicamos una transformación que contempla la naturaleza cíclica de la semana. Esto último lo aplicaremos también al resto de los features cíclicos (como se explica, por ejemplo, acá: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/).

In [6]:
# Encode weekday names as integers in calendar order (Monday=0 ... Sunday=6).
# A LabelEncoder numbers the labels in sorted (alphabetical) order
# (Friday=0, Monday=1, Saturday=2, ...), and the sine/cosine transform applied
# to this column afterwards only makes sense when consecutive days get
# consecutive codes.
# Values missing from the table (including NaN) become NaN instead of raising.
WEEKDAY_CODES = {
    day_name: code
    for code, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

train_df.weekday = train_df.weekday.map(WEEKDAY_CODES)
to_predict.weekday = to_predict.weekday.map(WEEKDAY_CODES)

In [7]:
def _add_cyclic_encoding(df, column, period):
    """Replace ``df[column]`` with its sine/cosine pair on a circle of ``period`` steps.

    Adds ``<column>_sin`` and ``<column>_cos`` computed from the angle
    ``2 * pi * value / period`` and then drops ``column``. ``df`` is modified
    in place and nothing is returned.
    """
    # Vectorised over the whole column instead of one Python call per row.
    angle = 2 * np.pi * df[column] / period
    df[column + '_sin'] = np.sin(angle)
    df[column + '_cos'] = np.cos(angle)
    df.drop(column, axis=1, inplace=True)


def month_to_cyclic(df):
    """Replace ``month`` with ``month_sin`` / ``month_cos`` (period 12), in place."""
    _add_cyclic_encoding(df, 'month', 12)


def day_to_cyclic(df):
    """Replace ``day`` with ``day_sin`` / ``day_cos`` (period 31), in place."""
    _add_cyclic_encoding(df, 'day', 31)


def weekday_to_cyclic(df):
    """Replace ``weekday`` with ``weekday_sin`` / ``weekday_cos`` (period 7), in place.

    ``weekday`` must already be numeric when this is called.
    """
    _add_cyclic_encoding(df, 'weekday', 7)


def hour_to_cyclic(df):
    """Replace ``hour`` with ``hour_sin`` / ``hour_cos`` (period 24), in place."""
    _add_cyclic_encoding(df, 'hour', 24)

In [8]:
# cell to compare results before & after processing
# train_df[['year','month','day','weekday','hour']].head()

In [9]:
# Apply the four cyclic encoders to each frame: the whole training frame
# first, then the frame to predict, in month/day/weekday/hour order.
for _frame in (train_df, to_predict):
    for _encode in (month_to_cyclic, day_to_cyclic, weekday_to_cyclic, hour_to_cyclic):
        _encode(_frame)

In [10]:
# train_df[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

In [11]:
# train_df.drop('year_month_day', axis=1, inplace=True)
# to_predict.drop('year_month_day', axis=1, inplace=True)

## screen_resolution

In [12]:
def _screen_dimension(x, position):
    """Return the integer at ``position`` of a ``'<width>x<height>'`` string.

    Missing input (NaN/None) and the empty string both yield 0.
    """
    if pd.isna(x) or x == "":
        return 0
    return int(x.split("x")[position])


def get_screen_width(x):
    """Return the width from a ``'<width>x<height>'`` string, or 0 if missing/empty."""
    return _screen_dimension(x, 0)


def get_screen_height(x):
    """Return the height from a ``'<width>x<height>'`` string, or 0 if missing/empty."""
    return _screen_dimension(x, 1)


def process_screen_res(df):
    """Split ``screen_resolution`` into integer ``screen_width`` / ``screen_height``.

    Missing resolutions become 0 in both new columns. ``screen_resolution`` is
    dropped afterwards. ``df`` is modified in place and nothing is returned.
    """
    # Use the Series returned by fillna() rather than calling it with
    # inplace=True on df['screen_resolution']: that chained in-place call works
    # on a temporary object and is not guaranteed to update df.
    resolution = df['screen_resolution'].fillna("")
    df['screen_width'] = resolution.apply(get_screen_width)
    df['screen_height'] = resolution.apply(get_screen_height)
    df.drop('screen_resolution', axis=1, inplace=True)

In [13]:
# Replace screen_resolution by screen_width / screen_height in both frames.
for _frame in (train_df, to_predict):
    process_screen_res(_frame)

## storage

In [14]:
def process_storage_string(x):
    """Convert a storage label such as ``'32GB'`` or ``'512MB'`` into gigabytes.

    Missing values (anything ``pd.isna`` accepts) return 0. A string that
    splits into exactly two parts on ``"GB"`` returns the integer before the
    unit. Every other string is treated as megabytes: the text before ``"MB"``
    is parsed as an integer and divided by 1024.
    """
    if pd.isna(x):
        return 0
    gb_parts = x.split("GB")
    if len(gb_parts) != 2:
        # Not of the form '<n>GB': read the value as megabytes.
        megabytes = int(x.partition("MB")[0])
        return megabytes / 1024
    return int(gb_parts[0])

def storage_process(df):
    """Overwrite ``df.storage`` with ``process_storage_string`` applied to each value.

    ``df`` is modified in place; nothing is returned.
    """
    in_gigabytes = df.storage.apply(process_storage_string)
    df.storage = in_gigabytes

In [15]:
# Convert the storage column of both frames.
for _frame in (train_df, to_predict):
    storage_process(_frame)

## Browser

In [16]:
# Distinct raw browser strings present in the training frame
train_df['browser_version'].unique()

array([nan, 'Chrome Mobile 66.0', 'Chrome 66.0', 'Chrome Mobile 65.0',
       'Chrome 65.0', 'Chrome Mobile 63.0', 'Chrome 67.0', 'Safari 11.1',
       'Chrome 64.0', 'Chrome Mobile 44.0', 'Opera 52.0', 'Opera 51.0',
       'Opera 53.0', 'Mobile Safari 10', 'IE Mobile 11',
       'Samsung Internet 6.4', 'Chrome Mobile 64.0',
       'Samsung Internet 3.3', 'Chrome 63.0', 'Chrome Mobile 40.0',
       'Chrome 22.0', 'Chrome Mobile iOS 47.0', 'Chrome Mobile 45.0',
       'Chrome Mobile 60.0', 'Chrome Mobile 58.0', 'Facebook 91',
       'Facebook 94', 'Chrome Mobile 39', 'Android 4.0', 'Edge 14.14393',
       'Edge 17.17134', 'Facebook 165', 'Facebook 173',
       'Chrome Mobile 56.0', 'Chrome Mobile iOS 59.0',
       'Samsung Internet 2', 'Mobile Safari 11', 'Android 4.1',
       'Chrome 44.0', 'Chrome Mobile 55.0', 'Chrome 49.0',
       'Chrome Mobile 43.0', 'Firefox 60', 'Chrome 58.0', 'Edge 16.16299',
       'Facebook 171', 'Facebook 172', 'Chrome Mobile 53.0',
       'Edge 12.10240', '

In [17]:
# Distinct values left when only the first word of each browser string is kept
# (non-string entries such as NaN are skipped)
new_list = {
    browser_label.split()[0]
    for browser_label in train_df.browser_version.unique()
    if isinstance(browser_label, str)
}

new_list

{'Android',
 'BlackBerry',
 'Chrome',
 'Chromium',
 'Edge',
 'Facebook',
 'Firefox',
 'IE',
 'K-Meleon',
 'Maxthon',
 'Mobile',
 'Opera',
 'Other',
 'Pinterest',
 'Puffin',
 'Safari',
 'Samsung',
 'UC',
 'Vivaldi',
 'WebKit',
 'Yandex'}

Que seran...?
* Android (no es un browser en si)
* K-Meleon
* Maxthon
* Mobile
* Pinterest
* Puffin
* Samsung
* UC
* WebKit
* Yandex

Se confirma que son browsers los siguientes:
* Android (no es un browser en si)
* K-Meleon (http://kmeleonbrowser.org/)
* Maxthon (http://www.maxthon.com/)
* Mobile
* Pinterest
* Puffin (Puffin Browser is a web browser released by CloudMosa for mobile operating systems Android, iOS, Windows and Android TV.)
* Samsung (Samsung Internet for Android is a mobile web browser for smartphones and tablets developed by Samsung. It is based on the open-source Chromium project. It is pre-installed on Samsung Galaxy devices.)
* UC (UC Browser is a web browser developed by the Chinese mobile Internet company UCWeb, which is in turn owned by the Alibaba Group. As of August 2018 it is the third most popular mobile browser in the world by market share, after Google Chrome and Safari.)
* WebKit (https://webkit.org/)
* Yandex: un browser (https://browser.yandex.com/)

Nos queda revisar los siguientes:
* Android (no es un browser en si)
* Mobile
* Pinterest

In [18]:
# Print every raw browser string whose first word is one of the prefixes that
# still need checking.
for browser_label in train_df.browser_version.unique():
    if not isinstance(browser_label, str):
        continue
    if browser_label.lower().split()[0] in ('android', 'mobile', 'pinterest'):
        print(browser_label)

Mobile Safari 10
Android 4.0
Mobile Safari 11
Android 4.1
Mobile Safari 7
Android 5.1
Mobile Safari 9
Mobile Safari 8
Android 4.4
Android 4.3
Mobile Safari 11.2
Mobile Safari UI/WKWebView 9.3
Mobile Safari 9.3
Android 2.3
Android 4.2
Mobile Safari 11.1
Mobile Safari UI/WKWebView 10.3
Mobile Safari UI/WKWebView 10.2
Mobile Safari 10.3
Mobile Safari 10.1
Mobile Safari 4.0
Pinterest
Mobile Safari 7.1
Mobile Safari 11.0
Mobile Safari 7.0
Android 3.2
Mobile Safari 5.1
Mobile Safari UI/WKWebView 11.3
Mobile Safari 6


***
O sea que todos los _Mobile_ son `Safari`, todos los _Android_ son meramente eso, Android, y _Pinterest_ aparentemente es un browser también (?).

Conclusión: puede quedarse solamente el primer string del campo para representar el browser, ya que a nadie le interesa la versión del mismo.
***

In [19]:
def proc_browser(x):
    """Return the lower-cased first word of a browser string.

    For example ``'Chrome Mobile 66.0'`` becomes ``'chrome'``. Anything that
    is not a ``str`` (such as NaN) yields ``None``.
    """
    if not isinstance(x, str):
        return None
    words = x.lower().split()
    return words[0]

In [20]:
# Reduce browser_version to its first word in both frames; non-string values
# (NaN) are passed through unchanged.
for _frame in (train_df, to_predict):
    _frame.browser_version = _frame.browser_version.apply(
        lambda value: proc_browser(value) if isinstance(value, str) else value
    )

## operating_system_version
Se procede analogamente a browser_version

In [21]:
# Distinct raw operating-system strings present in the training frame
train_df['operating_system_version'].unique()

array([nan, 'Android 5.1.1', 'Android 5.1', 'Windows 10 ', 'Android 7',
       'Windows 7 ', 'Android 4.1.2', 'Mac OS X 10.13.4', 'Android 6',
       'Android 5.0.2', 'iOS 10.3.3', 'Windows Phone 8.1',
       'Android 4.4.4', 'Android 6.0.1', 'Android 7.1.1', 'Android 4.4.2',
       'Linux ', 'iOS 7.1.2', 'Windows 8.1 ', 'Android 4.0.4',
       'Android 5', 'Android 4.2.2', 'Mac OS X 10.11.6', 'iOS 11.3',
       'iOS 11.2.6', 'Windows XP ', 'Ubuntu ', 'Android 4.3',
       'Windows 8 ', 'Mac OS X 10.13.3', 'Android 8', 'iOS 10.2.1',
       'iOS 11.2.1', 'Android 4.1.1', 'iOS 10.3.2', 'Windows Phone 10',
       'iOS 9.3.5', 'iOS 8.1.3', 'iOS 10.2', 'iOS 11.1.1', 'iOS 11.2.2',
       'iOS 11.2.5', 'Other ', 'Android 7.1.2', 'Android 5.0.1',
       'iOS 11.4', 'iOS 11.0.2', 'iOS 8.1.1', 'iOS 11.1',
       'Chrome OS 10452.74', 'Chrome OS 10452.96', 'Chrome OS 10323.67',
       'Mac OS X 10.13', 'iOS 11.1.2', 'Android 8.1', 'Mac OS X 10.7.5',
       'Windows Vista ', 'Mac OS X 10.12.5', 'M

In [22]:
# Distinct values left when only the first word of each operating-system
# string is kept (non-string entries such as NaN are skipped)
new_list = {
    os_label.split()[0]
    for os_label in train_df.operating_system_version.unique()
    if isinstance(os_label, str)
}

new_list

{'Android',
 'BlackBerry',
 'Chrome',
 'FreeBSD',
 'Linux',
 'Mac',
 'Other',
 'Tizen',
 'Ubuntu',
 'Windows',
 'iOS'}

In [23]:
def proc_os(x):
    """Return the lower-cased first word of an operating-system string.

    For example ``'Mac OS X 10.13.4'`` becomes ``'mac'``. Anything that is not
    a ``str`` (such as NaN) yields ``None``.
    """
    return x.lower().split(maxsplit=1)[0] if isinstance(x, str) else None

In [24]:
# Reduce operating_system_version to its first word in both frames; non-string
# values (NaN) are passed through unchanged.
for _frame in (train_df, to_predict):
    _frame.operating_system_version = _frame.operating_system_version.apply(
        lambda value: proc_os(value) if isinstance(value, str) else value
    )

## device_type
Se procede analogamente a browser_version

In [28]:
# Distinct device types present in the training frame
train_df['device_type'].unique()

array([nan, 'Smartphone', 'Computer', 'Tablet', 'Unknown'], dtype=object)

Como son campos de una sola palabra, no tiene sentido hacer lo que se hizo con los anteriores casos.

# Final export
The cell below saves both processed frames to `.csv` files (comment it out to skip the export).

In [55]:
# Write the processed frames to disk without the index column (training frame first).
for _frame, _csv_path in (
    (train_df, 'data/train_df_processed_screenResol_storage_dates.csv'),
    (to_predict, 'data/to_predict_processed_screenResol_storage_dates.csv'),
):
    _frame.to_csv(_csv_path, index=False)