### Dealing with imports...

In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

### Loading data...

In [2]:
# Read the two input CSV files from the local data folder.
# low_memory=False makes pandas read each file in one pass instead of in chunks.
events_df = pd.read_csv(
    filepath_or_buffer='data/events_up_to_01062018.csv',
    low_memory=False,
)
labels_df = pd.read_csv(
    filepath_or_buffer='data/labels_training_set.csv',
    low_memory=False,
)

In [3]:
# Build the labelled frame: the right join keeps every row of labels_df and
# attaches the events whose 'person' matches it.
train_df = pd.merge(events_df, labels_df, how='right', on='person')

In [4]:
# Keep only the events whose 'person' does not appear in labels_df, i.e. the
# rows that still have to be predicted. The copy makes the result independent
# of events_df.
to_predict = events_df.loc[
    ~events_df['person'].isin(labels_df['person'])
].copy()

In [5]:
# some date processing
def date_proc(df):
    """Parse ``timestamp`` and add calendar feature columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    None
        ``df`` is modified in place. ``timestamp`` is converted to datetime and
        the columns ``year``, ``month``, ``day``, ``weekday`` (day name),
        ``hour`` and ``year_month_day`` (the timestamp truncated to midnight)
        are added.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamp = df['timestamp'].dt

    df['year'] = stamp.year
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['weekday'] = stamp.day_name()
    df['hour'] = stamp.hour
    # Truncate to the calendar day directly instead of formatting a
    # "year/month/day" string per row and parsing it back: this is vectorised
    # and keeps missing timestamps as NaT.
    # NOTE(review): assumes timestamps are tz-naive; for tz-aware data the
    # result keeps the time zone, which the old string round trip dropped.
    df['year_month_day'] = stamp.normalize()
    
# Add the date-derived columns to both frames (each is modified in place).
for frame in (train_df, to_predict):
    date_proc(frame)

Primero hacemos un label encoding con el weekday y luego aplicamos una transformación que contemple la naturaleza cíclica de la semana. Esto último lo aplicaremos también al resto de los features cíclicos (como se explica, por ejemplo, acá: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [8]:
# Encode the weekday names as integers in calendar order (Monday=0 ... Sunday=6).
# A LabelEncoder numbers its classes in sorted (alphabetical) order, which gives
# Friday=0, Monday=1, Saturday=2, ...; the sin/cos weekday transform applied
# later needs consecutive days to receive consecutive codes, so the mapping is
# spelled out explicitly instead.
weekday_codes = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6,
}

# Values that are not one of the seven day names (e.g. missing) become NaN.
train_df['weekday'] = train_df['weekday'].map(weekday_codes)
to_predict['weekday'] = to_predict['weekday'].map(weekday_codes)

In [9]:
def _add_cyclic_features(df, column, period):
    """Replace ``column`` with ``<column>_sin`` / ``<column>_cos`` in place.

    The value ``x`` is mapped to ``sin(2*pi*x/period)`` and
    ``cos(2*pi*x/period)``, so values one period apart get the same encoding.
    """
    # Vectorised over the whole column instead of a per-row Python lambda.
    angle = 2 * np.pi * df[column] / period
    df[column + '_sin'] = np.sin(angle)
    df[column + '_cos'] = np.cos(angle)
    df.drop(column, axis=1, inplace=True)


def month_to_cyclic(df):
    """Replace ``month`` with ``month_sin`` / ``month_cos`` (period 12), in place."""
    _add_cyclic_features(df, 'month', 12)


def day_to_cyclic(df):
    """Replace ``day`` with ``day_sin`` / ``day_cos`` (period 31), in place."""
    _add_cyclic_features(df, 'day', 31)


def weekday_to_cyclic(df):
    """Replace ``weekday`` with ``weekday_sin`` / ``weekday_cos`` (period 7), in place.

    The ``weekday`` column must already be numeric.
    """
    _add_cyclic_features(df, 'weekday', 7)


def hour_to_cyclic(df):
    """Replace ``hour`` with ``hour_sin`` / ``hour_cos`` (period 24), in place."""
    _add_cyclic_features(df, 'hour', 24)

In [10]:
# cell to compare results before & after processing
# train_df[['year','month','day','weekday','hour']].head()

In [11]:
# Apply the four cyclic encodings to each frame, in the same order as before:
# all of train_df first, then all of to_predict.
for frame in (train_df, to_predict):
    for encode in (month_to_cyclic, day_to_cyclic, weekday_to_cyclic, hour_to_cyclic):
        encode(frame)

In [12]:
# train_df[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

In [13]:
# train_df.drop('year_month_day', axis=1, inplace=True)
# to_predict.drop('year_month_day', axis=1, inplace=True)

## screen_resolution

In [14]:
def _screen_dimension(x, position):
    """Return part ``position`` of a ``'<width>x<height>'`` string as an int.

    Missing values (None/NaN) and the empty string yield 0.
    """
    if pd.isna(x) or x == "":
        return 0
    return int(x.split("x")[position])


def get_screen_width(x):
    """Return the width from a ``'<width>x<height>'`` string, or 0 if missing/empty."""
    return _screen_dimension(x, 0)


def get_screen_height(x):
    """Return the height from a ``'<width>x<height>'`` string, or 0 if missing/empty."""
    return _screen_dimension(x, 1)


def process_screen_res(df):
    """Replace ``screen_resolution`` with ``screen_width`` / ``screen_height`` in place.

    Rows with a missing or empty resolution get 0 for both new columns.
    Returns None; ``df`` is modified in place.
    """
    # Fill into a local Series rather than calling fillna(inplace=True) on the
    # selected column: that chained in-place call does not reach the frame
    # when pandas Copy-on-Write is active, leaving NaN values to be parsed.
    resolution = df['screen_resolution'].fillna("")
    df['screen_width'] = resolution.apply(get_screen_width)
    df['screen_height'] = resolution.apply(get_screen_height)
    df.drop('screen_resolution', axis=1, inplace=True)

In [15]:
# Split the screen resolution into width/height columns on both frames.
for frame in (train_df, to_predict):
    process_screen_res(frame)

## storage

In [16]:
def process_storage_string(x):
    """Convert a storage label such as ``'32GB'`` or ``'512MB'`` to gigabytes.

    Parameters
    ----------
    x : str, number or missing value
        ``'<n>GB'`` and ``'<n>MB'`` strings are parsed. Missing values
        (None/NaN) map to 0. Any other non-string value is returned unchanged,
        so an already converted column can be processed again.

    Returns
    -------
    int or float
        ``n`` for ``'<n>GB'``, ``n / 1024`` for ``'<n>MB'``.

    Raises
    ------
    ValueError
        If a string is in neither of the two formats.
    """
    if pd.isna(x):
        return 0
    if not isinstance(x, str):
        # Already numeric (e.g. the cell was run a second time): keep as is.
        return x
    parts = x.split("GB")
    if len(parts) == 2:
        # case data in GB
        return int(parts[0])
    # case data in MB
    return int(x.split("MB")[0]) / 1024


def storage_process(df):
    """Convert the ``storage`` column of ``df`` to gigabytes, in place."""
    df['storage'] = df['storage'].apply(process_storage_string)

In [17]:
# Convert the storage column to a numeric size on both frames.
for frame in (train_df, to_predict):
    storage_process(frame)

In [18]:
# Write both processed frames to CSV, without the index column.
for frame, csv_path in (
    (train_df, 'data/train_df_processed_screenResol_storage_dates.csv'),
    (to_predict, 'data/to_predict_processed_screenResol_storage_dates.csv'),
):
    frame.to_csv(csv_path, index=False)