In [38]:
import numpy as np 
import pandas as pd
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import FeatureHasher

In [2]:
data = pd.read_csv("avazu_train_subset.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021449 entries, 0 to 2021448
Data columns (total 24 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   Unnamed: 0        int64 
 1   hour              int64 
 2   C1                int64 
 3   banner_pos        int64 
 4   site_id           object
 5   site_domain       object
 6   site_category     object
 7   app_id            object
 8   app_domain        object
 9   app_category      object
 10  device_id         object
 11  device_ip         object
 12  device_model      object
 13  device_type       int64 
 14  device_conn_type  int64 
 15  C14               int64 
 16  C15               int64 
 17  C16               int64 
 18  C17               int64 
 19  C18               int64 
 20  C19               int64 
 21  C20               int64 
 22  C21               int64 
 23  click             int64 
dtypes: int64(15), object(9)
memory usage: 370.1+ MB


In [4]:
types = {'Unnamed: 0': np.uint32, 'click': np.uint8, 'hour': np.uint32, 'C1': np.uint32,
         'banner_pos': np.uint32, 'site_id': 'category', 'site_domain': 'category', 
         'site_category': 'category', 'app_id': 'category', 'app_domain': 'category',
         'app_category': 'category', 'device_id': 'category', 'device_ip': 'category',
         'device_model': 'category', 'device_type': np.uint8, 'device_conn_type': np.uint8,
         'C14': np.uint16, 'C15': np.uint16, 'C16': np.uint16, 'C17': np.uint16,
         'C18': np.uint16, 'C19': np.uint16, 'C20': np.uint16, 'C21': np.uint16}

In [5]:
# For reduced memory usage
data = pd.read_csv("avazu_train_subset.csv", usecols=types.keys(), dtype=types)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021449 entries, 0 to 2021448
Data columns (total 24 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Unnamed: 0        uint32  
 1   hour              uint32  
 2   C1                uint32  
 3   banner_pos        uint32  
 4   site_id           category
 5   site_domain       category
 6   site_category     category
 7   app_id            category
 8   app_domain        category
 9   app_category      category
 10  device_id         category
 11  device_ip         category
 12  device_model      category
 13  device_type       uint8   
 14  device_conn_type  uint8   
 15  C14               uint16  
 16  C15               uint16  
 17  C16               uint16  
 18  C17               uint16  
 19  C18               uint16  
 20  C19               uint16  
 21  C20               uint16  
 22  C21               uint16  
 23  click             uint8   
dtypes: category(9), uint16(8), uint32(4), uint8(3)
mem

In [7]:
data.drop(columns =['Unnamed: 0'], inplace =True)

In [14]:
train, test = train_test_split(data, train_size=0.8)

##### Freq Encoding

In [15]:
freq_counts = train.groupby(['site_category']).size()

train["site_category_freq"] = train["site_category"].map(freq_counts)


#### Mean encoding

Using the mean of the response variable per category to encode that response variable

In [26]:
train['C1'].value_counts()

1005    1485620
1002      88654
1010      36207
1012       4633
1007       1426
1001        392
1008        227
Name: C1, dtype: int64

In [29]:
train['device_type'].value_counts()

1    1492296
0      88654
4      31003
5       5204
2          2
Name: device_type, dtype: int64

In [32]:
m = train["C1"].groupby(train["device_type"]).mean()

train["c1_mean_enc"] = train["device_type"].map(m)

##### Hash encoding and label encoding

Label encoding: categorical values are replaced by numeric integers

In [20]:
def label_encoding_cols(train, test, col):
    train[col] = train[col].astype('category').cat.as_ordered()
    encoder = train[col].cat.categories
    train[col] = train[col].cat.codes + 1
    test[col] = pd.Categorical(test[col], categories=encoder, ordered=True)
    test[col] = test[col].cat.codes + 1

In [21]:
def label_encoding_with_UNK(train, val, col, UNK=True):
    """ Label encoding handling "UNK" values too
    """
    le = LabelEncoder()
    uniq = np.unique(train[col])
    if UNK:
        uniq = np.concatenate((np.array(["UNK"]),uniq))
    le.fit(uniq)
    train[col] = le.transform(train[col])
    val_col = [x if x in le.classes_ else 'UNK' for x in val[col]]
    val[col] = le.transform(val_col)

In [34]:
train.app_domain.unique()

['b9528b13', 'b5f3b24a', '7801e8d9', '5c620f04', '5c5a694b', ..., 'a5f9dc5b', '9299777a', '7a0640b2', 'fd0f197b', '52c29fe1']
Length: 247
Categories (265, object): ['0654b444', '0b7d3d7d', '0d79ee56', '0e8616ad', ..., 'dcb74110', 'a5f9dc5b', '11c6546c', '6bfb9168']

In [36]:
le = label_encoding_with_UNK(train, test, 'app_domain')

Hashing: Category is mapped into a vector of d dimensions

In [39]:
def hashing_trick(col, n_features=3):
    name = col.name
    col_names = [name + "_" + str(i+1) for i in range(n_features)]
    h = FeatureHasher(input_type='string', n_features=n_features)
    out = h.transform(col).toarray()
    return pd.DataFrame(out, columns=col_names)

In [41]:
train.device_model.unique()

['3657b706', '9e3836ff', '698a4073', '6360f9ec', '7ac6007f', ..., '7a797e9f', 'b6752cb9', 'be8891d0', '34a16602', '4c26e918']
Length: 5606
Categories (5804, object): ['00097428', '0009f4d7', '008ac803', '00b08597', ..., '9a44e584', '9a7e2cd3', 'd9117b40', 'e298e9e8']

In [42]:
hashing_trick(train['device_model'][:7], n_features=10)

Unnamed: 0,device_model_1,device_model_2,device_model_3,device_model_4,device_model_5,device_model_6,device_model_7,device_model_8,device_model_9,device_model_10
0,4.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-1.0
1,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3.0,1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,-1.0
3,3.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0
4,2.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,2.0,-2.0
5,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
6,3.0,1.0,0.0,-1.0,0.0,0.0,0.0,-2.0,0.0,1.0
