In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import random

from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA

### Loading the Data

In [2]:
# Column dtypes handed to pd.read_csv below. Identifier-like and categorical
# columns are read as strings, `click` as a boolean, and the remaining columns
# as unsigned integers.
# The builtin `str` / `bool` are used instead of the `np.str` / `np.bool`
# aliases: those aliases were deprecated in NumPy 1.20 and removed in 1.24,
# and on older NumPy they are the very same objects as the builtins.
dtype = {
    'id': str, 'click': bool, 'hour': str,
    'C1': np.uint16, 'banner_pos': np.uint16,
    'site_id': str, 'site_domain': str, 'site_category': str,
    'app_id': str, 'app_domain': str, 'app_category': str,
    'device_id': str, 'device_ip': str, 'device_model': str, 'device_type': np.uint16,
    'device_conn_type': np.uint16,
    'C14': np.uint16, 'C15': np.uint16, 'C16': np.uint16, 'C17': np.uint16,
    'C18': np.uint16, 'C19': np.uint16, 'C20': np.uint32, 'C21': np.uint16
}

In [3]:
%%time

random.seed(10)  # fixed seed so the same rows are drawn on every run

row_count = 40428968  # row count taken without opening the file, wc -l train.csv
row_limit = 250000  # limit the number of rows
skip_first = 30000000

# Selecting the rows to be skipped:
#   * every line before `skip_first` except line 0 (presumably the CSV header;
#     the dtype mapping passed to read_csv is keyed by column name), and
#   * a random draw from the remaining lines, sized so that exactly
#     `row_limit` of them are left unskipped.
# `range` replaces the Python 2-only `xrange`, and starting the first range at
# 1 avoids building the list with 0 in it and then paying for an O(n)
# `skip.remove(0)` on a list with tens of millions of entries.
skip = list(range(1, skip_first))
skip += sorted(random.sample(range(skip_first, row_count),
                             row_count - (skip_first + row_limit)))

df = pd.read_csv('data/train.csv',
                 delimiter=',',
                 skiprows=skip,
                 dtype=dtype)


Wall time: 1min 29s


### Handling the Class Imbalance Problem through downsampling

In [5]:
# Balance the two classes by down-sampling: keep every clicked row and draw an
# equally sized random subset of the non-clicked rows (this assumes there are
# at least as many non-clicked rows as clicked ones, otherwise sample() raises).
# `random_state` is pinned because DataFrame.sample does not draw from the
# stdlib `random` generator seeded earlier in the notebook; without it the
# balanced sample would be different on every run.
clicked = df[df.click == True]
not_clicked = df[df.click == False].sample(n=clicked.shape[0], random_state=10)
df = pd.concat([clicked, not_clicked])
df = df.sort_values('hour', ascending=True)

In [8]:
# Parenthesised call: prints the same text on Python 2 and is valid on Python 3,
# where the bare `print "..."` statement is a SyntaxError.
print("New sample size: " + str(df.shape[0]))

New sample size: 79482


In [9]:
# Saving the labels to disk: `df` is re-bound to other objects further down
# to save memory, so the click column is persisted here first.
click_labels = df['click'].values
np.save('data/labels', click_labels)

### Understanding the importance of the hidden columns by feature ranking through RFE and ETC

In [10]:
# Names of the anonymised "C" columns whose importance is ranked below.
c_cols = 'C1 C14 C15 C16 C17 C18 C19 C20 C21'.split()

### Recursive Feature Elimination
- Recursively removes the lowest-ranked features and rebuilds the model using the remaining features

In [11]:
# Rank the C* columns with Recursive Feature Elimination wrapped around a
# logistic-regression model, asking RFE to keep 3 features.
model = LogisticRegression()
# `n_features_to_select` is passed by keyword: newer scikit-learn releases
# no longer accept it positionally.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(df[c_cols], df['click'])
# Format first, then print. `print("...") % x` only works as a Python 2 print
# statement; on Python 3 it applies `%` to print()'s None return value and
# raises TypeError. `.tolist()` yields plain Python ints for a stable repr.
print("Feature Ranking: %s" % list(zip(c_cols, fit.ranking_.tolist())))

Feature Ranking: [('C1', 2), ('C14', 1), ('C15', 5), ('C16', 6), ('C17', 1), ('C18', 7), ('C19', 3), ('C20', 1), ('C21', 4)]


### Extra Trees Classifier
- Extremely randomized trees (a lighter-weight relative of a random forest), used here to obtain impurity-based feature importances as a proxy for Information Gain

In [12]:
# Fit an Extra Trees classifier on the C* columns and report its per-feature
# importances. `random_state` is pinned because the trees are built with
# randomness, so the importances would otherwise differ from run to run.
model = ExtraTreesClassifier(random_state=10)
model.fit(df[c_cols], df['click'])
# Format first, then print: `print("...") % x` raises TypeError on Python 3.
# `.tolist()` yields plain Python floats for a stable repr.
print("Feature Importance: %s" % list(zip(c_cols, model.feature_importances_.tolist())))

Feature Importance: [('C1', 0.037572626869539764), ('C14', 0.21028507596634261), ('C15', 0.01448254700272102), ('C16', 0.062991921662398725), ('C17', 0.098639002812250715), ('C18', 0.19876921205504505), ('C19', 0.10077944509939389), ('C20', 0.16427790000997006), ('C21', 0.11220226852233814)]


### Selecting the Column Subset

In [15]:
# Columns carried forward into the one-hot encoding step. The last three are
# a subset of the C* columns examined above.
selected_columns = [
    'banner_pos',
    'site_category', 'app_category',
    'site_domain', 'app_domain',
    'device_model', 'device_type', 'device_conn_type',
    'C14', 'C19', 'C20',
]

### One Hot Encoding of the categorical features

In [16]:
def one_hot_encoding(df, features_columns):
    """One-hot encode the given columns and return the result as a NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    features_columns : list of str
        Names of the columns to replace with indicator (dummy) columns.

    Returns
    -------
    numpy.ndarray
        The `.values` of the encoded frame: one row per input row, one column
        per indicator column plus any column that was not encoded.
    """
    encoded = pd.get_dummies(df, columns=features_columns, sparse=True)
    # NOTE(review): the frame is built with sparse columns, but `.values`
    # hands back a regular NumPy array, so any memory saving from
    # `sparse=True` presumably applies only to the intermediate frame - confirm
    # this is intended.
    return encoded.values

In [17]:
%%time
# Encode only the selected columns. `df` is deliberately re-bound to the
# returned array, so the original DataFrame is no longer reachable by this name.
df = one_hot_encoding(df=df.loc[:, selected_columns],
                      features_columns=selected_columns)

Wall time: 38 s


In [None]:
# Parenthesised call: prints the same text on Python 2 and is valid on Python 3,
# where the bare `print "..."` statement is a SyntaxError.
print("Dimensions: " + str(df.shape))

Dimensions: (79482L, 5308L)


### Principal Component Analysis to reduce the dimensions

In [None]:
# Fit a 300-component PCA on the transposed feature matrix and persist the
# fitted components. `random_state` is pinned so that a solver with a random
# element produces the same components on every run.
# NOTE(review): fitting on `df.T` and saving `components_` gives an array with
# one column per row of `df`, rather than the usual `fit_transform(df)` scores
# - confirm this is what the consumer of 'data/features_pca' expects.
pca = PCA(n_components=300, random_state=10)
# fit() hands back the estimator itself, so after this line `df` names the
# fitted PCA object (the binding is kept as in the original cell).
df = pca.fit(df.T)
np.save('data/features_pca', pca.components_)  # saving features to the disk