In online advertising, click-through rate (CTR) is a very important metric for evaluating ad performance. As a result, click prediction systems are essential and widely used for sponsored search and real-time bidding.

For this competition, we have provided 11 days' worth of Avazu data to build and test prediction models. Can you find a strategy that beats standard classification algorithms? The winning models from this competition will be released under an open-source license.

Avazu: https://www.kaggle.com/c/avazu-ctr-prediction

File descriptions

    train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies.
    test - Test set. 1 day of ads for testing your model predictions. 
    sampleSubmission.csv - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.

Data fields

    id: ad identifier
    click: 0/1 for non-click/click
    hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
    C1 -- anonymized categorical variable
    banner_pos
    site_id
    site_domain
    site_category
    app_id
    app_domain
    app_category
    device_id
    device_ip
    device_model
    device_type
    device_conn_type
    C14-C21 -- anonymized categorical variables


In [1]:
from IPython.display import HTML
from IPython.display import Image

import pandas as pd
import numpy as np
#import xlearn as xl
#logistic regression l1 regularization
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model


In [2]:
location='./data/'

# Number of leading rows of train.csv to read.
N_ROWS = 500000


def _hex_to_int(value):
    """Parse a base-16 string into a Python int."""
    return int(value, 16)


# Columns that are parsed as base-16 integers while the CSV is read.
HEX_COLUMNS = [
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_model",
    "device_type",
    "device_ip",
]
converters = {column: _hex_to_int for column in HEX_COLUMNS}

# Import only the first N_ROWS rows.
# NOTE(review): `id` has no converter and is inferred as float64 (see the
# dtype listings further down), so very long ids may lose precision; consider
# passing dtype={"id": str} if the ids are ever needed downstream.
data=pd.read_csv(location+'train.csv', nrows=N_ROWS, converters=converters)


In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,2,15706,320,50,1722,0,35,-1,79
1,10000169349117863715,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15704,320,50,1722,0,35,100084,79
2,10000371904215119486,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15704,320,50,1722,0,35,100084,79
3,10000640724480838376,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15706,320,50,1722,0,35,100084,79
4,10000679056417042096,0,14102100,1005,1,4270638152,2439430497,90831144,3970769798,2013391065,...,1,0,18993,320,50,2161,0,35,-1,157


In [5]:
# Report how many distinct values each column of `data` holds.
print("unique values:")
for colname in data.columns:
    distinct = data[colname].unique()
    print('\t',colname,'=',len(distinct))

unique values:
	 id = 50000
	 click = 2
	 hour = 1
	 C1 = 6
	 banner_pos = 5
	 site_id = 693
	 site_domain = 593
	 site_category = 16
	 app_id = 571
	 app_domain = 42
	 app_category = 18
	 device_id = 4255
	 device_ip = 25344
	 device_model = 2062
	 device_type = 4
	 device_conn_type = 4
	 C14 = 374
	 C15 = 5
	 C16 = 6
	 C17 = 126
	 C18 = 4
	 C19 = 37
	 C20 = 132
	 C21 = 29


In [43]:
# For every column, show the value stored at index label 0 and its type.
print("col values:")
for colname in data.columns:
    first_value = data[colname][0]
    print('\t',colname,'=',first_value, ' type=',type(first_value))

col values:
	 id = 1.0000094181510943e+18  type= <class 'numpy.float64'>
	 click = 0  type= <class 'numpy.int64'>
	 hour = 14102100  type= <class 'numpy.int64'>
	 C1 = 1005  type= <class 'numpy.int64'>
	 banner_pos = 0  type= <class 'numpy.int64'>
	 site_id = 532546046  type= <class 'numpy.int64'>
	 site_domain = 4085536615  type= <class 'numpy.int64'>
	 site_category = 680550077  type= <class 'numpy.int64'>
	 app_id = 3970769798  type= <class 'numpy.int64'>
	 app_domain = 2013391065  type= <class 'numpy.int64'>
	 app_category = 131587874  type= <class 'numpy.int64'>
	 device_id = 2845778250  type= <class 'numpy.int64'>
	 device_ip = 3721564782  type= <class 'numpy.int64'>
	 device_model = 1150642724  type= <class 'numpy.int64'>
	 device_type = 1  type= <class 'numpy.int64'>
	 device_conn_type = 2  type= <class 'numpy.int64'>
	 C14 = 15706  type= <class 'numpy.int64'>
	 C15 = 320  type= <class 'numpy.int64'>
	 C16 = 50  type= <class 'numpy.int64'>
	 C17 = 1722  type= <class 'numpy.int6

In [44]:
# Print the frame's index range plus each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 24 columns):
id                  500000 non-null float64
click               500000 non-null int64
hour                500000 non-null int64
C1                  500000 non-null int64
banner_pos          500000 non-null int64
site_id             500000 non-null int64
site_domain         500000 non-null int64
site_category       500000 non-null int64
app_id              500000 non-null int64
app_domain          500000 non-null int64
app_category        500000 non-null int64
device_id           500000 non-null int64
device_ip           500000 non-null int64
device_model        500000 non-null int64
device_type         500000 non-null int64
device_conn_type    500000 non-null int64
C14                 500000 non-null int64
C15                 500000 non-null int64
C16                 500000 non-null int64
C17                 500000 non-null int64
C18                 500000 non-null int64
C19  

In [3]:
# y=pd.DataFrame(data['click']).copy()
# Target vector: an independent copy of the `click` column, kept as a Series.
y=data['click'].copy()

In [4]:
# Row count, number of clicks, and the fraction of rows that are clicks.
n_rows = y.shape[0]
n_clicks = y.sum()
print(n_rows,int(n_clicks),float(n_clicks)/n_rows)

500000 82037 0.164074


In [5]:
# Feature frame: every column from the third onward, i.e. everything except
# the first two columns of `data`. Copied so later edits do not touch `data`.
X=data.iloc[:, 2:].copy()
X.shape

(500000, 22)

In [7]:
# From the larger dataset, take the first nsamps click records and the first
# nsamps no-click records (in file order, not a random draw) and stack them
# into one frame with equal class counts, provided each class has at least
# nsamps rows.

nsamps = 10000

no_click_mask = (y == 0)
click_mask = (y == 1)

y0 = y[no_click_mask]
X0 = X[no_click_mask]
y1 = y[click_mask]
X1 = X[click_mask]

print(y0.shape,X0.shape,y1.shape,X1.shape)

# pd.concat replaces Series.append / DataFrame.append, which were deprecated
# in pandas 1.4 and removed in pandas 2.0. Click rows come first, then the
# no-click rows; ignore_index renumbers the result 0..len-1.
y_eq = pd.concat([y1[:nsamps], y0[:nsamps]], ignore_index=True)
X_eq = pd.concat([X1[:nsamps], X0[:nsamps]], ignore_index=True)

print(y_eq.shape, X_eq.shape)

(417963,) (417963, 22) (82037,) (82037, 22)
(20000,) (20000, 22)


In [8]:
# X['click']=1

In [9]:
#X.rename(index=str, columns={'click': 'intercept'},inplace=True)

In [10]:
# colNames=list(X.columns)
# m,n = X.shape
# X = np.array(X)
# y = np.array(y)
# theta = np.array(np.zeros(n).reshape(n,1))

In [11]:
from sklearn import linear_model

def regLogitRegress(X,y):
    """Fit an L1-regularised logistic regression and report its training fit.

    Prints the accuracy of the fitted model on the data it was trained on and
    the mean of the target, then returns the fitted coefficients.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its column labels are paired with the coefficients.
    y : array-like
        Target labels (e.g. a pandas Series or a NumPy array). Flattened to
        1-D before fitting.

    Returns
    -------
    pandas.DataFrame
        One row per feature: column 0 is the feature name, column 1 the
        corresponding slice of ``model.coef_``.

    Notes
    -----
    The printed accuracy is measured on the training data itself, so it is
    not an estimate of out-of-sample performance.
    """
    # Flatten through NumPy instead of y.ravel(): Series.ravel is deprecated
    # in recent pandas, and this form also accepts plain arrays.
    y_flat = np.asarray(y).ravel()

    # An L1 penalty needs a solver that supports it. scikit-learn's default
    # solver (lbfgs since 0.22) rejects penalty='l1', so pin liblinear.
    model = linear_model.LogisticRegression(
        penalty='l1', C=1.0, solver='liblinear', verbose=True
    )
    model.fit(X, y_flat)
    Accuracy=model.score(X, y_flat)
    print('The accuracy of the model:',Accuracy)
    coeff_df = pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))
    Y_mean=y_flat.mean()
    print('Target average:',Y_mean)
    return coeff_df

In [None]:
# Fit on the full X / y; the X_eq / y_eq subset built earlier is not used here.
regLogitRegress(X,y)

[LibLinear]