In [29]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression

import os
from scipy.sparse import csr_matrix, hstack

# Reading input data.
# Device catalogue: read device_id as an ordinary column, keep only the last
# record of every duplicated device_id, then index the frame by device_id.
# `keep='last'` is the current spelling of the removed `take_last=True` keyword.
models = pd.read_csv('phone_brand_device_model.csv')
models = models.drop_duplicates('device_id', keep='last').set_index('device_id')
train = pd.read_csv('gender_age_train.csv', index_col='device_id')
test = pd.read_csv('gender_age_test.csv', index_col='device_id')

# Events are indexed by event_id; timestamps are parsed to datetimes on load.
events = pd.read_csv('events.csv', parse_dates=['timestamp'], index_col='event_id')
app_events = pd.read_csv('app_events.csv',
                         usecols=['event_id', 'app_id', 'is_active'],
                         dtype={'is_active': bool})
applabels = pd.read_csv('app_labels.csv')

# Sample submission: its columns are used below as the target class names.
names = pd.read_csv('sample_submission.csv', index_col='device_id')

In [30]:
# Positional row numbers; later used as row indices of the sparse matrices.
train['trainrow'] = np.arange(len(train))
test['testrow'] = np.arange(len(test))

# Integer-encode device model and phone brand in the device catalogue, then
# copy the codes onto train and test (the assignment aligns on the index).
for column in ('device_model', 'phone_brand'):
    encoder = LabelEncoder()
    models[column] = encoder.fit_transform(models[column])
    train[column] = models[column]
    test[column] = models[column]

# Encode the target column with the class names of the sample submission.
encoderg = LabelEncoder()
encoderg.fit(names.columns)
train['group'] = encoderg.transform(train['group'])

In [31]:
# Encode application ids as integers and count the distinct applications.
appencoder = LabelEncoder()
app_events['app'] = appencoder.fit_transform(app_events.app_id)
napps = appencoder.classes_.shape[0]
napps

19237

In [32]:
# mid: for every device id, the row number it has in test and/or in train
# (outer join on the index, so the column of the other set is left missing).
mid = pd.merge(test[['testrow']], train[['trainrow']],
               how='outer', left_index=True, right_index=True)

In [33]:
# Tying applications to devices: attach the device of every app event, count
# rows per (device, app) pair, then attach the train/test row numbers.
deviceapps = (
    app_events
    .merge(events[['device_id']], how='left', left_on='event_id', right_index=True)
    .groupby(['device_id', 'app'])['app']
    .agg(['size'])
    .merge(mid, how='left', left_index=True, right_index=True)
    .reset_index()
)
deviceapps

Unnamed: 0,device_id,app,size,testrow,trainrow
0,-9222956879900151005,548,18,,21594
1,-9222956879900151005,1096,18,,21594
2,-9222956879900151005,1248,26,,21594
3,-9222956879900151005,1545,12,,21594
4,-9222956879900151005,1664,18,,21594
5,-9222956879900151005,1848,32,,21594
6,-9222956879900151005,2236,28,,21594
7,-9222956879900151005,2350,28,,21594
8,-9222956879900151005,2626,27,,21594
9,-9222956879900151005,3384,30,,21594


In [34]:
# Tying application labels to devices.
# Keep only the labels of applications that occur in app_events. The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
applabels = applabels.loc[applabels.app_id.isin(app_events.app_id.unique())].copy()
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

# Join every (device, app) pair with the labels of that app, count the joined
# rows per (device, label), then attach the train/test row numbers.
devicelabels = deviceapps[['device_id','app']].merge(applabels[['app','label']])
devicelabels = devicelabels.groupby(['device_id','label'])['label'].agg(['size'])
devicelabels = devicelabels.merge(mid, how='left', left_index=True, right_index=True)
devicelabels = devicelabels.reset_index()
devicelabels

Unnamed: 0,device_id,label,size,testrow,trainrow
0,-9222956879900151005,117,1,,21594
1,-9222956879900151005,120,1,,21594
2,-9222956879900151005,126,1,,21594
3,-9222956879900151005,138,2,,21594
4,-9222956879900151005,147,2,,21594
5,-9222956879900151005,170,1,,21594
6,-9222956879900151005,181,1,,21594
7,-9222956879900151005,190,1,,21594
8,-9222956879900151005,207,8,,21594
9,-9222956879900151005,208,11,,21594


In [35]:
# Tying active hours to devices.
# The hour of day of every event is extracted with the vectorized .dt
# accessor instead of a Python-level loop over each timestamp.
tm = events['timestamp']
hours = tm.dt.hour
events['hour'] = hours
# Attach the train/test row number of the device that produced each event.
devicehours = events.merge(mid, how='left', left_on='device_id', right_index=True)
devicehours

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude,hour,testrow,trainrow
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,0,,58469
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,0,68691,
3,-4833982096941402721,2016-05-01 00:08:05,106.60,29.70,0,,7337
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,0,,9287
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,0,,41396
6,1476664663289716375,2016-05-01 00:27:21,0.00,0.00,0,,51154
7,5990807147117726237,2016-05-01 00:15:13,113.73,23.00,0,101850,
8,1782450055857303792,2016-05-01 00:15:35,113.94,34.70,0,65894,
9,-2073340001552902943,2016-05-01 00:15:33,0.00,0.00,0,26034,
10,-8195816569128397698,2016-05-01 00:41:31,119.34,26.04,0,,73853


In [36]:
# Making up matrices [devices, model] and [devices, brand]:
# each row has a single 1 in the column of the device's model or brand.
#
# The shape is passed explicitly. Without it csr_matrix infers the number of
# columns from the largest code it is given, so the train and test matrices
# could end up with different widths when the highest-coded model or brand
# occurs in only one of the two sets.
# The codes are label-encoded (0 .. n-1), hence max + 1 is the class count.
nmodels = models['device_model'].max() + 1
nbrands = models['phone_brand'].max() + 1

Xtrain_model = csr_matrix((np.ones(train.shape[0]),
                           (train.trainrow, train.device_model)),
                          shape=(train.shape[0], nmodels))
Xtest_model = csr_matrix((np.ones(test.shape[0]),
                          (test.testrow, test.device_model)),
                         shape=(test.shape[0], nmodels))

Xtrain_brand = csr_matrix((np.ones(train.shape[0]),
                           (train.trainrow, train.phone_brand)),
                          shape=(train.shape[0], nbrands))
Xtest_brand = csr_matrix((np.ones(test.shape[0]),
                          (test.testrow, test.phone_brand)),
                         shape=(test.shape[0], nbrands))

In [37]:
# Making up matrix [devices, active hours]: one column per hour of day.
# Every event row contributes a 1 at (device row, hour); csr_matrix sums
# entries that share the same position.
train_events = devicehours.dropna(subset=['trainrow'])
Xtrain_hour = csr_matrix(
    (np.ones(len(train_events)), (train_events.trainrow, train_events.hour)),
    shape=(len(train), 24))

test_events = devicehours.dropna(subset=['testrow'])
Xtest_hour = csr_matrix(
    (np.ones(len(test_events)), (test_events.testrow, test_events.hour)),
    shape=(len(test), 24))

In [38]:
# Making up matrix [devices, application labels]: a 1 for every
# (device, label) row of devicelabels, split into train and test devices.
train_labels = devicelabels.dropna(subset=['trainrow'])
Xtrain_label = csr_matrix(
    (np.ones(len(train_labels)), (train_labels.trainrow, train_labels.label)),
    shape=(len(train), nlabels))

test_labels = devicelabels.dropna(subset=['testrow'])
Xtest_label = csr_matrix(
    (np.ones(len(test_labels)), (test_labels.testrow, test_labels.label)),
    shape=(len(test), nlabels))

In [39]:
# Making up matrix [devices, installed apps]: a 1 for every (device, app)
# row of deviceapps, split into train and test devices.
d = deviceapps.dropna(subset=['trainrow'])
Xtrain_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
                        shape=(train.shape[0], napps))
d = deviceapps.dropna(subset=['testrow'])
Xtest_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
                       shape=(test.shape[0], napps))


# Keep only applications whose column total over the training devices is at
# least large_C, to lower the number of rarely-populated features.
large_C = 100

# sums is a 1 x napps matrix of column totals; flatten it and select the
# qualifying columns with one vectorized comparison instead of a Python loop.
sums = Xtrain_app.sum(axis=0)
leave = np.flatnonzero(np.asarray(sums).ravel() >= large_C)

# The same columns are kept in train and test so the features stay aligned.
Xtrain_app = Xtrain_app[:, leave]
Xtest_app = Xtest_app[:, leave]
len(leave)

1205

In [40]:
# Concatenating all feature blocks column-wise (same block order for train
# and test so the columns line up).
train_blocks = (Xtrain_model, Xtrain_brand, Xtrain_label, Xtrain_app, Xtrain_hour)
test_blocks = (Xtest_model, Xtest_brand, Xtest_label, Xtest_app, Xtest_hour)
Xtrain = hstack(train_blocks, format='csr')
Xtest = hstack(test_blocks, format='csr')

# Encoded target classes as a plain array.
Y = train['group'].values

shapes_message = 'All features: train shape {}, test shape {}'
print(shapes_message.format(Xtrain.shape, Xtest.shape))

All features: train shape (74645, 3451), test shape (112071, 3451)


In [41]:
#evaluation of model
def score(clf, dense=False):
    """Return the mean log loss of ``clf`` over 5 stratified folds.

    Uses the notebook-level ``Xtrain`` (feature matrix) and ``Y`` (encoded
    targets). ``clf`` is refitted on every fold, so it is left fitted on the
    last training split when the function returns.

    clf   -- estimator providing ``fit`` and ``predict_proba``.
    dense -- when False (default) the folds are sliced directly from the
             sparse matrix, so ``clf`` must accept sparse input; pass True
             for estimators that need dense arrays (the matrix is then
             densified once, not once per fit/predict call).
    """
    sf = StratifiedKFold(Y, n_folds=5)
    # Loop-invariant: pick the feature representation once, outside the folds.
    X = Xtrain.toarray() if dense else Xtrain
    losses = []
    for itrain, itest in sf:
        clf.fit(X[itrain], Y[itrain])
        prob = clf.predict_proba(X[itest])
        losses.append(log_loss(Y[itest], prob))
    return sum(losses) / len(losses)

In [42]:
# Searching for the best regularization strength C of the regression model.
# The line is built with str.format (as in the feature-shape printout) so the
# output reads "C: loss" whether print is a statement or a function.
for i in range(1, 10):
    C = i * 0.01
    print('{}: {}'.format(C, score(LogisticRegression(C=C))))
# TODO - try to change other options

(0.01, ': ', 2.2939929481522028)
(0.02, ': ', 2.2907897854768424)
(0.03, ': ', 2.2920577455867051)
(0.04, ': ', 2.2944693481348621)


KeyboardInterrupt: 

In [43]:
# Saving the answer.
clf = LogisticRegression(C = 0.02)
# Fit and predict on the sparse matrices directly; converting them with
# toarray() first only multiplies the memory needed.
clf = clf.fit(Xtrain, train['group'])
prob = clf.predict_proba(Xtest)
# predict_proba columns follow clf.classes_ (the encoded groups), so label
# them by decoding those classes rather than assuming they match the column
# order of the sample submission, then arrange them in the sample's order.
ans = pd.DataFrame(prob, index = test.index,
                   columns = encoderg.inverse_transform(clf.classes_))
ans = ans[names.columns]
ans.to_csv('my.csv',index=True)