In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.svm import LinearSVC
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack
import time
'''from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.optimizers import SGD'''
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn.cross_validation import StratifiedKFold,KFold
seed = 7
np.random.seed(seed)

In [5]:
events = pd.read_csv('input/events.csv',index_col='event_id')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')
device = pd.read_csv('input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
gender_age_train = pd.read_csv('input/gender_age_train.csv',index_col = 'device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv',index_col = 'device_id')

For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [6]:
print('gender_age_train\n',gender_age_train.head(1))
print('\ngender_age_test\n',gender_age_test.head(1))
print('\ndevice\n',device.head(1))
print('\nevents\n',events.head(1))
print('\nlabel\n',label.head(1))
print('\napp_event\n',app_event.head(1))
print('\napp_label\n',app_label.head(1))

gender_age_train
                      gender  age   group
device_id                               
-8076087639492063270      M   35  M32-38

gender_age_test
 Empty DataFrame
Columns: []
Index: [1002079943728939269]

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Separation of two set of devices: with events and without events

In [7]:
gender_age_train_with = gender_age_train.loc[gender_age_train.index.isin(events.device_id.unique())].copy()
gender_age_train_without = gender_age_train.loc[~gender_age_train.index.isin(events.device_id.unique())].copy()
gender_age_test_with = gender_age_test.loc[gender_age_test.index.isin(events.device_id.unique())].copy()
gender_age_test_without = gender_age_test.loc[~gender_age_test.index.isin(events.device_id.unique())].copy()
print('{0:<40.40}{1:5}'.format('Size of training set without events:',gender_age_train_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_train_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_train.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of testing set without events:',gender_age_test_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_test_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_test.shape[0]))

Size of training set without events:    51336
Size of training set with events:       23309
Total size of training set:             74645
Size of testing set without events:     76877
Size of training set with events:       35194
Total size of training set:             112071


In [8]:
gender_age_train_with['int_index'] = np.arange(gender_age_train_with.shape[0])
gender_age_test_with['int_index'] = np.arange(gender_age_test_with.shape[0])
gender_age_train_without['int_index'] = np.arange(gender_age_train_without.shape[0])
gender_age_test_without['int_index'] = np.arange(gender_age_test_without.shape[0])

# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [12]:
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)
gender_age_train_with['brand'] = device['brand']
gender_age_test_with['brand'] = device['brand']
gender_age_train_without['brand'] = device['brand']
gender_age_test_without['brand'] = device['brand']
brandnumber = len(encoder.classes_)

Create sparse matrix with each row representing one device and each column representing one brand

In [13]:
X_train_brand_with = csr_matrix((np.ones(gender_age_train_with.shape[0]),
                            (gender_age_train_with.int_index,gender_age_train_with.brand)),
                               shape = (gender_age_train_with.shape[0],brandnumber))
X_test_brand_with = csr_matrix((np.ones(gender_age_test_with.shape[0]),
                           (gender_age_test_with.int_index,gender_age_test_with.brand)),
                              shape = (gender_age_test_with.shape[0],brandnumber))
print('X_train_brand_with shape:',X_train_brand_with.shape)
print('X_test_brand_with shape:',X_test_brand_with.shape)
X_train_brand_without = csr_matrix((np.ones(gender_age_train_without.shape[0]),
                            (gender_age_train_without.int_index,gender_age_train_without.brand)),
                                  shape = (gender_age_train_without.shape[0],brandnumber))
X_test_brand_without = csr_matrix((np.ones(gender_age_test_without.shape[0]),
                           (gender_age_test_without.int_index,gender_age_test_without.brand)),
                                 shape = (gender_age_test_without.shape[0],brandnumber))
print('X_train_brand_without shape:',X_train_brand_without.shape)
print('X_test_brand_without shape:',X_test_brand_without.shape)

X_train_brand_with shape: (23309, 131)
X_test_brand_with shape: (35194, 131)
X_train_brand_without shape: (51336, 131)
X_test_brand_without shape: (76877, 131)


# Feature engineering II: phone device model

In [34]:
brand_model = device.phone_brand.str.cat(device.device_model)
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)
gender_age_train_with['model'] = device['model']
gender_age_test_with['model'] = device['model']
gender_age_train_without['model'] = device['model']
gender_age_test_without['model'] = device['model']
modelnumber = len(encoder3.classes_)

In [38]:
X_train_model_with = csr_matrix((np.ones(gender_age_train_with.shape[0]),
                                 (gender_age_train_with.int_index,gender_age_train_with.model)),
                               shape = (gender_age_train_with.shape[0],modelnumber))
X_test_model_with = csr_matrix((np.ones(gender_age_test_with.shape[0]),
                                (gender_age_test_with.int_index,gender_age_test_with.model)),
                              shape = (gender_age_test_with.shape[0],modelnumber))
X_train_model_without = csr_matrix((np.ones(gender_age_train_without.shape[0]),
                                    (gender_age_train_without.int_index,gender_age_train_without.model)),
                                  shape = (gender_age_train_without.shape[0],modelnumber))
X_test_model_without = csr_matrix((np.ones(gender_age_test_without.shape[0]),
                                   (gender_age_test_without.int_index,gender_age_test_without.model)),
                                 shape = (gender_age_test_without.shape[0],modelnumber))

print('X_train_brand_with shape:',X_train_model_with.shape)
print('X_test_brand_with shape:',X_test_model_with.shape)
print('X_train_brand_without shape:',X_train_model_without.shape)
print('X_test_brand_without shape:',X_test_model_without.shape)

X_train_brand_with shape: (23309, 1667)
X_test_brand_with shape: (35194, 1667)
X_train_brand_without shape: (51336, 1667)
X_test_brand_without shape: (76877, 1667)


# Feature engineering III: Installed app

Encoding the app_id and store it into app column, and feed this as a feature into the training and testing test.

In [39]:
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [40]:
np.sort(app_event.app.unique())

array([    0,     1,     2, ..., 19234, 19235, 19236], dtype=int64)

In [41]:
print(app_event.head(1))
print(events.head(1))

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
                  device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [42]:
installed_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_installed']],
                         how='right',right_on = 'event_id',left_index = True)
print(installed_app.head())

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1
1 -6401643145415154744         2   3384             1
2 -6401643145415154744         2   7620             1
3 -6401643145415154744         2   8902             1
4 -6401643145415154744         2  18686             1


In [43]:
installed_app_grouped = installed_app.groupby(['device_id','app'])['app'].agg(['size'])
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_app_train_with = pd.merge(installed_app_grouped,gender_age_train_with[['int_index']],
                               how = 'right',right_index=True,left_index=True)
installed_app_test_with = pd.merge(installed_app_grouped,gender_age_test_with[['int_index']],
                              how = 'right',right_index=True,left_index=True)
print('installed_app_train:')
print(installed_app_train_with.head())

installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_app_train:
                           size  int_index
device_id            app                  
-9222956879900151005 548     18       5145
                     1096    18       5145
                     1248    26       5145
                     1545    12       5145
                     1664    18       5145


In [44]:
installed_app_train_with = installed_app_train_with.reset_index()
installed_app_test_with = installed_app_test_with.reset_index()
installed_app_train_with = installed_app_train_with.dropna(subset=['int_index'])
installed_app_test_with = installed_app_test_with.dropna(subset=['int_index'])
print(installed_app_train_with.head())
print(installed_app_test_with.head())

             device_id   app  size  int_index
0 -9222956879900151005   548    18       5145
1 -9222956879900151005  1096    18       5145
2 -9222956879900151005  1248    26       5145
3 -9222956879900151005  1545    12       5145
4 -9222956879900151005  1664    18       5145
             device_id    app  size  int_index
0 -9222661944218806987   1867     3       2851
1 -9222661944218806987   7519     8       2851
2 -9222661944218806987   7843     1       2851
3 -9222661944218806987   8704     4       2851
4 -9222661944218806987  10000     1       2851


Calculate the number of unique app:

In [45]:
print(gender_age_test_with.shape[0])
print(installed_app_train_with.shape[0])

35194
915632


In [46]:
print('The number of unique apps:')
print(np.size(installed_app.app.unique()))
appnumber = np.size(installed_app.app.unique())

The number of unique apps:
19237


In [47]:
print(np.sort(installed_app_train_with.app.unique()))
print(installed_app_test_with.shape[0])

[    0     1     2 ..., 19234 19235 19236]
1387337


In [48]:
X_train_installed_with = csr_matrix((np.ones(installed_app_train_with.shape[0]),
                                (installed_app_train_with.int_index,installed_app_train_with.app)), 
                               shape = (gender_age_train_with.shape[0],appnumber))
X_test_installed_with = csr_matrix((np.ones(installed_app_test_with.shape[0]),
                               (installed_app_test_with.int_index,installed_app_test_with.app)),
                               shape = (gender_age_test_with.shape[0],appnumber))
print('X_train_installed shape:',X_train_installed_with.shape)
print('X_test_installed shape:',X_test_installed_with.shape)

X_train_installed shape: (23309, 19237)
X_test_installed shape: (35194, 19237)


In [49]:
np.sort(installed_app_test_with.int_index.unique())

array([    0,     1,     2, ..., 35191, 35192, 35193], dtype=int64)

# Feature engineering IV: app label

In [50]:
print(app_event[['app_id','event_id']].head(1))
print(app_label[['app_id','label_id']].head(1))

                app_id  event_id
0  5927333115845830913         2
                app_id  label_id
0  7324884708820027918       251


In [51]:
app_label_new = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()
app_label_new['app'] = encoder2.transform(app_label_new.app_id)
encoder4 = LabelEncoder().fit(app_label_new.label_id)
app_label_new['label'] = encoder4.transform(app_label_new.label_id)
labelnumber = len(encoder4.classes_)
print('app_label_new:')
print(app_label_new.head())

app_label_new:
                app_id  label_id    app  label
0  7324884708820027918       251  17355    207
1 -4494216993218550286       251   4618    207
2  6058196446775239644       406  15548    247
3  6058196446775239644       407  15548    248
4  8694625920731541625       406  18689    247


In [52]:
print(app_label.size)
print(installed_app.size)
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_label_grouped = (installed_app_grouped.reset_index()[['device_id','app']]
                          .merge(app_label_new[['app','label']])
                          .groupby(['device_id','label']))['app'].agg(['size']).reset_index()
                          
print('installed_label_grouped:')
print(installed_label_grouped.head())

919886
129892268
installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_label_grouped:
             device_id  label  size
0 -9222956879900151005    117     1
1 -9222956879900151005    120     1
2 -9222956879900151005    126     1
3 -9222956879900151005    138     2
4 -9222956879900151005    147     2


In [53]:
label_app_train_with = pd.merge(installed_label_grouped,gender_age_train_with[['int_index']],
                               how = 'right',right_index=True,left_on='device_id')
label_app_test_with = pd.merge(installed_label_grouped,gender_age_test_with[['int_index']],
                              how = 'right',right_index=True,left_on ='device_id' )
label_app_train_with = label_app_train_with.dropna(subset= ['int_index','label'])
label_app_test_with = label_app_test_with.dropna(subset= ['int_index','label'])

In [55]:
X_train_label_with = csr_matrix((np.ones(label_app_train_with.shape[0]),
                                 (label_app_train_with.int_index,label_app_train_with.label)),
                              shape = (gender_age_train_with.shape[0],labelnumber))
X_test_label_with = csr_matrix((np.ones(label_app_test_with.shape[0]),(label_app_test_with.int_index,label_app_test_with.label)),
                              shape = (gender_age_test_with.shape[0],labelnumber))
print('X_train_installed_with shape:',X_train_label_with.shape)
print('X_test_installed_with shape:',X_test_label_with.shape)

X_train_installed_with shape: (23309, 492)
X_test_installed_with shape: (35194, 492)


# Feature engineering V: active app

In [56]:
active_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_active']],
                         how='right',right_on = 'event_id',left_index = True)
active_app = active_app.loc[active_app.is_active==1]
print(active_app.head())

             device_id  event_id    app  is_active
0 -6401643145415154744         2  15408          1
3 -6401643145415154744         2   8902          1
4 -6401643145415154744         2  18686          1
5 -6401643145415154744         2  14346          1
9 -6401643145415154744         2  16908          1


In [57]:
active_app_grouped = active_app.groupby(['device_id','app'])['app'].agg(['size'])
print('active_app_grouped:')
print(active_app_grouped.head())

active_app_grouped:
                           size
device_id            app       
-9222956879900151005 548      4
                     1248    15
                     1545     2
                     1848    31
                     2236    17


In [58]:
active_app_train_with = pd.merge(active_app_grouped,gender_age_train_with[['int_index']],
                               how = 'left',right_index=True,left_index=True)
active_app_test_with = pd.merge(active_app_grouped,gender_age_test_with[['int_index']],
                              how = 'left',right_index=True,left_index=True)
print('active_app_train_with:')
print(active_app_train_with.head())

active_app_train_with:
                           size  int_index
device_id            app                  
-9222956879900151005 548      4       5145
                     1248    15       5145
                     1545     2       5145
                     1848    31       5145
                     2236    17       5145


In [59]:
active_app_train_with = active_app_train_with.reset_index()
active_app_test_with = active_app_test_with.reset_index()
active_app_train_with = active_app_train_with.dropna(subset=['int_index'])
active_app_test_with = active_app_test_with.dropna(subset=['int_index'])
print(active_app_train_with.head())
print(active_app_test_with.head())

             device_id   app  size  int_index
0 -9222956879900151005   548     4       5145
1 -9222956879900151005  1248    15       5145
2 -9222956879900151005  1545     2       5145
3 -9222956879900151005  1848    31       5145
4 -9222956879900151005  2236    17       5145
              device_id    app  size  int_index
55 -9222661944218806987   1867     3       2851
56 -9222661944218806987   7519     7       2851
57 -9222661944218806987   7843     1       2851
58 -9222661944218806987   8704     3       2851
59 -9222661944218806987  10000     1       2851


In [60]:
print(np.sort(active_app_train_with.app.unique()))
print(appnumber)

[    0     5     6 ..., 19225 19228 19236]
19237


In [62]:
# binary active map
#X_train_active_with = csr_matrix((np.ones(active_app_train_with.shape[0]),
#                            (active_app_train_with.int_index,active_app_train_with.app)), 
#                            shape = (gender_age_train_with.shape[0],appnumber))
#X_test_active = csr_matrix((np.ones(active_app_test_with.shape[0]),
#                            (active_app_test.int_index,active_app_test_with.app)),
#                            shape = (gender_age_test_with.shape[0],appnumber))
# count the number of active app
X_train_active_with = csr_matrix((np.log(np.log(active_app_train_with['size']+1)+1),
                            (active_app_train_with.int_index,active_app_train_with.app)), 
                            shape = (gender_age_train_with.shape[0],appnumber))
X_test_active_with = csr_matrix((np.log(np.log(active_app_test_with['size']+1)+1),
                            (active_app_test_with.int_index,active_app_test_with.app)),
                            shape = (gender_age_test_with.shape[0],appnumber))
print('X_train_active shape:',X_train_active_with.shape)
print('X_test_active shape:',X_test_active_with.shape)

X_train_active shape: (23309, 19237)
X_test_active shape: (35194, 19237)


# Feature engineering VI: active time period

In [63]:
events.head()

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [64]:
events_time = events[['device_id','timestamp']].copy()
events_time['time']  = events_time['timestamp'].str[11:13].astype(int)
events_time.drop(['timestamp'],axis=1,inplace=True)
events_time = events_time.groupby(['device_id','time'])['time'].agg({'times':'count'}).reset_index()
print(events_time.head())
timenumber= events_time.time.unique().shape[0]

             device_id  time  times
0 -9222956879900151005     7      2
1 -9222956879900151005    11      7
2 -9222956879900151005    12     13
3 -9222956879900151005    13      3
4 -9222956879900151005    14      5


In [66]:
time_train_with = pd.merge(events_time,gender_age_train_with[['int_index']],
                               right_index=True,left_on='device_id')
time_test_with = pd.merge(events_time,gender_age_test_with[['int_index']],
                               right_index=True,left_on ='device_id' )
#binary
#X_train_time_with = csr_matrix((np.ones(time_train_with.shape[0]),
#                            (time_train_with.int_index,time_train_with.time)), 
#                            shape = (gender_age_train_with.shape[0],timenumber))
#X_test_time_with = csr_matrix((np.ones(time_test_with.shape[0]),
#                            (time_test_with.int_index,time_test_with.time)),
#                            shape = (gender_age_test_with.shape[0],timenumber))

#number
X_train_time_with = csr_matrix((np.log(np.log(time_train_with['times']+1)+1),
                            (time_train_with.int_index,time_train_with.time)), 
                            shape = (gender_age_train_with.shape[0],timenumber))
X_test_time_with = csr_matrix((np.log(np.log(time_test_with['times']+1)+1),
                            (time_test_with.int_index,time_test_with.time)),
                            shape = (gender_age_test_with.shape[0],timenumber))
print('X_train_time_with shape:',X_train_time_with.shape)
print('X_test_time_with shape:',X_test_time_with.shape)

X_train_time_with shape: (23309, 24)
X_test_time_with shape: (35194, 24)


Normalization of the time period count.

In [67]:
'''scaler = StandardScaler(with_mean=False)
X_train_time_with = scaler.fit_transform(X_train_time_with)
X_test_time_with = scaler.transform(X_test_time_with)'''

'scaler = StandardScaler(with_mean=False)\nX_train_time_with = scaler.fit_transform(X_train_time_with)\nX_test_time_with = scaler.transform(X_test_time_with)'

# feature join and selection

## Device without events

In [68]:
X_train_total_without = hstack((X_train_brand_without,X_train_model_without),format='csr')
X_test_total_without = hstack((X_test_brand_without,X_test_model_without),format='csr')
print('Training shape:')
print(X_train_total_without.shape)
print('Testing shape:')
print(X_test_total_without.shape)

Training shape:
(51336, 1798)
Testing shape:
(76877, 1798)


## Device with events

In [70]:
X_train_total_with = hstack((X_train_brand_with,X_train_model_with,
                             X_train_installed_with,X_train_label_with,
                            X_train_active_with,X_train_time_with),format='csr')
X_test_total_with = hstack((X_test_brand_with,X_test_model_with,
                           X_test_installed_with,X_test_label_with,
                            X_test_active_with,X_test_time_with),format='csr')
print('Training shape:')
print(X_train_total_with.shape)
print('Testing shape:')
print(X_test_total_with.shape)

Training shape:
(23309, 40788)
Testing shape:
(35194, 40788)


In [84]:
#percentile selction
#selector = SelectPercentile(f_classif, percentile=80)
#selector.fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#X_val.shape

# Selection using chi-square
#selector = SelectKBest(chi2, k=15155).fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#print('Training shape:')
#print(X_train_total.shape)
#print('Testing shape:')
#print(X_test_total.shape)

Training shape:
(74645, 15155)
Testing shape:
(112071, 15155)


# Deep learning model 

Defining functions:

In [68]:
def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator 
    #(https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    #model.add(Dense(10, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(150, input_dim=X_train_total.shape[1], init='normal', activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(50, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(12, init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

## device with events

In [71]:
targetencoder = LabelEncoder().fit(gender_age_train_with.group)
y_with = targetencoder.transform(gender_age_train_with.group)
nclasses = len(targetencoder.classes_)
dummy_y_with = np_utils.to_categorical(y_with)

NameError: name 'np_utils' is not defined

In [104]:
kf = KFold(len(y_train_total_with),n_folds = 10)
scores_val_list_with=[]
score_list_with=[]
for train, test in kf:
    X_train_with = X_train_total_with[train]
    y_train_with = dummy_y_with[train]
    X_val_with = X_train_total_with[test]
    y_val_with = dummy_y_with[test]
    #print(X_val.shape)
    print('*****************************************************')
    model=baseline_model()
    fit= model.fit_generator(generator=batch_generator(X_train_with, y_train_with, 400, True),
                         nb_epoch=10,
                         samples_per_epoch=69984,
                         validation_data=(X_val_with.todense(), y_val_with), verbose=2
                         )
    scores_val_with = model.predict_generator(generator=batch_generatorp(X_val_with, 32, False), 
                                         val_samples=X_val_with.shape[0])
    scores_val_list_with.append(scores_val_with)
    scores_with = model.predict_generator(generator=batch_generatorp(X_test_total_with, 32, False), 
                                     val_samples=X_test_total_with.shape[0])
    score_list.append(scores_with)
    print('logloss val {}'.format(log_loss(y_val_with, scores_val_with)))

*****************************************************
Epoch 1/10
48s - loss: 2.4395 - acc: 0.1322 - val_loss: 2.4139 - val_acc: 0.1273
Epoch 2/10
44s - loss: 2.3681 - acc: 0.1542 - val_loss: 2.3843 - val_acc: 0.1434
Epoch 3/10
44s - loss: 2.3241 - acc: 0.1711 - val_loss: 2.3619 - val_acc: 0.1561
Epoch 4/10
43s - loss: 2.3024 - acc: 0.1805 - val_loss: 2.3495 - val_acc: 0.1627
Epoch 5/10
43s - loss: 2.2824 - acc: 0.1878 - val_loss: 2.3434 - val_acc: 0.1641
Epoch 6/10
46s - loss: 2.2728 - acc: 0.1913 - val_loss: 2.3385 - val_acc: 0.1677
Epoch 7/10
46s - loss: 2.2650 - acc: 0.1940 - val_loss: 2.3345 - val_acc: 0.1705
Epoch 8/10
48s - loss: 2.2562 - acc: 0.1990 - val_loss: 2.3314 - val_acc: 0.1695
Epoch 9/10
46s - loss: 2.2462 - acc: 0.2023 - val_loss: 2.3297 - val_acc: 0.1725
Epoch 10/10
47s - loss: 2.2437 - acc: 0.2034 - val_loss: 2.3254 - val_acc: 0.1756
logloss val 2.3254444197794397
*****************************************************
Epoch 1/10
53s - loss: 2.4362 - acc: 0.1328 - val_



KeyboardInterrupt: 

In [None]:
for index,i in enumerate(val_loss_list_with):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
val_loss_ave_with = sumi/len(val_loss_list_with)
print('average logloss val {}'.format(val_loss_ave_with))
for index,i in enumerate(score_list_with):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
score_ave_with = sumi/len(score_list_with)
pred_with = pd.DataFrame(score_ave_with, index = gender_age_test_with.index, columns=targetencoder.classes_)

## device without events

In [71]:
targetencoder = LabelEncoder().fit(gender_age_train_without.group)
y_without = targetencoder.transform(gender_age_train_without.group)
nclasses = len(targetencoder.classes_)
dummy_y_without = np_utils.to_categorical(y_without)

NameError: name 'np_utils' is not defined

In [104]:
kf = KFold(len(y_train_total_without),n_folds = 10)
scores_val_list_without=[]
score_list_without=[]
for train, test in kf:
    X_train_without = X_train_total_without[train]
    y_train_without = dummy_y_without[train]
    X_val_without = X_train_total_without[test]
    y_val_without = dummy_y_without[test]
    #print(X_val.shape)
    print('*****************************************************')
    model=baseline_model()
    fit= model.fit_generator(generator=batch_generator(X_train_without, y_train_without, 400, True),
                         nb_epoch=10,
                         samples_per_epoch=69984,
                         validation_data=(X_val_without.todense(), y_val_without), verbose=2
                         )
    scores_val_without = model.predict_generator(generator=batch_generatorp(X_val_without, 32, False), 
                                         val_samples=X_val_without.shape[0])
    scores_val_list_without.append(scores_val_without)
    scores_without = model.predict_generator(generator=batch_generatorp(X_test_total_without, 32, False), 
                                     val_samples=X_test_total_without.shape[0])
    score_list.append(scores_without)
    print('logloss val {}'.format(log_loss(y_val_without, scores_val_without)))

*****************************************************
Epoch 1/10
48s - loss: 2.4395 - acc: 0.1322 - val_loss: 2.4139 - val_acc: 0.1273
Epoch 2/10
44s - loss: 2.3681 - acc: 0.1542 - val_loss: 2.3843 - val_acc: 0.1434
Epoch 3/10
44s - loss: 2.3241 - acc: 0.1711 - val_loss: 2.3619 - val_acc: 0.1561
Epoch 4/10
43s - loss: 2.3024 - acc: 0.1805 - val_loss: 2.3495 - val_acc: 0.1627
Epoch 5/10
43s - loss: 2.2824 - acc: 0.1878 - val_loss: 2.3434 - val_acc: 0.1641
Epoch 6/10
46s - loss: 2.2728 - acc: 0.1913 - val_loss: 2.3385 - val_acc: 0.1677
Epoch 7/10
46s - loss: 2.2650 - acc: 0.1940 - val_loss: 2.3345 - val_acc: 0.1705
Epoch 8/10
48s - loss: 2.2562 - acc: 0.1990 - val_loss: 2.3314 - val_acc: 0.1695
Epoch 9/10
46s - loss: 2.2462 - acc: 0.2023 - val_loss: 2.3297 - val_acc: 0.1725
Epoch 10/10
47s - loss: 2.2437 - acc: 0.2034 - val_loss: 2.3254 - val_acc: 0.1756
logloss val 2.3254444197794397
*****************************************************
Epoch 1/10
53s - loss: 2.4362 - acc: 0.1328 - val_



KeyboardInterrupt: 

In [None]:
for index,i in enumerate(val_loss_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
val_loss_ave_without = sumi/len(val_loss_list_without)
print('average logloss val {}'.format(val_loss_ave_without))
for index,i in enumerate(score_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
score_ave_without = sumi/len(score_list_without)
pred_without = pd.DataFrame(score_ave_without, index = gender_age_test_without.index, columns=targetencoder.classes_)

## putting together and save into final file

In [None]:
pred = pd.concat(pred_with,pred_without)
pred.to_csv('keras_v15.csv',index=True)

# Model training and fitting: Logistic Regression

In [36]:
targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)
print(target_len)

12


In [347]:
X_train,X_test,y_train,y_test = cv.train_test_split(X_train_total,y_train_total,test_size=0.005, random_state=42)

In [348]:
model = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
model.fit(X_train,y_train)

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [349]:
y_pred = model.predict_proba(X_test)

In [350]:
log_loss(y_test,y_pred)

2.2749612582739513

In [351]:
model.score(X_test,y_test)

0.19901403922409175

In [352]:
result = model.fit(X_train_total,y_train_total)
pred = pd.DataFrame(model.predict_proba(X_test_total), index = gender_age_test.index, columns=targetencoder.classes_)
pred.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.00132,0.006996,0.013956,0.01176,0.033186,0.051509,0.010501,0.031108,0.069016,0.108805,0.245644,0.416199
-1547860181818787117,0.009032,0.01488,0.029417,0.06173,0.070525,0.148762,0.006344,0.118501,0.055331,0.067424,0.24089,0.177163
7374582448058474277,0.022811,0.042062,0.040158,0.136054,0.174151,0.07704,0.011671,0.024363,0.03914,0.101672,0.183772,0.147106
-6220210354783429585,0.003416,0.029785,0.009868,0.012436,0.046435,0.144354,0.065789,0.174171,0.068642,0.121052,0.169799,0.154253
-5893464122623104785,0.047394,0.065124,0.042721,0.062585,0.056626,0.043316,0.092342,0.164602,0.097607,0.102084,0.132622,0.092977


In [353]:
pred.to_csv('logreg_submission.csv',index=True)

# Model fitting :RF

In [234]:
model2 = rfc(n_estimators=300,verbose=1,n_jobs=-1)
model2.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [236]:
y_pred2 = model2.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.6s finished


In [237]:
log_loss(y_test,y_pred2)

3.2819344751732902