In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack




In [3]:
events = pd.read_csv('input/events.csv',index_col='event_id')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')
device = pd.read_csv('input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
gender_age_train = pd.read_csv('input/gender_age_train.csv',index_col = 'device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv',index_col = 'device_id')

For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [4]:
gender_age_train['int_index'] = np.arange(gender_age_train.shape[0])
gender_age_test['int_index'] = np.arange(gender_age_test.shape[0])

In [5]:
print('gender_age_train\n',gender_age_train.head(1))
print('\ngender_age_test\n',gender_age_test.head(1))
print('\ndevice\n',device.head(1))
print('\nevents\n',events.head(1))
print('\nlabel\n',label.head(1))
print('\napp_event\n',app_event.head(1))
print('\napp_label\n',app_label.head(1))

gender_age_train
                      gender  age   group  int_index
device_id                                          
-8076087639492063270      M   35  M32-38          0

gender_age_test
                      int_index
device_id                     
1002079943728939269          0

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [6]:
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)
gender_age_train['brand'] = device['brand']
gender_age_test['brand'] = device['brand']

Create sparse matrix with each row representing one device and each column representing one brand.

In [7]:
X_train_brand = csr_matrix((np.ones(gender_age_train.shape[0]),(gender_age_train.int_index,gender_age_train.brand)))
X_test_brand = csr_matrix((np.ones(gender_age_test.shape[0]),(gender_age_test.int_index,gender_age_test.brand)))
print('X_train_brand shape:',X_train_brand.shape)
print('X_test_brand shape:',X_test_brand.shape)

X_train_brand shape: (74645, 131)
X_test_brand shape: (112071, 131)


# Feature engineering II: Installed app

Encoding the app_id and store it into app column, and feed this as a feature into the training and testing test.

In [8]:
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [9]:
np.sort(app_event.app.unique())

array([    0,     1,     2, ..., 19234, 19235, 19236])

In [10]:
print(app_event.head(1))
print(events.head(1))

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
                  device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [11]:
installed_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_installed']],how='right',right_on = 'event_id',
                        left_index = True)

In [12]:
print(installed_app.head(1))
installed_app_grouped = installed_app.groupby(['device_id','app'])['app'].agg(['size'])

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1


In [13]:
installed_app_train = pd.merge(installed_app_grouped,gender_age_train[['int_index']],how = 'right',right_index=True,left_index=True)
installed_app_test = pd.merge(installed_app_grouped,gender_age_test[['int_index']],how = 'right',right_index=True,left_index=True)

In [14]:
installed_app_train = installed_app_train.reset_index()
installed_app_test = installed_app_test.reset_index()
print(installed_app_train.head())
print(installed_app_test.head())

             device_id   app  size  int_index
0 -9222956879900151005   548    18      21594
1 -9222956879900151005  1096    18      21594
2 -9222956879900151005  1248    26      21594
3 -9222956879900151005  1545    12      21594
4 -9222956879900151005  1664    18      21594
             device_id    app  size  int_index
0 -9222661944218806987   1867     3      13612
1 -9222661944218806987   7519     8      13612
2 -9222661944218806987   7843     1      13612
3 -9222661944218806987   8704     4      13612
4 -9222661944218806987  10000     1      13612


Calculate the number of unique app:

In [15]:
print('The number of unique apps:')
print(np.size(installed_app.app.unique()))
appnumber = np.size(installed_app.app.unique())

The number of unique apps:
19237


In [16]:
gender_age_test.shape[0]

112071

In [17]:
print(np.sort(installed_app_train.app.unique()))
print(installed_app_test.shape[0])

[    0     1     2 ..., 19234 19235 19236]
1387337


In [18]:
#the first set use the number of usage of each app (seems wrong)
#X_train_installed = csr_matrix((installed_app_train['size'],(installed_app_train.int_index,installed_app_train.app)),
#                              shape = (gender_age_train.shape[0],appnumber))
#X_test_installed = csr_matrix((installed_app_test['size'],(installed_app_test.int_index,installed_app_test.app)),
#                              shape = (gender_age_test.shape[0],appnumber))
#the second set use whether the app got used or not
X_train_installed = csr_matrix((np.ones(installed_app_train.shape[0]),(installed_app_train.int_index,installed_app_train.app)),
                              shape = (gender_age_train.shape[0],appnumber))
X_test_installed = csr_matrix((np.ones(installed_app_test.shape[0]),(installed_app_test.int_index,installed_app_test.app)),
                             shape = (gender_age_test.shape[0],appnumber))
print('X_train_installed shape:',X_train_installed.shape)
print('X_test_installed shape:',X_test_installed.shape)

X_train_installed shape: (74645, 19237)
X_test_installed shape: (112071, 19237)


In [19]:
np.sort(installed_app_test.int_index.unique())

array([     0,      1,      2, ..., 112059, 112060, 112063])

# Feature engineering III: phone device model

Encoding the phone device model, and feed this as a feature into the training and testing test.

In [20]:
brand_model = device.phone_brand.str.cat(device.device_model)
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)
gender_age_train['model'] = device['model']
gender_age_test['model'] = device['model']

Create sparse matrix with each row representing one device and each column representing one model.

In [21]:
X_train_model = csr_matrix((np.ones(gender_age_train.shape[0]),(gender_age_train.int_index,gender_age_train.model)))
X_test_model = csr_matrix((np.ones(gender_age_test.shape[0]),(gender_age_test.int_index,gender_age_test.model)))
print('X_train_brand shape:',X_train_model.shape)
print('X_test_brand shape:',X_test_model.shape)

X_train_brand shape: (74645, 1667)
X_test_brand shape: (112071, 1667)


# Feature engineering IV: app label

In [22]:
print(app_event[['app_id','event_id']].head(1))
print(app_label[['app_id','label_id']].head(1))

                app_id  event_id
0  5927333115845830913         2
                app_id  label_id
0  7324884708820027918       251


In [23]:
app_label_new = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()
app_label_new['app'] = encoder2.transform(app_label_new.app_id)
encoder4 = LabelEncoder().fit(app_label_new.label_id)
app_label_new['label'] = encoder4.transform(app_label_new.label_id)
labelnumber = len(encoder4.classes_)
print('app_label_new:')
print(app_label_new.head())

app_label_new:
                app_id  label_id    app  label
0  7324884708820027918       251  17355    207
1 -4494216993218550286       251   4618    207
2  6058196446775239644       406  15548    247
3  6058196446775239644       407  15548    248
4  8694625920731541625       406  18689    247


In [24]:
print(app_label.size)
print(installed_app.size)
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_label_grouped = (installed_app_grouped.reset_index()[['device_id','app']]
                          .merge(app_label_new[['app','label']])
                          .groupby(['device_id','label']))['app'].agg(['size']).reset_index()
                          
print('installed_label_grouped:')
print(installed_label_grouped.head())

919886
129892268
installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_label_grouped:
             device_id  label  size
0 -9222956879900151005    117     1
1 -9222956879900151005    120     1
2 -9222956879900151005    126     1
3 -9222956879900151005    138     2
4 -9222956879900151005    147     2


In [25]:
label_app_train = pd.merge(installed_label_grouped,gender_age_train[['int_index']],
                               how = 'right',right_index=True,left_on='device_id')
label_app_test = pd.merge(installed_label_grouped,gender_age_test[['int_index']],
                              how = 'right',right_index=True,left_on ='device_id' )
label_app_train = label_app_train.dropna(subset= ['int_index','label'])
label_app_test = label_app_test.dropna(subset= ['int_index','label'])

In [26]:
X_train_label = csr_matrix((np.ones(label_app_train.shape[0]),(label_app_train.int_index,label_app_train.label)),
                              shape = (gender_age_train.shape[0],labelnumber))
X_test_label = csr_matrix((np.ones(label_app_test.shape[0]),(label_app_test.int_index,label_app_test.label)),
                              shape = (gender_age_test.shape[0],labelnumber))
print('X_train_installed shape:',X_train_label.shape)
print('X_test_installed shape:',X_test_label.shape)

X_train_installed shape: (74645, 492)
X_test_installed shape: (112071, 492)


# Model training and fitting

In [27]:
#X_train_total = X_train_label
#X_test_total = X_test_label
X_train_total = hstack((X_train_installed,X_train_label,X_train_brand),format='csr')
X_test_total = hstack((X_test_installed,X_test_label,X_test_brand),format='csr')
print('Training shape:')
print(X_train_total.shape)
print('Testing shape:')
print(X_test_total.shape)

Training shape:
(74645, 19860)
Testing shape:
(112071, 19860)


In [28]:
targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)
print(target_len)

12


In [29]:
X_train,X_test,y_train,y_test = cv.train_test_split(X_train_total,y_train_total,test_size = 0.4)

In [30]:
model = rfc(n_estimators=300,verbose=1,n_jobs=-1)

In [31]:
model.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [32]:
y_pred = model.predict_proba(X_test)
print(y_pred[1,:])

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.9s finished


[ 0.07265241  0.042892    0.04089458  0.04056856  0.07993104  0.05800482
  0.12463612  0.1404576   0.05909045  0.0957532   0.1218356   0.12328363]


In [33]:
log_loss(y_test,y_pred)

2.408278254111671

In [34]:
model.score(X_test,y_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.9s finished


0.18015272288833814

In [35]:
model.fit(X_train_total,y_train_total)
pred = pd.DataFrame(model.predict_proba(X_test_total), index = gender_age_test.index, columns=targetencoder.classes_)
pred.head()

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    9.5s finished


Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.006667,0.02,0.01,0.05,0.1,0.073333,0.003333,0.066667,0.07,0.08,0.173333,0.346667
-1547860181818787117,0.006667,0.01,0.016667,0.066667,0.073333,0.106667,0.013333,0.076667,0.063333,0.13,0.2,0.236667
7374582448058474277,0.053333,0.083333,0.046667,0.153333,0.096667,0.1,0.066667,0.033333,0.05,0.09,0.053333,0.173333
-6220210354783429585,0.033333,0.02,0.013333,0.036667,0.063333,0.1,0.04,0.07,0.063333,0.116667,0.236667,0.206667
-5893464122623104785,0.071654,0.059258,0.041975,0.063093,0.072698,0.055254,0.108795,0.133016,0.07334,0.095288,0.119654,0.105973


In [36]:
pred.to_csv('rf_submission_v3.csv',index=True)