In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix,hstack
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from sklearn.preprocessing import LabelEncoder

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides the same train_test_split, so bind whichever module exists to `cv`.
try:
    from sklearn import model_selection as cv
except ImportError:
    from sklearn import cross_validation as cv

In [2]:
# Load the raw input tables. Tables that are later looked up by id are
# indexed on that id directly at read time.
events = pd.read_csv('input/events.csv', index_col='event_id')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')

# Keep only the first row per device_id so device_id can serve as the index.
device = (
    pd.read_csv('input/phone_brand_device_model.csv')
    .drop_duplicates('device_id')
    .set_index('device_id')
)

gender_age_train = pd.read_csv('input/gender_age_train.csv', index_col='device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv', index_col='device_id')

To make it convenient to build sparse matrices, we add a second, integer index (0 to n-1) to both the training and the testing set.

In [3]:
# Positional row numbers (0..n-1); used later as sparse-matrix row coordinates.
gender_age_train['int_index'] = np.arange(len(gender_age_train.index))
gender_age_test['int_index'] = np.arange(len(gender_age_test.index))

In [4]:
# Print the first row of every table so the schemas can be compared at a glance.
previews = [
    ('gender_age_train', gender_age_train),
    ('gender_age_test', gender_age_test),
    ('device', device),
    ('events', events),
    ('label', label),
    ('app_event', app_event),
    ('app_label', app_label),
]
for position, (title, frame) in enumerate(previews):
    # Every heading after the first one is preceded by a blank line.
    heading = title + '\n' if position == 0 else '\n' + title + '\n'
    print(heading, frame.head(1))

gender_age_train
                      gender  age   group  int_index
device_id                                          
-8076087639492063270      M   35  M32-38          0

gender_age_test
                      int_index
device_id                     
1002079943728939269          0

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Feature engineering I: phone brand

Encode the phone brand as an integer and add it as a feature to the training and testing sets.

In [5]:
# Map every phone brand string to an integer code in a single fit/transform pass.
encoder = LabelEncoder()
device['brand'] = encoder.fit_transform(device['phone_brand'])

# Assigning a Series aligns on the index; all three frames are indexed by device_id.
gender_age_train['brand'] = device['brand']
gender_age_test['brand'] = device['brand']

Create sparse matrix with each row representing one device and each column representing one brand.

In [6]:
# One-hot brand matrix: row = device (int_index), column = brand code.
# The shape is passed explicitly. Without it csr_matrix infers the number of
# columns from the largest brand code present in each split, so train and test
# could silently end up with different widths.
brandnumber = len(encoder.classes_)
X_train_brand = csr_matrix(
    (np.ones(gender_age_train.shape[0]), (gender_age_train.int_index, gender_age_train.brand)),
    shape=(gender_age_train.shape[0], brandnumber))
X_test_brand = csr_matrix(
    (np.ones(gender_age_test.shape[0]), (gender_age_test.int_index, gender_age_test.brand)),
    shape=(gender_age_test.shape[0], brandnumber))
print('X_train_brand shape:',X_train_brand.shape)
print('X_test_brand shape:',X_test_brand.shape)

X_train_brand shape: (74645, 131)
X_test_brand shape: (112071, 131)


# Feature engineering II: Installed app

Encode each app_id as an integer, store it in the `app` column, and use it as a feature for the training and testing sets.

In [7]:
# Integer-encode app ids; encoder2 is kept so other tables can reuse the mapping.
encoder2 = LabelEncoder()
app_event['app'] = encoder2.fit_transform(app_event['app_id'])

In [8]:
# Sorted distinct app codes (np.unique returns its result in ascending order).
np.unique(app_event['app'].values)

array([    0,     1,     2, ..., 19234, 19235, 19236], dtype=int64)

In [9]:
# First row of each of the two tables that are merged next.
print(app_event.head(1), events.head(1), sep='\n')

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
                  device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [10]:
# Attach the owning device_id to every app_event row. `events` is indexed by
# event_id, so its index is matched against app_event's event_id column.
installed_app = events[['device_id']].merge(
    app_event[['event_id', 'app', 'is_installed']],
    how='right',
    left_index=True,
    right_on='event_id',
)

In [11]:
print(installed_app.head(1))
# Number of rows per (device_id, app) pair, kept as a one-column frame named 'size'.
installed_app_grouped = (
    installed_app
    .groupby(['device_id', 'app'])['app']
    .size()
    .to_frame('size')
)

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1


In [12]:
# Attach each split's int_index to the per-(device, app) counts by joining on the indexes.
# NOTE(review): the recorded output of later cells suggests devices without any
# events do not appear in the result despite how='right' - confirm.
installed_app_train = installed_app_grouped.merge(
    gender_age_train[['int_index']],
    how='right',
    left_index=True,
    right_index=True,
)
installed_app_test = installed_app_grouped.merge(
    gender_age_test[['int_index']],
    how='right',
    left_index=True,
    right_index=True,
)

In [13]:
# Move the index back into ordinary columns so it can be used as coordinates.
# NOTE: both names are rebound here, so re-running this cell resets the index again.
installed_app_train, installed_app_test = (
    installed_app_train.reset_index(drop=False),
    installed_app_test.reset_index(drop=False),
)
print(installed_app_train.head(), installed_app_test.head(), sep='\n')

             device_id   app  size  int_index
0 -9222956879900151005   548    18      21594
1 -9222956879900151005  1096    18      21594
2 -9222956879900151005  1248    26      21594
3 -9222956879900151005  1545    12      21594
4 -9222956879900151005  1664    18      21594
             device_id    app  size  int_index
0 -9222661944218806987   1867     3      13612
1 -9222661944218806987   7519     8      13612
2 -9222661944218806987   7843     1      13612
3 -9222661944218806987   8704     4      13612
4 -9222661944218806987  10000     1      13612


Calculate the number of unique apps:

In [14]:
# Count the distinct app codes once and reuse the value for the printout.
appnumber = len(installed_app['app'].unique())
print('The number of unique apps:')
print(appnumber)

The number of unique apps:
19237


In [15]:
# Number of devices in the test split.
len(gender_age_test)

112071

In [16]:
# Sanity checks: sorted app codes seen in train, and the number of test rows.
print(np.unique(installed_app_train['app'].values))
print(len(installed_app_test))

[    0     1     2 ..., 19234 19235 19236]
1387337


In [17]:
# Binary device-by-app matrix: row = device (int_index), column = app code,
# value 1 for every (device, app) pair present in the merged frame.
# An earlier variant stored the per-pair event count (`size`) instead of 1;
# it was disabled with the remark "seems wrong".
X_train_installed = csr_matrix(
    (
        np.ones(len(installed_app_train)),
        (installed_app_train['int_index'], installed_app_train['app']),
    ),
    shape=(len(gender_age_train), appnumber),
)
X_test_installed = csr_matrix(
    (
        np.ones(len(installed_app_test)),
        (installed_app_test['int_index'], installed_app_test['app']),
    ),
    shape=(len(gender_age_test), appnumber),
)
print('X_train_installed shape:', X_train_installed.shape)
print('X_test_installed shape:', X_test_installed.shape)

X_train_installed shape: (74645, 19237)
X_test_installed shape: (112071, 19237)


In [18]:
# Sorted distinct row ids that have at least one app entry in the test split.
np.unique(installed_app_test['int_index'].values)

array([     0,      1,      2, ..., 112059, 112060, 112063], dtype=int64)

# Feature engineering III: phone device model

Encode the phone device model as an integer and add it as a feature to the training and testing sets.

In [19]:
# Build one key per device by concatenating brand and model name (no separator),
# then integer-encode that key.
brand_model = device['phone_brand'].str.cat(others=device['device_model'])
encoder3 = LabelEncoder()
device['model'] = encoder3.fit_transform(brand_model)

# Assigning a Series aligns on the shared device_id index.
gender_age_train['model'] = device['model']
gender_age_test['model'] = device['model']

Create sparse matrix with each row representing one device and each column representing one model.

In [20]:
# One-hot device-model matrix: row = device (int_index), column = model code.
# The shape is passed explicitly so train and test always share the same
# number of columns instead of each inferring it from its own largest code.
modelnumber = len(encoder3.classes_)
X_train_model = csr_matrix(
    (np.ones(gender_age_train.shape[0]), (gender_age_train.int_index, gender_age_train.model)),
    shape=(gender_age_train.shape[0], modelnumber))
X_test_model = csr_matrix(
    (np.ones(gender_age_test.shape[0]), (gender_age_test.int_index, gender_age_test.model)),
    shape=(gender_age_test.shape[0], modelnumber))
# The labels now name the matrices actually printed (they previously said "brand").
print('X_train_model shape:',X_train_model.shape)
print('X_test_model shape:',X_test_model.shape)

X_train_brand shape: (74645, 1667)
X_test_brand shape: (112071, 1667)


# Feature engineering IV: app label

In [21]:
# First row of the two tables that share the app_id key.
print(
    app_event[['app_id', 'event_id']].head(1),
    app_label[['app_id', 'label_id']].head(1),
    sep='\n',
)

                app_id  event_id
0  5927333115845830913         2
                app_id  label_id
0  7324884708820027918       251


In [22]:
# Keep only labels of apps that occur in app_event, since encoder2 was fitted
# on those app ids. The explicit .copy() makes the filtered frame independent,
# so the column assignments below do not raise SettingWithCopyWarning.
app_label = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()
# Reuse the app encoding from app_event so both tables share the same app codes.
app_label['app'] = encoder2.transform(app_label.app_id)
# Integer-encode the label ids; the number of classes is the label matrix width.
encoder4 = LabelEncoder().fit(app_label.label_id)
app_label['label'] = encoder4.transform(app_label.label_id)
labelnumber = len(encoder4.classes_)

In [23]:
# Element counts (rows x columns) and a one-row preview of both merge inputs.
print(
    app_label.size,
    installed_app.size,
    app_label.head(1),
    installed_app_grouped.head(1),
    sep='\n',
)
# Attach labels to every (device, app) row; with no explicit key the merge
# joins on the columns the two frames have in common.
# NOTE: this rebinds installed_app_grouped, so the cell is not safe to run twice.
installed_app_grouped = pd.merge(
    installed_app_grouped.reset_index(),
    app_label[['app', 'label']],
    how='left',
)

312816
129892268
                app_id  label_id    app  label
0  7324884708820027918       251  17355    207
                          size
device_id            app      
-9222956879900151005 548    18


In [24]:
# Attach each split's int_index to the labelled (device, app) rows by matching
# the device_id column against the device_id index of the split.
label_app_train = installed_app_grouped.merge(
    gender_age_train[['int_index']],
    how='right',
    left_on='device_id',
    right_index=True,
)
label_app_test = installed_app_grouped.merge(
    gender_age_test[['int_index']],
    how='right',
    left_on='device_id',
    right_index=True,
)

In [25]:
# Keep only rows that actually carry a label.
label_app_train = label_app_train[label_app_train['label'].notnull()]
label_app_test = label_app_test[label_app_test['label'].notnull()]

In [26]:
# Device-by-label matrix: row = device (int_index), column = label code.
# `label` went through NaN-producing merges upstream, so it may have been
# upcast to float; cast it back to int before using it as a column index.
# Repeated (row, column) pairs are summed when the CSR matrix is built, so a
# cell holds the number of rows for that device/label pair.
X_train_label = csr_matrix(
    (np.ones(label_app_train.shape[0]),
     (label_app_train.int_index, label_app_train.label.astype(int))),
    shape=(gender_age_train.shape[0], labelnumber))
X_test_label = csr_matrix(
    (np.ones(label_app_test.shape[0]),
     (label_app_test.int_index, label_app_test.label.astype(int))),
    shape=(gender_age_test.shape[0], labelnumber))
# The labels now name the matrices actually printed (they previously said "installed").
print('X_train_label shape:',X_train_label.shape)
print('X_test_label shape:',X_test_label.shape)

X_train_installed shape: (74645, 492)
X_test_installed shape: (112071, 492)


# Model training and fitting

In [34]:
# Final feature matrix: installed-app columns followed by app-label columns.
# (An earlier variant used the label matrix on its own.)
X_train_total = hstack([X_train_installed, X_train_label], format='csr')
X_test_total = hstack([X_test_installed, X_test_label], format='csr')
print('Training shape:', X_train_total.shape, sep='\n')
print('Testing shape:', X_test_total.shape, sep='\n')

Training shape:
(74645, 19729)
Testing shape:
(112071, 19729)


In [35]:
# Integer-encode the demographic group; the encoder is kept so the class names
# can be recovered later.
targetencoder = LabelEncoder()
y_train_total = targetencoder.fit_transform(gender_age_train['group'])
target_len = len(targetencoder.classes_)
print(target_len)

12


In [36]:
# Hold out 40% of the labelled devices for validation. A fixed random_state
# makes the split, and therefore the reported metrics, reproducible across runs.
X_train,X_test,y_train,y_test = cv.train_test_split(X_train_total,y_train_total,test_size = 0.4,random_state = 42)

In [37]:
# 300-tree random forest using all available cores. The fixed random_state
# makes the fitted forest, and hence its predictions, reproducible.
model = rfc(n_estimators=300,verbose=1,n_jobs=-1,random_state=42)

In [None]:
# Fit on the training part of the hold-out split.
model.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s


In [None]:
# Class probabilities for the held-out rows; show the second row as a spot check.
y_pred = model.predict_proba(X_test)
print(y_pred[1])

In [None]:
# Multi-class log loss of the predicted probabilities on the held-out rows.
log_loss(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean accuracy on the held-out rows.
model.score(X=X_test, y=y_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.6s finished


0.16856453881706746

In [None]:
# Refit on every labelled device, then predict class probabilities for the test devices.
model.fit(X=X_train_total, y=y_train_total)
# NOTE(review): columns are named from targetencoder.classes_; this assumes the
# model's class order matches the encoder's - confirm.
pred = pd.DataFrame(
    data=model.predict_proba(X_test_total),
    index=gender_age_test.index,
    columns=targetencoder.classes_,
)
pred.head()

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.8s


In [70]:
# Write the per-device class probabilities, keeping device_id as the first column.
pred.to_csv(path_or_buf='rf_submission_v2.csv', index=True)

# Feature engineering III: installed app label

In [19]:
# Join every app event with every label of its app.
# NOTE: the recorded run of this cell ended in MemoryError.
installed_label = app_event.merge(app_label, on='app_id')

MemoryError: 

In [None]:
# The raw app_id column is no longer needed after the join.
installed_label = installed_label.drop('app_id', axis=1)

In [58]:
# Non-null value count of every column, per app_id.
app_event.groupby(by='app_id').count()

Unnamed: 0_level_0,event_id,is_installed,is_active
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9221156934682287334,21,21,21
-9220899153371182692,25,25,25
-9218487885271516150,2,2,2
-9218487885267037129,6,6,6
-9218310540360546691,38,38,38
-9217104312935103667,38,38,38
-9216716044975227433,1777,1777,1777
-9216547119863430601,352,352,352
-9216245512488580977,113,113,113
-9215708428016154392,14,14,14


In [59]:
# NOTE(review): `app_event_new` is not assigned in any cell shown in this
# notebook, so this line depends on leftover kernel state and will raise
# NameError on a fresh kernel - confirm where it was meant to be created.
app_event_new.head(1)

Unnamed: 0,event_id,is_installed,is_active,label_id
0,2,1,1,549


In [None]:
# Number of events recorded for one specific device. `events` was loaded with
# event_id as its index, so there is no `event_id` column to call .count() on;
# summing the boolean match mask gives the number of matching rows directly.
print((events.device_id == 1186608308763918427).sum())
# Distinct phone brands present in the device table.
device.loc[:,'phone_brand'].unique()