In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from scipy.sparse import csr_matrix,hstack

In [2]:
# Load the raw CSV tables from the local `input/` directory.
events = pd.read_csv('input/events.csv')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')

# Device table: keep only the first row seen for each device_id, then key
# the frame by device_id so it can be aligned with the train/test frames.
device = (
    pd.read_csv('input/phone_brand_device_model.csv')
    .drop_duplicates('device_id')
    .set_index('device_id')
)

# Train / test device lists, both indexed by device_id.
gender_age_train = pd.read_csv('input/gender_age_train.csv', index_col='device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv', index_col='device_id')

For convenience when building the sparse matrices, we add a sequential integer index to both the training and test sets.

In [36]:
# Give every device a positional row number (0 .. n-1) alongside its device_id index.
gender_age_train['int_index'] = np.arange(len(gender_age_train))
gender_age_test['int_index'] = np.arange(len(gender_age_test))

In [37]:
# Print the first row of every loaded table under a heading.
# The first heading has no leading newline; the rest are separated by one.
for preview_title, preview_frame in [
    ('gender_age_train\n', gender_age_train),
    ('\ngender_age_test\n', gender_age_test),
    ('\ndevice\n', device),
    ('\nevents\n', events),
    ('\nlabel\n', label),
    ('\napp_event\n', app_event),
    ('\napp_label\n', app_label),
]:
    print(preview_title, preview_frame.head(1))

gender_age_train
                      gender  age   group  brand  int_index
device_id                                                 
-8076087639492063270      M   35  M32-38     51          0

gender_age_test
                      brand  int_index
device_id                            
1002079943728939269     51          0

device
                      phone_brand device_model  brand
device_id                                           
-8890648629457979026          小米           红米     51

events
    event_id          device_id            timestamp  longitude  latitude
0         1  29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Feature engineering I: phone brand

Encode the phone brand as an integer label and add it as a feature to the training and test sets.

In [24]:
# Map each distinct phone_brand string to an integer code and store it on
# the device table.
encoder = LabelEncoder()
device['brand'] = encoder.fit_transform(device['phone_brand'])

# Copy the brand code onto the train/test frames. The assignment aligns on
# the shared device_id index, so each device receives its own brand code.
gender_age_train['brand'] = device['brand']
gender_age_test['brand'] = device['brand']

Create a sparse matrix in which each row represents one device and each column represents one brand (a one-hot encoding of the brand).

In [48]:
# One-hot encode the phone brand: row = device (int_index), column = brand code.
#
# The shape is passed explicitly. Without it, csr_matrix infers the number of
# columns from the largest brand code present in the given set, so the train
# and test matrices could end up with different widths if either set happens
# not to contain the highest-numbered brand.
n_brands = int(device['brand'].max()) + 1  # brand codes run from 0 to max

X_train_brand = csr_matrix(
    (np.ones(gender_age_train.shape[0]),
     (gender_age_train.int_index, gender_age_train.brand)),
    shape=(gender_age_train.shape[0], n_brands),
)
X_test_brand = csr_matrix(
    (np.ones(gender_age_test.shape[0]),
     (gender_age_test.int_index, gender_age_test.brand)),
    shape=(gender_age_test.shape[0], n_brands),
)
print('X_train_brand shape:',X_train_brand.shape)
print('X_test_brand shape:',X_test_brand.shape)

X_train_brand shape: (74645, 131)
X_test_brand shape: (112071, 131)


# Feature engineering II: app usage

In [4]:
# Join every app event with the labels of its app (default inner join on the
# columns the two frames have in common).
app_event_new = app_event.merge(app_label)

In [6]:
# app_id has served its purpose as the join key; remove the column.
app_event_new = app_event_new.drop(columns=['app_id'])

In [7]:
# Show the first row of the merged frame to check its columns.
app_event_new.head(n=1)

Unnamed: 0,event_id,is_installed,is_active,label_id
0,2,1,1,549


In [None]:
# Count the non-null event_id values logged for one specific device.
print(events.loc[events['device_id'] == 1186608308763918427, 'event_id'].count())
# List the distinct phone brands in the device table.
device['phone_brand'].unique()

In [67]:
# Show every app_event row that belongs to event_id 2.
app_event[app_event['event_id'] == 2]

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1
5,2,4775896950989639373,1,1
6,2,-8022267440849930066,1,0
7,2,9112463267739110219,1,0
8,2,-3725672010020973973,1,0
9,2,7167114343576723123,1,1


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406


In [53]:
# Show the first three rows of the label lookup table.
label.head(n=3)

Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
