In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack

In [2]:
events = pd.read_csv('input/events.csv')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')
device = pd.read_csv('input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
gender_age_train = pd.read_csv('input/gender_age_train.csv',index_col = 'device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv',index_col = 'device_id')

For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [3]:
gender_age_train['int_index'] = np.arange(gender_age_train.shape[0])
gender_age_test['int_index'] = np.arange(gender_age_test.shape[0])

In [4]:
print('gender_age_train\n',gender_age_train.head(1))
print('\ngender_age_test\n',gender_age_test.head(1))
print('\ndevice\n',device.head(1))
print('\nevents\n',events.head(1))
print('\nlabel\n',label.head(1))
print('\napp_event\n',app_event.head(1))
print('\napp_label\n',app_label.head(1))

gender_age_train
                      gender  age   group  int_index
device_id                                          
-8076087639492063270      M   35  M32-38          0

gender_age_test
                      int_index
device_id                     
1002079943728939269          0

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
    event_id          device_id            timestamp  longitude  latitude
0         1  29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [5]:
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)
gender_age_train['brand'] = device['brand']
gender_age_test['brand'] = device['brand']

Create sparse matrix with each row representing one device and each column representing one brand.

In [6]:
X_train_brand = csr_matrix((np.ones(gender_age_train.shape[0]),(gender_age_train.int_index,gender_age_train.brand)))
X_test_brand = csr_matrix((np.ones(gender_age_test.shape[0]),(gender_age_test.int_index,gender_age_test.brand)))
print('X_train_brand shape:',X_train_brand.shape)
print('X_test_brand shape:',X_test_brand.shape)

X_train_brand shape: (74645, 131)
X_test_brand shape: (112071, 131)


# Feature engineering II: Installed app

Encoding the app_id and store it into app column, and feed this as a feature into the training and testing test.

In [7]:
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [8]:
np.sort(app_event.app.unique())

array([    0,     1,     2, ..., 19234, 19235, 19236], dtype=int64)

In [9]:
print(app_event.head(1))
print(events.head(1))

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
   event_id          device_id            timestamp  longitude  latitude
0         1  29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [10]:
installed_app = pd.merge(events[['event_id','device_id']],app_event[['event_id','app','is_installed']])

In [11]:
installed_app_train = pd.merge(installed_app,gender_age_train[['int_index']],right_index=True,left_on = 'device_id')
installed_app_test = pd.merge(installed_app,gender_age_test[['int_index']],right_index=True,left_on = 'device_id')

In [12]:
print(installed_app_train.head())
print(installed_app_test.head())

    event_id            device_id    app  is_installed  int_index
19         6  1476664663289716375   1819             1      51154
20         6  1476664663289716375  15784             1      51154
21         6  1476664663289716375  15408             1      51154
22         6  1476664663289716375  13094             1      51154
23         6  1476664663289716375   5733             1      51154
   event_id            device_id    app  is_installed  int_index
0         2 -6401643145415154744  15408             1      68691
1         2 -6401643145415154744   3384             1      68691
2         2 -6401643145415154744   7620             1      68691
3         2 -6401643145415154744   8902             1      68691
4         2 -6401643145415154744  18686             1      68691


Calculate the number of unique app:

In [13]:
print('The number of unique apps:')
print(np.size(installed_app.app.unique()))
appnumber = np.size(installed_app.app.unique())

The number of unique apps:
19237


In [14]:
gender_age_test.shape[0]

112071

In [15]:
print(np.sort(installed_app_train.app.unique()))
print(installed_app_test.shape[0])

[    0     1     2 ..., 19234 19235 19236]
19563071


In [16]:
X_train_installed = csr_matrix((np.ones(installed_app_train.shape[0]),(installed_app_train.int_index,installed_app_train.app)),
                              shape = (gender_age_train.shape[0],appnumber))
X_test_installed = csr_matrix((np.ones(installed_app_test.shape[0]),(installed_app_test.int_index,installed_app_test.app)),
                             shape = (gender_age_test.shape[0],appnumber))
print('X_train_installed shape:',X_train_installed.shape)
print('X_test_installed shape:',X_test_installed.shape)

X_train_installed shape: (74645, 19237)
X_test_installed shape: (112071, 19237)


In [17]:
np.sort(installed_app_test.int_index.unique())

array([     0,      1,      2, ..., 112059, 112060, 112063], dtype=int64)

# Feature engineering III: phone device model

Encoding the phone device model, and feed this as a feature into the training and testing test.

In [71]:
brand_model = device.phone_brand.str.cat(device.device_model)
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)
gender_age_train['model'] = device['model']
gender_age_test['model'] = device['model']

Create sparse matrix with each row representing one device and each column representing one model.

In [72]:
X_train_model = csr_matrix((np.ones(gender_age_train.shape[0]),(gender_age_train.int_index,gender_age_train.model)))
X_test_model = csr_matrix((np.ones(gender_age_test.shape[0]),(gender_age_test.int_index,gender_age_test.model)))
print('X_train_brand shape:',X_train_model.shape)
print('X_test_brand shape:',X_test_model.shape)

X_train_brand shape: (74645, 1667)
X_test_brand shape: (112071, 1667)


# Feature engineering IV: app label

# Model training and fitting

In [88]:
X_train_total = X_train_installed
X_test_total = X_test_installed
#X_train_total = hstack((X_train_installed,X_train_model),format='csr')
#X_test_total = hstack((X_test_installed,X_test_model),format='csr')
print('Training shape:')
print(X_train_total.shape)
print('Testing shape:')
print(X_test_total.shape)

Training shape:
(74645, 19237)
Testing shape:
(112071, 19237)


In [89]:
targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)
print(target_len)

12


In [90]:
X_train,X_test,y_train,y_test = cv.train_test_split(X_train_total,y_train_total,test_size = 0.4)

In [91]:
model = rfc(n_estimators=300,verbose=1,n_jobs=-1)

In [92]:
model.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [93]:
y_pred = model.predict_proba(X_test)
print(y_pred[1,:])

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    1.0s finished


[ 0.06        0.06        0.03333333  0.10333333  0.14333333  0.15333333
  0.04666667  0.08        0.04666667  0.07333333  0.07666667  0.12333333]


In [94]:
log_loss(y_test,y_pred)

2.349907635400081

In [95]:
model.score(X_test,y_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.9s finished


0.17261705405586442

In [97]:
model.fit(X_train_total,y_train_total)
pred = pd.DataFrame(model.predict_proba(X_test_total), index = gender_age_test.index, columns=targetencoder.classes_)
pred.head()

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.7s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    4.7s finished


Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.006667,0.026667,0.033333,0.063333,0.096667,0.076667,0.03,0.073333,0.073333,0.113333,0.21,0.196667
-1547860181818787117,0.013333,0.023333,0.016667,0.046667,0.116667,0.056667,0.016667,0.056667,0.076667,0.096667,0.23,0.25
7374582448058474277,0.053333,0.063333,0.036667,0.12,0.08,0.126667,0.056667,0.076667,0.036667,0.07,0.043333,0.236667
-6220210354783429585,0.036667,0.05,0.03,0.03,0.09,0.073333,0.066667,0.12,0.063333,0.073333,0.12,0.246667
-5893464122623104785,0.07177,0.059447,0.042016,0.063017,0.072494,0.055324,0.108757,0.132979,0.073415,0.095202,0.119626,0.105952


In [98]:
pred.to_csv('rf_submission_v2.csv',index=True)

# Feature engineering III: installed app label

In [19]:
installed_label = pd.merge(app_event,app_label,on='app_id')

MemoryError: 

In [None]:
installed_label = installed_label.drop(['app_id'],axis=1)

In [58]:
app_event.groupby('app_id').count()

Unnamed: 0_level_0,event_id,is_installed,is_active
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9221156934682287334,21,21,21
-9220899153371182692,25,25,25
-9218487885271516150,2,2,2
-9218487885267037129,6,6,6
-9218310540360546691,38,38,38
-9217104312935103667,38,38,38
-9216716044975227433,1777,1777,1777
-9216547119863430601,352,352,352
-9216245512488580977,113,113,113
-9215708428016154392,14,14,14


In [59]:
app_event_new.head(1)

Unnamed: 0,event_id,is_installed,is_active,label_id
0,2,1,1,549


In [None]:
print(events.loc[events.device_id==1186608308763918427,:].event_id.count())
device.loc[:,'phone_brand'].unique()