**Based on:** 
https://www.kaggle.com/anokas/talkingdata-mobile-user-demographics/sparse-xgboost-starter-2-26857/code
https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels

- Use only the first word of the phone model (combined with the brand) to reduce the number of distinct models.
- Add event counts split by weekend and weekday.
- Add log(number of events) for each time-of-day (tod) bin.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [None]:
# function to bin the timestamp in time of day

def bintod(x):
    """Map an hour-of-day value to a coarse time-of-day bin.

    Bins: 0 for values below 3 and for 23 and above (both ends of the
    night share one bin), 1 for [3, 8), 2 for [8, 20), 3 for [20, 23).
    Values that compare false against every bound (e.g. NaN) also get 0.
    """
    for upper_bound, tod_bin in ((3, 0), (8, 1), (20, 2), (23, 3)):
        if x < upper_bound:
            return tod_bin
    # Nothing matched: late evening wraps into the same bin as early morning.
    return 0
# functions to assign region based on latitude and longitude
def lngregion(x):
    """Return the longitude region code for longitude ``x``.

    Regions 1-7 are consecutive bands whose upper bounds are 80, 90, ...,
    140 (region 1 covers everything below 80). Anything else, including
    values of 140 and above and NaN, maps to region 0.
    """
    upper_bounds = (80, 90, 100, 110, 120, 130, 140)
    for region, upper in enumerate(upper_bounds, start=1):
        if x < upper:
            return region
    return 0
# Number of distinct longitude region codes (0-7) and latitude region
# codes (0-8); used as the column counts of the region feature matrices.
nlng, nlat = 8, 9
def latregion(x):
    """Return the latitude region code for latitude ``x``.

    Regions 1-8 are consecutive bands whose upper bounds are 20, 25, ...,
    55 (region 1 covers everything below 20). Anything else, including
    values of 55 and above and NaN, maps to region 0.
    """
    upper_bounds = (20, 25, 30, 35, 40, 45, 50, 55)
    for region, upper in enumerate(upper_bounds, start=1):
        if x < upper:
            return region
    return 0

In [None]:
# Directory holding the input CSV files.
datadir = '../input'


def _read_input(filename, **read_csv_kwargs):
    """Read one CSV file from ``datadir``, forwarding options to ``pd.read_csv``."""
    return pd.read_csv(os.path.join(datadir, filename), **read_csv_kwargs)


# Train / test device tables, indexed by device_id.
gatrain = _read_input('gender_age_train.csv', index_col='device_id')
gatest = _read_input('gender_age_test.csv', index_col='device_id')

# Phone table: keep only the first row for each device_id, then index by it.
phone = _read_input('phone_brand_device_model.csv')
phone = phone.drop_duplicates('device_id', keep='first').set_index('device_id')

# Events (indexed by event_id, timestamps parsed) and the apps seen per event.
events = _read_input('events.csv', parse_dates=['timestamp'], index_col='event_id')
appevents = _read_input('app_events.csv',
                        usecols=['event_id', 'app_id', 'is_active'],
                        dtype={'is_active': bool})

# App -> label mapping and the label categories; a missing category name
# gets an explicit placeholder value.
applabels = _read_input('app_labels.csv')
labelcat = _read_input('label_categories.csv')
labelcat['category'] = labelcat['category'].fillna('label-missing')
labelcat.head()

In [None]:
# Positional row numbers; they serve as the row indices of the sparse
# feature matrices built for the train and test sets.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

In [None]:
# Clean the event coordinates: round to whole degrees, blank out everything
# outside a rough bounding box for China (this also removes the 0 values that
# stand for missing coordinates), then give every event its device's most
# common latitude / longitude.
def _inside_or_nan(values, lower, upper):
    """Return ``values`` with entries outside the open range (lower, upper) set to NaN."""
    return values.where((values > lower) & (values < upper))


def _most_common(values):
    """Return the most frequent non-missing value of ``values``, or NaN if there is none."""
    modes = values.mode()
    # mode() can come back empty (no usable value) or with several tied
    # values; transform needs exactly one scalar to broadcast over the group.
    return modes.iloc[0] if len(modes) else np.nan


events['longitude'] = events['longitude'].round(0)
events['latitude'] = events['latitude'].round(0)

# Bounds are exclusive: the boundary values themselves (73 / 135 for
# longitude, 15 / 60 for latitude) are treated as missing as well.
events['longitude'] = _inside_or_nan(events['longitude'], 73.0, 135.0)
events['latitude'] = _inside_or_nan(events['latitude'], 15.0, 60.0)

# Many coordinates are missing, so represent each device by its most
# common latitude and longitude, repeated on every one of its event rows.
events['latitude2'] = events.groupby(['device_id'])['latitude'].transform(_most_common)
events['longitude2'] = events.groupby(['device_id'])['longitude'].transform(_most_common)

In [None]:
# Collapse the events to one row per device. latitude2 / longitude2 carry
# the per-device value on every event row, so the first row is enough.
events_latlng = (events.loc[:, ['device_id', 'latitude2', 'longitude2']]
                       .drop_duplicates('device_id', keep='first')
                       .set_index('device_id'))
print('Number of devices with some lat long info', len(events_latlng['latitude2']))
print('out of that missing longitude: ', events_latlng['longitude2'].isnull().sum())
print('out of that missing latitude: ', events_latlng['latitude2'].isnull().sum())

# Bucket the coordinates into region codes (missing coordinates -> region 0).
events_latlng['lng_region'] = events_latlng['longitude2'].map(lngregion)
events_latlng['lat_region'] = events_latlng['latitude2'].map(latregion)

print('Frequencies longitude region:\n', events_latlng['lng_region'].value_counts())
print('Frequencies latitude region:\n', events_latlng['lat_region'].value_counts())

In [None]:
# Longitude-region feature: look up each device's region by device_id;
# devices with no entry in events_latlng get region 0.
gatrain['lng_region'] = events_latlng['lng_region'].reindex(gatrain.index).fillna(0.0)
gatest['lng_region'] = events_latlng['lng_region'].reindex(gatest.index).fillna(0.0)

# One-hot encode: a single 1 per device, in the column of its region code.
Xtr_lng = csr_matrix(
    (np.ones(len(gatrain)), (gatrain['trainrow'], gatrain['lng_region'])),
    shape=(len(gatrain), nlng))
Xte_lng = csr_matrix(
    (np.ones(len(gatest)), (gatest['testrow'], gatest['lng_region'])),
    shape=(len(gatest), nlng))
print('Longitude features: train shape {}, test shape {}'.format(Xtr_lng.shape, Xte_lng.shape))

In [None]:
# Latitude-region feature: look up each device's region by device_id;
# devices with no entry in events_latlng get region 0.
gatrain['lat_region'] = events_latlng['lat_region'].reindex(gatrain.index).fillna(0.0)
gatest['lat_region'] = events_latlng['lat_region'].reindex(gatest.index).fillna(0.0)

# One-hot encode: a single 1 per device, in the column of its region code.
Xtr_lat = csr_matrix(
    (np.ones(len(gatrain)), (gatrain['trainrow'], gatrain['lat_region'])),
    shape=(len(gatrain), nlat))
Xte_lat = csr_matrix(
    (np.ones(len(gatest)), (gatest['testrow'], gatest['lat_region'])),
    shape=(len(gatest), nlat))
print('Latitude features: train shape {}, test shape {}'.format(Xtr_lat.shape, Xte_lat.shape))

In [None]:
# Shift every timestamp by 4 minutes per degree of longitude relative to
# 116.41 (the reference used for the Beijing time zone), so devices further
# west get an earlier adjusted time: business hours in the west start later.
events['timeadj'] = (events['longitude2'] - 116.41) * 4  # offset in minutes
# Devices without a usable longitude get no adjustment; astype(int)
# truncates the offset to whole minutes.
events['timeadj'] = events['timeadj'].fillna(0.0).astype(int)
events['timestamp2'] = events['timestamp'] + pd.to_timedelta(events['timeadj'], unit='m')

# Hour of day from the adjusted timestamp (vectorised accessor instead of a
# per-row Python call), then the coarse time-of-day bin.
events['todh'] = events['timestamp2'].dt.hour
events['tod'] = events['todh'].apply(bintod)

ntod = 4  # bintod produces bins 0-3
# Number of events per (device, time-of-day bin), with the train / test row
# number of the device attached (NaN when the device is not in that set).
eventod = (events.groupby(['device_id','tod'])['tod'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left',left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left',left_index=True, right_index=True)
                       .reset_index())
eventod.head()

In [None]:
# Feature value is log(count + 1) of the events in each time-of-day bin.
eventod['size'] = [np.log(n_events + 1) for n_events in eventod['size']]

# Rows that belong to train devices -> train matrix (one column per bin).
t = eventod.dropna(subset=['trainrow'])
Xtr_tod = csr_matrix((t['size'].values, (t['trainrow'], t['tod'])),
                     shape=(len(gatrain), ntod))
# Rows that belong to test devices -> test matrix.
t = eventod.dropna(subset=['testrow'])
Xte_tod = csr_matrix((t['size'].values, (t['testrow'], t['tod'])),
                     shape=(len(gatest), ntod))
print('TOD data: train shape {}, test shape {}'.format(Xtr_tod.shape, Xte_tod.shape))

In [None]:
# Flag every event as weekday (0) or weekend (1). dayofweek runs from 0
# (Monday) to 6 (Sunday), so values above 4 are Saturday and Sunday.
# NOTE(review): this uses the raw timestamp, not the longitude-adjusted
# timestamp2 used for the time-of-day bins - confirm that is intended.
events['dow'] = events['timestamp'].dt.dayofweek
events['wkend'] = (events['dow'] > 4).astype(int)

ndow = 2  # two columns: weekday, weekend
# Number of events per (device, weekend flag), with the train / test row
# number of the device attached (NaN when the device is not in that set).
evendow = (events.groupby(['device_id','wkend'])['wkend'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left',left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left',left_index=True, right_index=True)
                       .reset_index())
evendow.head()

In [None]:
# Feature value is log(count + 1) of the weekday / weekend event counts.
evendow['size'] = evendow['size'].map(lambda x: np.log(x+1))

# Rows that belong to train devices -> train matrix (weekday, weekend columns).
t = evendow.dropna(subset=['trainrow'])
Xtr_dow = csr_matrix((t['size'].values, (t.trainrow, t.wkend)), 
                      shape=(gatrain.shape[0],ndow))
# Rows that belong to test devices -> test matrix.
t = evendow.dropna(subset=['testrow'])
Xte_dow = csr_matrix((t['size'].values, (t.testrow, t.wkend)), 
                      shape=(gatest.shape[0],ndow))
# These are the day-of-week matrices, so label them as such in the log.
print('DOW data: train shape {}, test shape {}'.format(Xtr_dow.shape, Xte_dow.shape))

In [None]:
# Encode phone brands as integer codes and one-hot them per device.
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
# Aligned on device_id.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# Pin the column count to the number of known brands. Without an explicit
# shape the width is inferred from the largest code present, so train and
# test could end up with different numbers of columns.
nbrands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.brand)),
                       shape=(gatrain.shape[0], nbrands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.brand)),
                       shape=(gatest.shape[0], nbrands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

In [None]:
# Model feature: brand name concatenated with the first word of the device
# model, which yields fewer distinct values than the full model name.
m = phone.phone_brand.str.cat(phone.device_model.str.split().str.get(0))
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
# Aligned on device_id.
gatrain['model'] = phone['model']
gatest['model'] = phone['model']

# Pin the column count to the number of known models. Without an explicit
# shape the width is inferred from the largest code present, so train and
# test could end up with different numbers of columns.
nmodels = len(modelencoder.classes_)
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.model)),
                       shape=(gatrain.shape[0], nmodels))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.model)),
                       shape=(gatest.shape[0], nmodels))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

In [None]:
# Integer code for every app_id that occurs in app_events.
appencoder = LabelEncoder()
appencoder.fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)

# Attach the device of each event, count rows per (device, app), then add
# the device's train / test row number (NaN when it is not in that set).
deviceapps = appevents.merge(events[['device_id']], how='left',
                             left_on='event_id', right_index=True)
deviceapps = deviceapps.groupby(['device_id', 'app'])['app'].agg(['size'])
deviceapps = deviceapps.merge(gatrain[['trainrow']], how='left',
                              left_index=True, right_index=True)
deviceapps = deviceapps.merge(gatest[['testrow']], how='left',
                              left_index=True, right_index=True)
deviceapps = deviceapps.reset_index()

In [None]:
# Binary device-by-app matrices: 1 where the (device, app) pair occurs.
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(len(d)), (d['trainrow'], d['app'])),
                     shape=(len(gatrain), napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(len(d)), (d['testrow'], d['app'])),
                     shape=(len(gatest), napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

In [None]:
# Keep only the labels of apps that occur in app_events. The .copy() makes
# the filtered frame independent, so adding a column below does not write
# to a slice of the original frame.
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
applabels['app'] = appencoder.transform(applabels.app_id)

# Keep only the label categories still in use and encode them by category
# name (label ids that share a category name get the same code).
labelcat = labelcat.loc[labelcat.label_id.isin(applabels.label_id.unique())].copy()
labelencoder = LabelEncoder().fit(labelcat.category)
labelcat['label'] = labelencoder.transform(labelcat.category)
nlabels = len(labelencoder.classes_)

print('number of unique labels:',nlabels)
print('recoded label categories', '\n',labelcat.head(n=20))

# Attach the encoded label to every (app, label_id) row.
applabels=applabels.merge(labelcat[['label','label_id']],
                          how='left',left_on='label_id',right_on='label_id')

# Count rows per (device, label) over the apps seen on each device, then add
# the device's train / test row number (NaN when it is not in that set).
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])                
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

In [None]:
# Binary device-by-label matrices: 1 where the (device, label) pair occurs.
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(len(d)), (d['trainrow'], d['label'])),
                       shape=(len(gatrain), nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(len(d)), (d['testrow'], d['label'])),
                       shape=(len(gatest), nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

In [None]:
# Stack all feature groups side by side; train and test use the same order.
Xtrain = hstack(
    [
        Xtr_brand,
        Xtr_model,
        Xtr_app,
        Xtr_label,
        Xtr_tod,
        Xtr_dow,
        Xtr_lat,
        Xtr_lng,
    ],
    format='csr',
)
Xtest = hstack(
    [
        Xte_brand,
        Xte_model,
        Xte_app,
        Xte_label,
        Xte_tod,
        Xte_dow,
        Xte_lat,
        Xte_lng,
    ],
    format='csr',
)
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

In [None]:
# Integer-encode the target column `group`; the encoder is kept so that
# its classes_ can name the prediction columns.
targetencoder = LabelEncoder()
targetencoder.fit(gatrain['group'])
y = targetencoder.transform(gatrain['group'])

In [None]:
########## XGBOOST ##########

# Training parameters handed to xgb.train: linear booster producing
# per-class probabilities for 12 classes, evaluated with multi-class log loss.
params = {
    'booster': 'gblinear',
    'objective': "multi:softprob",
    'eval_metric': 'mlogloss',
    'eta': 0.005,
    'num_class': 12,
    'lambda': 3,
    'alpha': 2,
}

In [None]:
# Hold out the first of 10 shuffled, stratified folds (about 10% of the
# training rows) as the validation set. kf is (train indices, validation indices).
kf = next(iter(StratifiedKFold(y, n_folds=10, shuffle=True, random_state=4242)))

Xtr, ytr = Xtrain[kf[0], :], y[kf[0]]
Xte, yte = Xtrain[kf[1], :], y[kf[1]]

print('Training set: ' + str(Xtr.shape))
print('Validation set: ' + str(Xte.shape))

# xgboost data containers and the evaluation sets reported during training.
d_train = xgb.DMatrix(Xtr, label=ytr)
d_valid = xgb.DMatrix(Xte, label=yte)
watchlist = [(d_train, 'train'), (d_valid, 'eval')]

In [None]:
# Train for up to 1000 rounds, stopping early once the last watchlist entry
# has not improved for 25 rounds.
clf = xgb.train(params, d_train, num_boost_round=1000, evals=watchlist,
                early_stopping_rounds=25)

# Class probabilities for the test devices: one row per device_id, one
# column per target class, written out as the submission file.
pred = pd.DataFrame(clf.predict(xgb.DMatrix(Xtest)),
                    index=gatest.index,
                    columns=targetencoder.classes_)
pred.head()
pred.to_csv('sparse_xgb_v11.csv', index=True)