In [1]:
import os
import sys
from os import path
import numpy as np
import pandas as pd
from scipy import sparse, io
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [3]:
# Directory holding the raw input files (CSV / NPZ) read by the cells below.
# Set the RAW_DATA_DIR environment variable to point elsewhere; the original
# author's location is kept as the default so existing runs are unaffected.
RAW_DATA_DIR = os.environ.get('RAW_DATA_DIR', 'C:\\Users\\RISHABH\\Documents\\input')

## Loading Data

In [4]:
def _read_raw(filename, **read_kwargs):
    """Read one CSV from RAW_DATA_DIR, forwarding extra options to pandas.read_csv."""
    return pd.read_csv(os.path.join(RAW_DATA_DIR, filename), **read_kwargs)


# Train / test device lists.
gatrain = _read_raw('gender_age_train.csv')
gatest = _read_raw('gender_age_test.csv')

# Keep a single phone record per device so later joins on device_id stay one-to-one.
phone = _read_raw('phone_brand_device_model.csv').drop_duplicates('device_id', keep='first')

# Events carry a timestamp column that is parsed into datetimes on load.
events = _read_raw('events.csv', parse_dates=['timestamp'], infer_datetime_format=True)

# The two flag columns are read as booleans.
appevents = _read_raw('app_events.csv', dtype={'is_installed': bool, 'is_active': bool})

applabels = _read_raw('app_labels.csv')
folds_5 = _read_raw('folds_5.csv')
folds_10 = _read_raw('folds_10.csv')

In [5]:
import scipy.sparse

# Sparse feature matrices previously saved as .npz files in RAW_DATA_DIR.
npz_train, npz_test = (os.path.join(RAW_DATA_DIR, name)
                       for name in ('Xtrain_all.npz', 'Xtest_all.npz'))
Xtrain = scipy.sparse.load_npz(npz_train)
Xtest = scipy.sparse.load_npz(npz_test)

In [6]:
# Encode the group names of the training devices as integer class ids.
label_group = LabelEncoder()
ytrain = label_group.fit(gatrain['group']).transform(gatrain['group'])

## Hyperparameter tuning for Logistic Regression

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# One-step pipeline; the 'classifier' step and its settings are supplied by the grid.
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

# L2-penalised logistic regression with 10 log-spaced values of C in [1e-3, 1e1].
lr_grid = {
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l2'],
    'classifier__C': np.logspace(-3, 1, 10),
    'classifier__solver': ['lbfgs'],
}
param_grid = [lr_grid]

# 5-fold grid search scored by negative log loss, using all available cores.
clf = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf = clf.fit(Xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.8min finished


In [26]:
# Parameter combination of the best-scoring candidate from the grid search.
best_clf.best_params_

{'classifier': LogisticRegression(C=0.021544346900318832, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 0.021544346900318832,
 'classifier__penalty': 'l2',
 'classifier__solver': 'lbfgs'}

In [27]:
# Cross-validated score of that candidate (scoring was 'neg_log_loss', so closer to 0 is better).
best_clf.best_score_

-2.2881861356011615

## Training and predicting with the best Logistic Regression model

In [35]:
from sklearn.linear_model import LogisticRegression

# Refit on the full training matrix with the (rounded) C chosen by the grid search.
lr = LogisticRegression(C=0.02154, penalty='l2', solver='lbfgs').fit(Xtrain, ytrain)

# Class-probability matrix for the test devices, one column per encoded group.
# The former np.zeros((n, 12)) pre-allocation was dead code: predict_proba
# returns a fresh array that replaced it immediately.
p_group = lr.predict_proba(Xtest)



In [46]:
# Fit an encoder on the raw group names only to recover the class names, in
# encoder order, for the probability columns.
lab = LabelEncoder()
lab.fit(gatrain['group'].astype(str))

group_names = list(lab.classes_)
pred_test = pd.DataFrame(p_group, index=gatest.device_id, columns=group_names).reset_index()

# One row per test device: device_id followed by one probability per group.
pred_test.to_csv('pred_test_lr.csv', index=False)

![title](Documents/input/lr.png)

## Hyperparameter tuning on Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree grid: min_samples_split in {2, 4, 6, 8}, max_depth in {6, 11, 16, 21}.
dt_grid = {
    'classifier': [DecisionTreeClassifier()],
    'classifier__min_samples_split': list(range(2, 10, 2)),
    'classifier__max_depth': list(range(6, 22, 5)),
}
param_grid = [dt_grid]

# Reuses the `pipe` object defined for the logistic-regression search; the grid
# swaps its 'classifier' step for a decision tree.
clf = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf = clf.fit(Xtrain, ytrain)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.2min finished


In [39]:
# Parameter combination of the best-scoring decision-tree candidate.
best_clf.best_params_

{'classifier': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=6,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 'classifier__max_depth': 6,
 'classifier__min_samples_split': 6}

In [42]:
# Cross-validated score of that candidate (scoring was 'neg_log_loss').
best_clf.best_score_

-2.4332835621797053

## Predicting on best model Dtree

In [47]:
# Refit a decision tree with the depth / split size selected by the grid search.
# NOTE: the model is still bound to the name `lr` (kept so the notebook's global
# names are unchanged) even though it is a DecisionTreeClassifier.
# The unused LogisticRegression import and the dead np.zeros pre-allocation
# that used to be in this cell were removed.
lr = DecisionTreeClassifier(max_depth=6, min_samples_split=6).fit(Xtrain, ytrain)

# Class-probability matrix for the test devices.
p_group = lr.predict_proba(Xtest)

In [48]:
# Label the probability columns with the group names held by `lab` and write the
# decision-tree predictions, one row per test device.
group_names = list(lab.classes_)
pred_test = pd.DataFrame(p_group, index=gatest.device_id, columns=group_names).reset_index()

pred_test.to_csv('pred_test_dt.csv', index=False)

![title](Documents/input/dt.png)

# XGB

# Making different models for devices with events and devices without events as the data will be significantly different 
## 1. devices with events

In [None]:
# Device ids that have at least one event.
s = events.device_id.unique()

# Create the has_events feature (1 if the device appears in events, else 0) for
# train and test. Series.isin builds a hash table once; the previous
# `apply(lambda x: 1 if x in s else 0)` rescanned the whole array for every row.
gatrain['has_events'] = gatrain.device_id.isin(s).astype(int)
gatest['has_events'] = gatest.device_id.isin(s).astype(int)

In [None]:
# Keep only devices with events. `.copy()` makes the subsets independent frames
# so the column assignments below do not write into a slice of the original
# (which triggers SettingWithCopyWarning).
gatrain = gatrain[gatrain['has_events'] == 1].copy()
gatest = gatest[gatest['has_events'] == 1].copy()

# Consecutive row numbers, used later as row coordinates of the sparse matrices.
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

### Merging data to access other variables

In [None]:
# Attach phone details to train and test; train additionally receives the
# columns of folds_5. All joins are on device_id (pandas default: inner join).
gatrain = gatrain.merge(phone, on='device_id').merge(folds_5, on='device_id')
gatest = gatest.merge(phone, on='device_id')

In [None]:
# One LabelEncoder instance is re-fitted for every column below, so after this
# cell `lab` only holds the classes of the last column fitted (device_model).
lab=LabelEncoder()   

# group is target class in train set converting to labels
gatrain['group']=lab.fit_transform(gatrain['group'].astype(str)) 

# gender is encoded the same way (string categories -> integer codes)
gatrain['gender']=lab.fit_transform(gatrain['gender'].astype(str)) 
'''appending phone brand in train and test to fit label encoder
    as the test contains phone brand which are not present in train 
    so not to get data leak we have to append them'''
# The encoder is fitted on train+test values so that transform() below does not
# fail on brands that occur only in the test set.
lab.fit(np.append(gatrain.phone_brand.values,gatest.phone_brand.values))  
#converting phone brand in test and  train
gatrain['phone_brand']=lab.transform(gatrain['phone_brand']) 
gatest['phone_brand']=lab.transform(gatest['phone_brand']) 
#similar to phone brand we do for device model.
lab.fit(np.append(gatrain['device_model'].values,gatest['device_model'].values)) 
gatrain['device_model']=lab.transform(gatrain['device_model']) 
gatest['device_model']=lab.transform(gatest['device_model'])

## I am trying a technique based on conditional probability: first predict gender, then use gender as an additional feature to predict the age group. For this, the 12 encoded groups are collapsed into six age buckets (encoded 0–5) shared by female and male: female groups keep their code, male groups are shifted down by 6.

## Creating feature age group for applying conditional probability

In [2]:
def getg(x):
    """Map an encoded group id to an age bucket that is shared by both genders.

    Args:
        x: mapping-like row exposing 'gender' and 'group' (e.g. a DataFrame row).

    Returns:
        x['group'] unchanged when x['gender'] == 0, otherwise x['group'] - 6.
    """
    # Rows with gender code 0 already use the low half of the ids; every other
    # row is shifted down by 6 so both genders share one bucket range.
    shift = 0 if x['gender'] == 0 else 6
    return x['group'] - shift

In [None]:
# Create the age_group feature: group is kept as-is where gender == 0 and
# shifted down by 6 otherwise. This is the vectorised form of applying getg to
# every row; it avoids a Python-level call per row.
gatrain['age_group'] = np.where(gatrain['gender'] == 0,
                                gatrain['group'],
                                gatrain['group'] - 6)

In [None]:
# Index the frames by their key column so the feature cells below can join on
# the index (right_index=True / left_index=True).
# NOTE: set_index moves the column into the index, so this cell cannot be
# re-run as-is on the already-indexed frames (the key column no longer exists).
events=events.set_index('event_id') 
gatrain=gatrain.set_index('device_id') 
gatest=gatest.set_index('device_id')

## App feature for device containing number of times app is opened 

In [None]:
# Encode every app_id as a consecutive integer; these become sparse-matrix columns.
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id) 
# Build one row per (device_id, app) pair:
#   1. attach device_id to every app event through its event_id,
#   2. count the rows per (device_id, app) -> column 'size',
#   3. left-join trainrow / testrow; a pair whose device is not in that set gets NaN there.
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps.head()

In [None]:
## Creating sparse matrix for app feature

In [None]:

# Number of distinct encoded apps = number of columns in the app matrices.
napps = len(appencoder.classes_)

# Train block: keep the pairs that belong to a training device and place each
# count at (trainrow, app).
d = deviceapps.dropna(subset=['trainrow'])
train_coords = (d['trainrow'], d['app'])
Xtr_app_inst = csr_matrix((d['size'], train_coords), shape=(len(gatrain), napps))

# Test block: same construction keyed by testrow.
d = deviceapps.dropna(subset=['testrow'])
test_coords = (d['testrow'], d['app'])
Xte_app_inst = csr_matrix((d['size'], test_coords), shape=(len(gatest), napps))

## Creating app labels feature containing number of times label is used

In [None]:
# Keep only labels of apps that actually occur in appevents. `.copy()` gives an
# independent frame so the column assignments below do not write into a slice
# of the original applabels (SettingWithCopyWarning).
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
applabels['app'] = appencoder.transform(applabels.app_id)

# Encode label ids as consecutive integers; these become sparse-matrix columns.
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

# One row per (device_id, label): number of the device's apps carrying that
# label, plus the train / test row number of the device (NaN where the device
# is not part of that set).
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

## Creating sparse matrix for label feature

In [None]:


# Train block: pairs belonging to a training device, counts placed at (trainrow, label).
d = devicelabels.dropna(subset=['trainrow'])
train_coords = (d['trainrow'], d['label'])
Xtr_label_inst = csr_matrix((d['size'], train_coords), shape=(len(gatrain), nlabels))

# Test block: same construction keyed by testrow.
d = devicelabels.dropna(subset=['testrow'])
test_coords = (d['testrow'], d['label'])
Xte_label_inst = csr_matrix((d['size'], test_coords), shape=(len(gatest), nlabels))

## Hour feature 

In [None]:
# Hour of day (0-23) of every event, taken with the vectorised .dt accessor.
events['hour'] = events.timestamp.dt.hour

# Hours index the matrix columns directly, so the width must be the full 24.
# Sizing it from nunique() fails when some hour is absent from one subset (a
# column index then exceeds the width) and can give train and test a different
# number of columns.
N_HOURS = 24

# Number of events per (device, hour), with the train / test row of the device.
events_cout_hourofday = (events.groupby(['device_id','hour'])['hour'].agg(['size'])
                    .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())

# Separate the train and test subsets and build the sparse matrices.
d = events_cout_hourofday.dropna(subset=['trainrow'])
Xtr_event_on_hourofday = csr_matrix((d['size'], (d.trainrow, d.hour)),
                      shape=(gatrain.shape[0], N_HOURS))

d = events_cout_hourofday.dropna(subset=['testrow'])
Xte_event_on_hourofday = csr_matrix((d['size'], (d.testrow, d.hour)),
                      shape=(gatest.shape[0], N_HOURS))

## Day feature

In [None]:
# Day of week (0-6) of every event.
events['week_day'] = events.timestamp.dt.weekday

# Weekdays index the matrix columns directly, so the width is fixed at 7.
# Using nunique() breaks when a weekday is missing from one subset and can give
# train and test a different number of columns.
N_WEEKDAYS = 7

# Number of events per (device, weekday), with the train / test row of the device.
events_cout_weekday = (events.groupby(['device_id','week_day'])['week_day'].agg(['size'])
                    .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())

d = events_cout_weekday.dropna(subset=['trainrow'])
Xtr_event_on_weekday = csr_matrix((d['size'], (d.trainrow, d['week_day'])),
                      shape=(gatrain.shape[0], N_WEEKDAYS))

d = events_cout_weekday.dropna(subset=['testrow'])
Xte_event_on_weekday = csr_matrix((d['size'], (d.testrow, d['week_day'])),
                      shape=(gatest.shape[0], N_WEEKDAYS))

## latitude and longitude feature

In [None]:
# A coordinate of exactly 0 is treated as "location not available" and replaced
# by NaN so it is ignored by the per-device median computed next.
# Series.where keeps values where the condition holds and writes NaN elsewhere;
# it replaces the per-element apply and the np.NaN alias, which no longer
# exists in NumPy 2.0.
events['longitude'] = events['longitude'].where(events['longitude'] != 0)
events['latitude'] = events['latitude'].where(events['latitude'] != 0)

In [None]:
# Median latitude / longitude per device. GroupBy.median skips NaN by default,
# which matches np.nanmedian applied group by group, but runs as one
# vectorised aggregation. reset_index turns device_id back into a column so the
# results can be merged on it.
latitude = events.groupby('device_id')['latitude'].median().reset_index()
longitude = events.groupby('device_id')['longitude'].median().reset_index()

In [None]:
# Move device_id from the index back to a regular column so gatrain can be
# merged with the per-device longitude / latitude frames on that column.
gatrain=gatrain.reset_index()

In [None]:

# Attach the per-device median coordinates to the test devices.
# NOTE(review): gatest still has device_id as its index here (only gatrain was
# reset), so on='device_id' presumably resolves to the index level — confirm
# that the resulting row order still matches `testrow`.
gatest=gatest.merge(latitude,on='device_id') 
gatest=gatest.merge(longitude,on='device_id') 
# Column vectors of shape (n_test, 1) so they can be stacked next to the other feature blocks.
te_longitude=gatest.longitude.values.reshape((-1,1)) 
te_latitude=gatest.latitude.values.reshape((-1,1))

In [None]:
# Attach median longitude to the training devices, then rebind `longitude` to
# the resulting 1-D value array. After this cell the per-device DataFrame of the
# same name is gone, so the cell cannot be re-run without recomputing it.
gatrain=gatrain.merge(longitude,on='device_id') 
longitude=gatrain.longitude.values

In [None]:
# Same for latitude: merge onto gatrain, then rebind `latitude` to the 1-D value
# array (the per-device DataFrame of that name is replaced).
gatrain=gatrain.merge(latitude,on='device_id') 
latitude=gatrain.latitude.values

In [None]:
# Turn both coordinate arrays into (n, 1) column vectors so they can be stacked
# horizontally with the other feature blocks.
latitude=latitude.reshape((-1,1)) 
longitude=longitude.reshape((-1,1))

## Combining all feature 

In [None]:
# Feature blocks in column order: app counts, label counts, events per hour,
# events per weekday, median longitude, median latitude, encoded phone brand,
# encoded device model. Train and test use the same order.
train_blocks = (
    Xtr_app_inst,
    Xtr_label_inst,
    Xtr_event_on_hourofday,
    Xtr_event_on_weekday,
    longitude,
    latitude,
    gatrain.phone_brand.values.reshape((-1, 1)),
    gatrain.device_model.values.reshape((-1, 1)),
)
test_blocks = (
    Xte_app_inst,
    Xte_label_inst,
    Xte_event_on_hourofday,
    Xte_event_on_weekday,
    te_longitude,
    te_latitude,
    gatest.phone_brand.values.reshape((-1, 1)),
    gatest.device_model.values.reshape((-1, 1)),
)
Xtrain = hstack(train_blocks)
Xtest = hstack(test_blocks)

In [None]:
# Encoded gender of every training device as an (n, 1) column vector.
y_gender = gatrain['gender'].values[:, np.newaxis]

In [None]:
# Largest value in the fold column; the CV loops iterate over folds 1..nfolds.
nfolds = gatrain['fold'].max()
# Number of models trained and averaged per fold.
nbags = 5

## Hyperparameter tuning for xgboost


In [None]:
# Grid search over (max_depth, min_child_weight) for the gender model with
# xgb.cv. Approach adapted from:
#https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f
from sklearn.metrics import log_loss
import xgboost as xgb

# Baseline parameters; max_depth and min_child_weight are overwritten per candidate.
params = {
    'booster':'gbtree',
    'objective':'reg:logistic',
    'eval_metric':'logloss',
    'learning_rate':0.025,
    'max_depth':6,
    'subsample':0.8,
    'colsample_bytree':0.5,
    'colsample_bylevel':0.5,
    'min_child_weight':5
}
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10)
    for min_child_weight in range(5,8)
]

num_boost_round = 1000
min_logloss = float("Inf")
best_params = None
dtr = xgb.DMatrix(Xtrain.tocsr(), label=y_gender, missing=np.nan, nthread=-1)

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                              min_child_weight))
    # Apply BOTH tuned values. The previous code assigned an undefined name
    # (`colsample_tree`, a NameError) and never set min_child_weight, so that
    # half of the grid was not searched at all.
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # 5-fold CV; stops once test logloss has not improved for 10 rounds.
    cv_results = xgb.cv(
        params,
        dtr,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'logloss'},
        early_stopping_rounds=10,
    )
    # Best (lowest) mean test logloss of this candidate and the round it occurred at.
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tlogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth,min_child_weight)
# The metric is logloss; the previous message printed the undefined `min_mae`.
print("Best params: {}, {}, logloss: {}".format(best_params[0], best_params[1], min_logloss))

## Predicting gender for train using k fold cross validation and bagging


In [None]:
## xgb data
from sklearn.metrics import log_loss
import xgboost as xgb

# Out-of-fold gender probabilities: column 0 = 1 - p, column 1 = p, where p is
# the model output for gender code 1.
p_gender = np.zeros((Xtrain.shape[0],2))
y_gender = gatrain['gender'].values.reshape((-1,1))

# Loop-invariant work hoisted out of the fold loop: CSR conversion (needed for
# row slicing) and the parameter set.
Xtrain_csr = Xtrain.tocsr()
## parameter set
param = {'booster':'gbtree',
         'objective':'reg:logistic',
         'eval_metric':'logloss',
         'learning_rate':0.025,
         'max_depth':8,
         'subsample':0.8,
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'min_child_weight':5}

# Starting k fold cross validation.
for i in range(1,nfolds+1):
    # Dividing data in train and validation based on fold.
    # gatrain.index labels are used as row positions of Xtrain — this assumes
    # gatrain has a default 0..n-1 index at this point.
    inTr = gatrain.index[gatrain.fold != i]
    inTe = gatrain.index[gatrain.fold == i]
    dtr = xgb.DMatrix(Xtrain_csr[inTr,:], label = y_gender[inTr], missing = np.nan)
    dcv = xgb.DMatrix(Xtrain_csr[inTe,:], label = y_gender[inTe], missing = np.nan)
    p = np.zeros(len(inTe))
    for j in range(nbags):
        print('Iter', i, '- gender\n')

        ## train model
        # Each bag gets its own seed. With XGBoost's fixed default seed every
        # bag was the same model, so averaging the bags changed nothing.
        bst_gender = xgb.train(dict(param, seed=j),
                         dtr,
                         932)

        ## prediction
        p = p + bst_gender.predict(dcv)

        del bst_gender
    p = p/nbags
    prob = np.c_[1-p, p]
    p_gender[inTe,] = prob
    score = log_loss(y_gender[inTe], prob)
    print('Gender fold', i, '- Score', round(score,6), '\n')

## Function for repeating values

In [None]:
def np_rep(x, reps=1, each=False, length=0):
    """Repeat the elements of an array, like R's rep() and rep_len().

    Args:
        x: array-like; it is flattened before repeating.
        reps: int, number of repetitions.
        each: if True, every element is repeated `reps` times before the next
            one ([0, 1] -> [0, 0, 1, 1]); if False, the whole sequence is
            repeated `reps` times ([0, 1] -> [0, 1, 0, 1]).
        length: int, desired output length; if > 0 it overrides `reps` (enough
            repetitions are generated and the result is cut to `length`).

    Returns:
        1-D numpy array with the repeated values.
    """
    x = np.asarray(x).ravel()
    if length > 0:
        # Built-in int(): the np.int alias was removed in NumPy 1.24, which made
        # every call with length > 0 raise AttributeError.
        reps = int(np.ceil(length / x.size))
    # np.tile yields the same order as repeat -> reshape(-1, reps).T.ravel().
    x = np.repeat(x, reps) if each else np.tile(x, reps)
    if length > 0:
        x = x[:length]
    return x

In [None]:
'''we want to repeat each row twice because we are adding gender as additional variable which takes two values 0 and 1  
so we are repeating the indices twice'''
# idx = [0, 0, 1, 1, ..., n-1, n-1]: every test row index twice, consecutively.
idx = np_rep(np.arange(Xtest.shape[0]), each = True,reps=2)

In [None]:
# Duplicate every test row (rows 2k and 2k+1 are both copies of test row k).
# The matrix is converted to CSR first because the row selection is done on it.
xtest_mod=Xtest.tocsr()[idx,:]

In [None]:
# Gender column for the duplicated rows: the pattern 0, 1 repeated once per
# test row, shaped as a (2 * n_test, 1) column vector.
g = np_rep(np.arange(2), reps=Xtest.shape[0]).reshape((-1, 1))
g.shape

In [None]:
# Put the gender column first, matching the training layout (gender, features),
# and wrap the result for XGBoost prediction.
xtest_mod=hstack((g,xtest_mod)) 
dtest=xgb.DMatrix(xtest_mod)

## Predict age group with gender as an additional feature

In [None]:
import random
from sklearn.metrics import log_loss

random.seed(666)  # seeds Python's RNG only; XGBoost gets explicit per-bag seeds below
y_age_group = gatrain.age_group.values.reshape((-1,1))
# Per row: columns 0-5 = age-bucket probabilities predicted with gender feature 0,
# columns 6-11 = the same with gender feature 1.
p_age_group = np.zeros((gatrain.shape[0],12))

# Loop-invariant work hoisted out of the fold loop.
Xtrain_csr = Xtrain.tocsr()
param = {'booster':'gbtree',
         'objective':'multi:softprob',
         'eval_metric':'mlogloss',
         'num_class':6,
         'learning_rate':0.025,
         'max_depth':8,
         'subsample':0.8,
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'min_child_weight':5}

for i in range(1,nfolds+1):
    # Dividing in train and validation rows (index labels used as row positions).
    inTr = gatrain.index[gatrain.fold!=i]
    inTe = gatrain.index[gatrain.fold==i]
    # Validation: every row twice, once with gender=0 and once with gender=1.
    idx = np_rep(np.arange(len(inTe)), each = True,reps=2)
    val = Xtrain_csr[inTe,:][idx,:]
    g = np_rep(np.arange(2),reps=len(inTe)).reshape((-1,1))
    xtest_mod = hstack((g,val))
    # Training: the true gender is prepended as an extra feature.
    dtr = xgb.DMatrix(hstack((y_gender[inTr], Xtrain_csr[inTr,:])), label = y_age_group[inTr], missing = np.nan,nthread=-1)
    dte = xgb.DMatrix(xtest_mod,missing=np.nan)

    p = np.zeros((len(inTe),12))

    for j in range(nbags):
        print('Iter', i, '- age group\n')

        ## train model
        # A distinct seed per bag; with the fixed default seed all bags were
        # identical models.
        bst_age_group = xgb.train(dict(param, seed=j),
                         dtr,
                         932)

        ## prediction
        # (2n, 6) -> (n, 12): the two gender variants of a row end up side by side.
        p = p + bst_age_group.predict(dte).reshape(-1,12)

        del bst_age_group

    p = p/nbags

    p_age_group[inTe] = p

    # Fold score: multi-class logloss of the age bucket using the 6 columns that
    # belong to each row's TRUE gender. The previous code printed a stale
    # `score` left over from the gender loop.
    rows = np.arange(len(inTe))[:, None]
    cols = y_gender[inTe].astype(int) * 6 + np.arange(6)
    score = log_loss(y_age_group[inTe].ravel(), p[rows, cols], labels=np.arange(6))
    print('Age_group fold', i, '- Score', round(score,6), '\n')


## Final prediction using definition of conditional probability for events feature

In [None]:
# P(group) = P(age bucket | gender) * P(gender), assembled per gender.
# Columns 0-5 of p_age_group were predicted with gender feature 0, columns 6-11
# with gender feature 1; each half is renormalised to sum to 1 per row before
# being weighted by the matching gender probability.
g0_block = p_age_group[..., :6]
g1_block = p_age_group[..., 6:12]
g0_cond = np.divide(g0_block, g0_block.sum(axis=1).reshape(-1, 1))
g1_cond = np.divide(g1_block, g1_block.sum(axis=1).reshape(-1, 1))
p_group = np.concatenate(
    (np.multiply(g0_cond, p_gender[..., 0].reshape(-1, 1)),
     np.multiply(g1_cond, p_gender[..., 1].reshape(-1, 1))),
    axis=1,
)



In [None]:
# Reload the raw training file to get the original (string) group names and
# device ids back.
gatrain = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_train.csv'))
lab = LabelEncoder()

# group is target class in train set converting to labels
lab.fit(gatrain['group'].astype(str))

# Keep the devices with events. isin() replaces the per-row `x in s` scan of
# the whole id array.
gatrain['has_events'] = gatrain.device_id.isin(s).astype(int)
gatrain = gatrain[gatrain['has_events']==1]

# One row per training device with events, one probability column per group.
# pd.DataFrame raises if p_group's row count differs from the device list.
pred_train_events = pd.DataFrame(p_group,index=gatrain.device_id,columns=list(lab.classes_))
pred_train_events = pred_train_events.reset_index()
# (The mid-cell .head() call was removed: it is not the last expression of the
# cell, so it displayed nothing.)
pred_train_events.to_csv('pred_train_events.csv')

## 2. for devices without events

In [None]:
# Reload the raw train / test device lists and phone data for the no-events models.
gatrain = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_train.csv'),
                      )
gatest = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_test.csv'),
                     )
phone = pd.read_csv(os.path.join(RAW_DATA_DIR,'phone_brand_device_model.csv'))
# keep a single phone record per device_id (first occurrence wins)
phone=phone.drop_duplicates('device_id',keep='first')

In [None]:
# Attach phone brand / device model to every train and test device (join on device_id).
gatrain=gatrain.merge(phone,on='device_id') 
gatest=gatest.merge(phone,on='device_id')

## label encoding variables

In [None]:
# A single encoder is re-fitted for each column in turn; after this cell it only
# holds the device_model classes.
lab=LabelEncoder()  
# target group and gender: string categories -> integer codes
gatrain['group']=lab.fit_transform(gatrain['group'].astype(str)) 

gatrain['gender']=lab.fit_transform(gatrain['gender'].astype(str)) 
 
# phone_brand: fit on train+test values so transform() also covers brands that
# occur only in the test set
lab.fit(np.append(gatrain.phone_brand.values,gatest.phone_brand.values))  
gatrain['phone_brand']=lab.transform(gatrain['phone_brand']) 
gatest['phone_brand']=lab.transform(gatest['phone_brand']) 
# device_model: same procedure
lab.fit(np.append(gatrain['device_model'].values,gatest['device_model'].values)) 
gatrain['device_model']=lab.transform(gatrain['device_model']) 
gatest['device_model']=lab.transform(gatest['device_model'])

In [None]:
# Flag devices that appear in the events data (`s` holds the unique device ids
# with events). isin() does a hashed lookup instead of scanning `s` once per row.
gatrain['has_events'] = gatrain.device_id.isin(s).astype(int)
gatest['has_events'] = gatest.device_id.isin(s).astype(int)

## Train and test for noevents data

In [None]:
# Dense two-column feature matrices: (encoded phone_brand, encoded device_model).
Xtrain = np.column_stack((gatrain.phone_brand.values, gatrain.device_model.values))
Xtest = np.column_stack((gatest.phone_brand.values, gatest.device_model.values))

## Hyperparameter tuning for xgboost for noevents data

In [None]:
# Grid search over (max_depth, min_child_weight) for the gender model trained on
# phone brand / device model only.
from sklearn.metrics import log_loss
import xgboost as xgb

# Baseline parameters; max_depth and min_child_weight are overwritten per candidate.
params = {
    'booster':'gbtree',
    'objective':'reg:logistic',
    'eval_metric':'logloss',
    'learning_rate':0.025,
    'max_depth':6,
    'subsample':0.8,
    'colsample_bytree':0.5,
    'colsample_bylevel':0.5,
    'min_child_weight':5
}
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10)
    for min_child_weight in range(5,8)
]

num_boost_round = 1000
min_logloss = float("Inf")
best_params = None
# Labels are taken from the CURRENT gatrain. The previous code reused a
# y_gender built for the events-only subset, whose length does not match this
# Xtrain.
y_gender = gatrain['gender'].values.reshape((-1,1))
dtr = xgb.DMatrix(Xtrain, label=y_gender, missing=np.nan, nthread=-1)

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                              min_child_weight))
    # Apply BOTH tuned values (previously an undefined `colsample_tree` was
    # assigned, a NameError, and min_child_weight was never set).
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # 5-fold CV; stops once test logloss has not improved for 10 rounds.
    cv_results = xgb.cv(
        params,
        dtr,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'logloss'},
        early_stopping_rounds=10,
    )
    # Best (lowest) mean test logloss of this candidate and the round it occurred at.
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tlogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, logloss: {}".format(best_params[0], best_params[1], min_logloss))

In [None]:
# Number of models trained and averaged per fold.
nbags = 5

# Encoded gender of every training device as an (n, 1) column vector.
y_gender= gatrain['gender'].values.reshape((-1,1))

In [None]:
# Attach the columns of folds_5 (joined on device_id) to the training devices.
# NOTE(review): Xtrain and y_gender were built before this merge — confirm the
# merge keeps the number and order of rows, since later cells index them with
# gatrain.index.
gatrain=gatrain.merge(folds_5,on='device_id')

In [None]:
# Out-of-fold gender probabilities, one row per training device and two columns;
# rows not written by the CV loop stay zero.
p_gender=np.zeros((len(gatrain.index),2))

## Predicting gender using k fold cross validation for no events data


In [None]:
## xgb data
import xgboost as xgb
from sklearn.metrics import log_loss

# Loop-invariant setup hoisted out of the fold loop.
y_gender = gatrain['gender'].values.reshape((-1,1))
## parameter set
param = {'booster':'gbtree',
         'objective':'reg:logistic',
         'eval_metric':'logloss',
         'learning_rate':0.025,
         'max_depth':9,
         'subsample':0.8,
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'min_child_weight':5}

for i in range(1,nfolds+1):
    # Train on every device outside fold i; validate only on the fold-i devices
    # WITHOUT events, the population this model is meant for.
    # gatrain.index labels are used as row positions of Xtrain.
    inTr = gatrain.index[gatrain.fold != i]
    inTe = gatrain.index[(gatrain.fold == i) & (gatrain.has_events==0) ]
    dtr = xgb.DMatrix(Xtrain[inTr,:], label = y_gender[inTr], missing = np.nan)
    dcv = xgb.DMatrix(Xtrain[inTe,:], label = y_gender[inTe], missing = np.nan)
    p = np.zeros(len(inTe))
    for j in range(nbags):
        print('Iter', i, '- gender\n')

        ## train model
        # A distinct seed per bag; with XGBoost's fixed default seed every bag
        # was the same model and the average was pointless.
        bst_gender = xgb.train(dict(param, seed=j),
                         dtr,
                         1000)

        ## prediction
        p = p + bst_gender.predict(dcv)

        del bst_gender
    p = p/nbags
    prob = np.c_[1-p, p]
    p_gender[inTe,] = prob
    score = log_loss(y_gender[inTe], prob)
    print('Gender fold', i, '- Score', round(score,6), '\n')
 

In [None]:
# Keep only the rows of devices without events (index labels of gatrain are
# used as row positions of p_gender).
p_gender=p_gender[gatrain.index[gatrain.has_events==0],:]

## Adding gender as an additional feature for test

In [None]:
from scipy import sparse

# Every test row is needed twice — once with gender=0 and once with gender=1 —
# because gender is fed to the age-group model as an extra first column.
idx = np_rep(np.arange(Xtest.shape[0]), each = True,reps=2)
# Gender column: the pattern 0, 1 repeated once per test row.
g = np_rep(np.arange(2),reps=Xtest.shape[0]).reshape((-1,1))

if sparse.issparse(Xtest):
    # Sparse features: convert to CSR for row selection, stack sparsely.
    Xtest = Xtest.tocsr()
    xtest_mod = hstack((g, Xtest[idx,:]))
else:
    # In this section Xtest is a dense ndarray (phone brand / device model),
    # which has no .tocsr(); the previous code raised AttributeError here.
    # Staying dense also keeps the test layout identical to the dense training
    # matrices.
    xtest_mod = np.concatenate((g, np.asarray(Xtest)[idx,:]), axis=1)
dtest = xgb.DMatrix(xtest_mod)

## Predicting age group using gender as additional feature

In [None]:
import random
from sklearn.metrics import log_loss

random.seed(666)  # seeds Python's RNG only; XGBoost gets explicit per-bag seeds below

# gatrain was reloaded for this section, so the age_group column built earlier
# is gone and `gatrain.age_group` raised AttributeError. Rebuild it with the
# same rule as getg: group as-is where gender == 0, group - 6 otherwise.
if 'age_group' in gatrain.columns:
    y_age_group = gatrain['age_group'].values.reshape((-1,1))
else:
    y_age_group = np.where(gatrain['gender'].values == 0,
                           gatrain['group'].values,
                           gatrain['group'].values - 6).reshape((-1,1))

# Per row: columns 0-5 = age-bucket probabilities predicted with gender feature 0,
# columns 6-11 = the same with gender feature 1.
p_age_group = np.zeros((Xtrain.shape[0],12))

param = {'booster':'gbtree',
         'objective':'multi:softprob',
         'eval_metric':'mlogloss',
         'num_class':6,
         'learning_rate':0.025,
         'max_depth':9,
         'subsample':0.8,
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'min_child_weight':5}

for i in range(1,nfolds+1):
    # Train on everything outside fold i; validate on fold-i devices without events.
    inTr = gatrain.index[gatrain.fold!=i]
    inTe = gatrain.index[(gatrain.fold==i) & (gatrain.has_events==0)]
    # Validation: every row twice, once with gender=0 and once with gender=1.
    idx = np_rep(np.arange(len(inTe)), each = True,reps=2)
    val = Xtrain[inTe,:][idx,:]
    g = np_rep(np.arange(2),reps=len(inTe)).reshape((-1,1))
    xtest_mod = np.concatenate((g,val),axis=1)
    # Training: the true gender is prepended as an extra feature.
    dtr = xgb.DMatrix(np.concatenate((y_gender[inTr], Xtrain[inTr,:]),axis=1), label = y_age_group[inTr], missing = np.nan)
    dte = xgb.DMatrix(xtest_mod,missing=np.nan)

    p = np.zeros((len(inTe),12))

    for j in range(nbags):
        print('Iter', i, '- age group\n')

        ## train model
        # A distinct seed per bag; with the fixed default seed all bags were
        # identical models.
        bst_age_group = xgb.train(dict(param, seed=j),
                         dtr,
                         1060)

        ## prediction
        # (2n, 6) -> (n, 12): the two gender variants of a row end up side by side.
        p = p + bst_age_group.predict(dte).reshape(-1,12)

        del bst_age_group

    p = p/nbags

    p_age_group[inTe] = p

    # Fold score: multi-class logloss of the age bucket using the 6 columns of
    # each row's TRUE gender (the old code printed a stale gender-loop `score`).
    rows = np.arange(len(inTe))[:, None]
    cols = y_gender[inTe].astype(int) * 6 + np.arange(6)
    score = log_loss(y_age_group[inTe].ravel(), p[rows, cols], labels=np.arange(6))
    print('Age_group fold', i, '- Score', round(score,6), '\n')


## Final prediction using the definition of conditional probability for no-events data

In [None]:
# Restrict the age-bucket predictions to devices without events (gatrain index
# labels are used as row positions).
noevent_rows = gatrain.index[gatrain.has_events == 0]
p_age_group = p_age_group[noevent_rows, :]

# P(group) = P(age bucket | gender) * P(gender): columns 0-5 were predicted with
# gender feature 0, columns 6-11 with gender feature 1. Each half is
# renormalised per row and weighted by the matching gender probability.
g0_block = p_age_group[..., :6]
g1_block = p_age_group[..., 6:12]
g0_cond = np.divide(g0_block, g0_block.sum(axis=1).reshape(-1, 1))
g1_cond = np.divide(g1_block, g1_block.sum(axis=1).reshape(-1, 1))
p_group = np.concatenate(
    (np.multiply(g0_cond, p_gender[..., 0].reshape(-1, 1)),
     np.multiply(g1_cond, p_gender[..., 1].reshape(-1, 1))),
    axis=1,
)

In [None]:
# Reload the raw training file to get the original (string) group names and
# device ids back.
gatrain = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_train.csv'))
lab = LabelEncoder()

# group is target class in train set converting to labels
lab.fit(gatrain['group'].astype(str))

# Keep the devices WITHOUT events. isin() replaces the per-row `x in s` scan.
gatrain['has_events'] = gatrain.device_id.isin(s).astype(int)
gatrain = gatrain[gatrain['has_events']==0]

# One row per training device without events, one probability column per group.
pred_train_noevents = pd.DataFrame(p_group,index=gatrain.device_id,columns=list(lab.classes_))
pred_train_noevents = pred_train_noevents.reset_index()
# (The mid-cell .head() call was removed: it displayed nothing.)
pred_train_noevents.to_csv('pred_train_noevents.csv')

## 1. Test with events

## Test and Train for events data

In [None]:
# Rebuild the events feature matrices from the blocks created in the events section.
# NOTE(review): in top-to-bottom order `gatrain` is at this point the raw file
# reloaded and filtered to has_events==0 by the preceding cell, which has no
# phone_brand / device_model columns and a different number of rows than
# Xtr_app_inst; `longitude` / `latitude` also date from the events section.
# This cell appears to need the events-section gatrain / gatest frames — confirm
# the intended execution order.
Xtrain=hstack((Xtr_app_inst,Xtr_label_inst,Xtr_event_on_hourofday,Xtr_event_on_weekday,longitude,latitude,gatrain.phone_brand.values.reshape((-1,1)),gatrain.device_model.values.reshape((-1,1)))) 
Xtest=hstack((Xte_app_inst,Xte_label_inst,Xte_event_on_hourofday,Xte_event_on_weekday,te_longitude,te_latitude,gatest.phone_brand.values.reshape((-1,1)),gatest.device_model.values.reshape((-1,1))))

# Predicting gender for test with events

In [None]:
import random
random.seed(666)  # seeds Python's RNG only; XGBoost gets explicit per-bag seeds below
nbags = 10
y_gender = gatrain.gender.values.reshape((-1,1))
# Accumulated test probabilities: column 0 = 1 - p, column 1 = p.
p_gender = np.zeros((Xtest.shape[0],2))
dtr = xgb.DMatrix( Xtrain, label = y_gender, missing = np.nan)
dtest = xgb.DMatrix(Xtest)
param = {'booster':'gbtree',
             'objective':'reg:logistic',
             'eval_metric':'logloss',
             'learning_rate':0.025,
             'max_depth':8,
             'subsample':0.8,
             'colsample_bytree':0.5,
             'colsample_bylevel':0.5,
              'min_child_weight':5}

for i in range(nbags):
    print('Iter', i, '- gender\n')

    ## train model on the full training set
    # A distinct seed per bag; with XGBoost's fixed default seed all bags were
    # the same model, so the average equalled a single run.
    bst_gender = xgb.train(dict(param, seed=i),
                         dtr,
                         932)

    ## prediction
    p = bst_gender.predict(dtest)
    p_gender = p_gender + np.c_[1-p,p]

    del bst_gender

# Average over the bags.
p_gender = p_gender/nbags

## Adding gender as an additional feature for test


In [None]:
from scipy import sparse
# CSR layout is needed for the row selection below.
Xtest=Xtest.tocsr()
'''we want to repeat each row twice because we are adding gender as additional variable which takes two values 0 and 1  
so we are repeating the indices twice'''
# idx = [0, 0, 1, 1, ...]: every test row index twice, consecutively.
idx = np_rep(np.arange(Xtest.shape[0]), each = True,reps=2) 
#repeating each row twice 
xtest_mod=Xtest.tocsr()[idx,:]
#repeating gender values till Xtest
# g = 0, 1, 0, 1, ... as a (2 * n_test, 1) column vector
g=np_rep(np.arange(2),reps=Xtest.shape[0]) 
g=g.reshape((-1,1)) 
# gender goes first, matching the training layout (gender, features)
xtest_mod=hstack((g,xtest_mod)) 
dtest=xgb.DMatrix(xtest_mod) 

## Predicting age_group for test events data

In [None]:
import random 
random.seed(666) 
# Age-group target and the accumulator. Each model output is reshaped from 6 to
# 12 columns per row, so dtest is expected to hold two consecutive rows per
# test device (one per candidate gender).
y_age_group=gatrain.age_group.values.reshape((-1,1)) 
p_age_group=np.zeros((Xtest.shape[0],12))
# The known gender is prepended as the first training feature.
dtr = xgb.DMatrix(hstack((y_gender, Xtrain)), label = y_age_group, missing = np.nan,nthread=-1)
param = {'booster':'gbtree',
             'objective':'multi:softprob',
             'eval_metric':'mlogloss',
             'num_class':6,
             'learning_rate':0.025,
             'max_depth':8,
             'subsample':0.8,
             'colsample_bytree':0.5,
             'colsample_bylevel':0.5,
              'min_child_weight':5}



for i in range(nbags):
    # This loop fits the age-group model (the message previously said "gender").
    print('Iter', i, '- age_group\n')

    # random.seed() above does not reach xgboost; vary xgboost's own `seed`
    # so that each bag draws a different row/column subsample.
    bag_param = dict(param, seed=666 + i)

    ## train model
    bst_age_group = xgb.train(bag_param,
                         dtr,
                         932)

    ## prediction
    p_age_group = p_age_group + bst_age_group.predict(dtest).reshape(-1,12)

    del bst_age_group


# Average over bags.
p_age_group = p_age_group/nbags

## using definition of conditional probability to calculate p_group

In [None]:

# P(group) = P(age | gender) * P(gender).
# Columns 0:6 and 6:12 of p_age_group are each renormalised to sum to one per row,
# then scaled by the matching column of p_gender and laid side by side.
p_group = np.hstack([
    (p_age_group[:, lo:hi] / p_age_group[:, lo:hi].sum(axis=1, keepdims=True)) * p_gender[:, [gender]]
    for gender, (lo, hi) in enumerate(((0, 6), (6, 12)))
])



In [None]:
gatest = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_test.csv'),
                      ) 
lab=LabelEncoder()   

# group is target class in train set converting to labels
lab.fit(gatrain['group'].astype(str)) 
# Vectorised membership test: same 1/0 flag as a per-row `x in s`, without
# scanning the whole id array once for every test device.
gatest['has_events']=gatest.device_id.isin(s).astype(int)
# Keep only the devices that have events; p_group is expected to be row-aligned with them.
gatest=gatest[gatest['has_events']==1] 
pred_test_events=pd.DataFrame(p_group,index=gatest.device_id,columns=list(lab.classes_)) 
# Move device_id from the index into a regular column.
pred_test_events=pred_test_events.reset_index() 

pred_test_events.to_csv('pred_test_events.csv')
pred_test_events.head() 

## Train and test for noevents data

In [None]:

# No-events model: only brand and model are used, as a dense two-column array.
# NOTE(review): gatest is re-read from CSV and filtered to has_events==1 in an earlier cell;
# confirm it carries phone_brand / device_model for the intended rows at this point.
Xtrain=np.concatenate((gatrain.phone_brand.values.reshape(-1,1),gatrain.device_model.values.reshape(-1,1)),axis=1) 
Xtest=np.concatenate((gatest.phone_brand.values.reshape(-1,1),gatest.device_model.values.reshape(-1,1)),axis=1) 

## Predict gender for noevents data

In [None]:
import random 
random.seed(666)  
nbags=10
# Gender target as a column vector, plus the (n_test, 2) accumulator that will
# hold the bag-averaged [1-p, p] gender probabilities.
y_gender=gatrain.gender.values.reshape((-1,1)) 
p_gender=np.zeros((Xtest.shape[0],2))
dtr = xgb.DMatrix( Xtrain, label = y_gender, missing = np.nan) 
dtest=xgb.DMatrix(Xtest)
param = {'booster':'gbtree',
             'objective':'reg:logistic',
             'eval_metric':'logloss',
             'learning_rate':0.025,
             'max_depth':9,
             'subsample':0.8,
             'colsample_bytree':0.5,
             'colsample_bylevel':0.5,
              'min_child_weight':5}



for i in range(nbags):
    print('Iter', i, '- gender\n')

    # random.seed() above only seeds Python's own generator; xgboost draws its
    # row/column subsamples from its `seed` parameter. Without a per-bag seed
    # every bag trains the same model and the averaging below is a no-op.
    bag_param = dict(param, seed=666 + i)

    ## train model
    bst_gender = xgb.train(bag_param,
                         dtr,
                         932)

    ## prediction
    p= bst_gender.predict(dtest)
    # predict() returns one probability per row; store it as two columns [1-p, p].
    p_gender = p_gender + np.c_[1-p,p]

    del bst_gender


# Average over bags.
p_gender = p_gender/nbags


In [None]:
# Keep only the gender predictions for devices without events.
# NOTE(review): this indexes p_gender positionally with gatest's index labels, and the
# later age-group accumulator is sized from the unfiltered Xtest — confirm the row
# counts still line up when p_group is computed.
p_gender=p_gender[gatest.index[gatest.has_events==0],:]

## Adding gender as an additional feature for test

In [None]:
from scipy import sparse
# In this section Xtest is a dense ndarray (built with np.concatenate), which has
# no .tocsr() method. csr_matrix() accepts dense and sparse input alike.
Xtest=sparse.csr_matrix(Xtest)
# Every test row is scored twice, once per candidate gender value (0 and 1), so the
# row indices are repeated. np_rep is defined elsewhere in the notebook; presumably
# each=True repeats every index twice in place (0,0,1,1,...) — TODO confirm.
idx = np_rep(np.arange(Xtest.shape[0]), each = True,reps=2) 
# Duplicate every test row according to idx.
xtest_mod=Xtest[idx,:]
# Candidate gender values 0 and 1, repeated once per original test row.
g=np_rep(np.arange(2),reps=Xtest.shape[0]) 
g=g.reshape((-1,1)) 
# Gender goes in as the first column, ahead of the original features.
xtest_mod=hstack((g,xtest_mod)) 
dtest=xgb.DMatrix(xtest_mod) 

## Predicting age_group for test noevents data

In [None]:
import random 
random.seed(666) 
# Age-group target and the accumulator. Each model output is reshaped from 6 to
# 12 columns per row, so dtest is expected to hold two consecutive rows per
# test device (one per candidate gender).
y_age_group=gatrain.age_group.values.reshape((-1,1)) 
p_age_group=np.zeros((Xtest.shape[0],12))
# The known gender is prepended as the first training feature (dense arrays here).
dtr = xgb.DMatrix(np.concatenate((y_gender, Xtrain),axis=1), label = y_age_group, missing = np.nan,nthread=-1)
param = {'booster':'gbtree',
             'objective':'multi:softprob',
             'eval_metric':'mlogloss',
             'num_class':6,
             'learning_rate':0.025,
             'max_depth':9,
             'subsample':0.8,
             'colsample_bytree':0.5,
             'colsample_bylevel':0.5,
              'min_child_weight':5}



for i in range(nbags):
    # This loop fits the age-group model (the message previously said "gender").
    print('Iter', i, '- age_group\n')

    # random.seed() above does not reach xgboost; vary xgboost's own `seed`
    # so that each bag draws a different row/column subsample.
    bag_param = dict(param, seed=666 + i)

    ## train model
    bst_age_group = xgb.train(bag_param,
                         dtr,
                         932)

    ## prediction
    p_age_group = p_age_group + bst_age_group.predict(dtest).reshape(-1,12)

    del bst_age_group


# Average over bags.
p_age_group = p_age_group/nbags

## using definition of conditional probability to calculate p_group


In [None]:
# P(group) = P(age | gender) * P(gender).
# Columns 0:6 and 6:12 of p_age_group are each renormalised to sum to one per row,
# then scaled by the matching column of p_gender and laid side by side.
p_group = np.hstack([
    (p_age_group[:, lo:hi] / p_age_group[:, lo:hi].sum(axis=1, keepdims=True)) * p_gender[:, [gender]]
    for gender, (lo, hi) in enumerate(((0, 6), (6, 12)))
])



In [None]:
gatest = pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_test.csv'),
                      ) 
lab=LabelEncoder()   

# group is target class in train set converting to labels
lab.fit(gatrain['group'].astype(str)) 
# Vectorised membership test: same 1/0 flag as a per-row `x in s`, without
# scanning the whole id array once for every test device.
gatest['has_events']=gatest.device_id.isin(s).astype(int)
# Keep only the devices without events; p_group is expected to be row-aligned with them.
gatest=gatest[gatest['has_events']==0] 
pred_test_noevents=pd.DataFrame(p_group,index=gatest.device_id,columns=list(lab.classes_)) 
# Move device_id from the index into a regular column.
pred_test_noevents=pred_test_noevents.reset_index() 

pred_test_noevents.to_csv('pred_test_noevents.csv')
pred_test_noevents.head() 

In [None]:

# Put the no-events and events predictions back into one table.
pred_test=pd.concat([pred_test_noevents,pred_test_events]) 
# gatest was narrowed to the no-events devices in the previous cell, so merging
# against it would drop every device that has events (and leak the has_events
# helper column into the submission). Re-read the full device list instead.
test_ids=pd.read_csv(os.path.join(RAW_DATA_DIR,'gender_age_test.csv'))[['device_id']]
xgb_sub=test_ids.merge(pred_test,on='device_id') 

xgb_sub.to_csv('xgb_sub.csv',index=False)

![title](Documents/input/xgb.png)

## Now trying different Keras NN models

## Batch Generator for training

In [4]:
def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense (X_batch, y_batch) mini-batches from a sparse matrix.

    Adapted from chenglong's generator for fitting on sparse input
    (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices).

    X          -- row-indexable matrix whose row slices support .toarray()
    y          -- array of targets aligned with the rows of X
    batch_size -- number of rows per batch (the last batch of a pass may be shorter)
    shuffle    -- when true, the row order is reshuffled before every pass
    """
    n_rows = X.shape[0]
    batches_per_pass = np.ceil(n_rows / batch_size)
    order = np.arange(n_rows)
    if shuffle:
        np.random.shuffle(order)
    step = 0
    while True:
        start = batch_size * step
        rows = order[start:start + batch_size]
        dense_x = X[rows, :].toarray()
        targets = y[rows]
        step += 1
        yield dense_x, targets
        # A full pass is done: start over, with a new order if requested.
        if step == batches_per_pass:
            if shuffle:
                np.random.shuffle(order)
            step = 0


# Batch generator for validation and testing

In [None]:
def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense feature batches of X, in row order, for prediction.

    X          -- row-indexable matrix whose row slices support .toarray()
    batch_size -- number of rows per batch (the last batch of a pass may be shorter)
    shuffle    -- accepted for signature parity with batch_generator; not used,
                  rows are always emitted in their original order

    After the last batch of a pass the generator wraps around to the first row.
    """
    # Number of batches in one pass. The previous formula
    # (n_rows / ceil(n_rows / batch_size)) is the average batch size, so the
    # counter never wrapped and empty batches were yielded after the first pass.
    number_of_batches = int(np.ceil(X.shape[0] / batch_size))
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter >= number_of_batches:
            counter = 0

## Loading train data and prediction variable

In [None]:
import scipy 
# NOTE(review): these three files are read from the working directory, whereas other
# cells read the same file names from RAW_DATA_DIR — confirm which location is intended.
Xtrain=scipy.sparse.load_npz('Xtrain_all.npz') 
train=pd.read_csv('gender_age_train.csv') 
# Encode the 'group' target as integer class ids.
ytrain = train['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain) 
# Fold id of every training row, as a plain array.
folds = pd.read_csv('folds_5.csv')['fold'].values

## model for training

In [None]:
def nn_model():
    """Build and compile the first Keras network: Dense(150) -> Dense(50) -> softmax(12).

    The input width is read from the module-level Xtrain. Each hidden layer is
    followed by a PReLU activation and dropout. Returns the compiled model.
    """
    # create model
    model = Sequential()
    model.add(Dense(150, input_dim=Xtrain.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    # Hidden layer: its input is the 150 units above, so it must not declare
    # input_dim=Xtrain.shape[1] (that argument only belongs on the first layer).
    model.add(Dense(50, init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    # One softmax output per target group.
    model.add(Dense(12, init='normal', activation='softmax'))
    # Integer class labels are used, hence the sparse variant of the cross-entropy loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

## cv params
# Number of folds is taken as the largest fold id in `folds`.
nfolds = np.max(folds)
nbags = 1

# Out-of-fold class probabilities: one row per training row, 12 target groups.
p_group = np.zeros((Xtrain.shape[0], 12))

## Prediction on training set

In [None]:
# Out-of-fold prediction: for each fold id, train on the other folds and predict
# the held-out rows; p_group ends up holding one prediction per training row.
for i in range(1, nfolds+1):
    ## cv index
    inTr = [idx for idx, fold in enumerate(folds) if fold != i]
    inTe = [idx for idx, fold in enumerate(folds) if fold == i]
    ## train data
    xtr = Xtrain[inTr]
    ytr = ytrain[inTr]
    ## validation data
    xval = Xtrain[inTe]
    yval = ytrain[inTe]
    ## object to store predictions
    pred = np.zeros((xval.shape[0], 12))
    for j in range(nbags):
        model = nn_model()
        ## training
        # NOTE(review): 69984 is a hard-coded row count and the Keras-1 style argument
        # names (samples_per_epoch / val_samples) differ from the steps_* names used
        # in later cells — confirm against the installed Keras version.
        fit = model.fit_generator(generator = batch_generator(xtr,ytr, 400, True),
                                  nb_epoch = 18,
                                  samples_per_epoch = np.ceil(69984/400),
                                  verbose = 0)
        ## prediction
        pred += model.predict_generator(generator = batch_generatorp(xval, 800, False), val_samples = xval.shape[0])
    ## average predictions
    pred /= nbags
    p_group[inTe] = pred
    score = log_loss(yval, pred)
    print('Fold ', i, '-', score, '\n')

# Overall out-of-fold score. The labels for this section live in `ytrain`
# (the name `Y` is not defined anywhere in this section).
score = log_loss(ytrain, p_group)
print('Total score', score)


## Loading test set

In [None]:
# Sparse test feature matrix, read from the working directory.
Xtest=scipy.sparse.load_npz('Xtest_all.npz') 

In [None]:
# Number of independently trained models to average for the test prediction.
nbags = 10

# Accumulator for the test-set class probabilities (one row per test row, 12 groups).
p_group = np.zeros((Xtest.shape[0], 12))

## Prediction on test set

In [None]:
# Train nbags models on the full training set and sum their test predictions.
for i in range(nbags):
    ## cv index
    
    ## train data
    xtr = Xtrain
    ytr = ytrain
    ## validation data
    xval = Xtest
   
    ## object to store predictions

    model = nn_model()
    ## training
    # NOTE(review): 69984 is a hard-coded row count — confirm it matches Xtrain.
    fit = model.fit_generator(generator = batch_generator(xtr,ytr, 400, True),
                                  nb_epoch = 18,
                                  samples_per_epoch = np.ceil(69984/400),
                                  verbose = 2)
    ## prediction
    p_group += model.predict_generator(generator = batch_generatorp(xval, 800, False), val_samples = xval.shape[0])
    ## average predictions
    
    

# Turn the sum over bags into a mean.
p_group /= nbags



In [None]:
# Label the averaged test probabilities with device ids and class names, then save.
test = pd.read_csv('gender_age_test.csv')

# The encoder is fitted only to recover the class names in encoded order.
targetencoder = LabelEncoder()
targetencoder.fit_transform(train.group)
cols = ['device_id'] + list(targetencoder.classes_)

# device_id is attached as the index, then moved into the first column.
pred = pd.DataFrame(data=p_group, index=test.device_id).reset_index()
pred.columns = cols
pred.to_csv('pred_keras_test_bag10.csv')

## Second Keras model 

## In this we are training different models for data with events and data without events

## Loading train data and prediction variable for events data


In [None]:
import scipy 
Xtrain_events=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_events.npz')) 
# Ids of all devices that appear in the events table.
s=events.device_id.unique() 
# Flag training devices that have at least one event (1) or none (0).
gatrain['has_events']=gatrain.device_id.apply(lambda x:1 if x in s else 0) 
# From here on gatrain only holds the devices with events.
gatrain=gatrain[gatrain['has_events']==1]
# Encode the 'group' target as integer class ids.
ytrain = gatrain['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain)

In [None]:
#merging with fold file containing fold information
# Attach the fold assignment to every remaining training device. merge() returns a
# frame with a fresh 0..n-1 index, which the CV loop below uses to select rows.
gatrain=gatrain.merge(folds_5,on='device_id')

## model for training

In [None]:
def nn_model():
    """Build and compile the second Keras network: Dense(200) -> Dense(100) -> softmax(12).

    The input width is read from the module-level Xtrain_events. Hidden layers use
    he_normal initialisation, PReLU activations and dropout; training uses Adagrad.
    Returns the compiled model.
    """
    layers = [
        Dense(200, input_dim = Xtrain_events.shape[1], init = 'he_normal'),
        PReLU(),
        Dropout(0.4),
        Dense(100, init = 'he_normal'),
        PReLU(),
        Dropout(0.2),
        # One softmax output per target group.
        Dense(12, init = 'he_normal', activation = 'softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    adagrad = Adagrad(lr = 0.005, epsilon = 1e-08)
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = adagrad, metrics = ['accuracy'])
    return model

## cv params
nfolds = 5
# Number of models trained and averaged per fold.
nbags = 5

# Out-of-fold class probabilities: one row per row of Xtrain_events, 12 target groups.
p_group = np.zeros((Xtrain_events.shape[0], 12))

## Prediction on training set for events data

In [None]:
# Out-of-fold prediction for devices with events: for each fold id, train on the
# other folds and predict the held-out rows.
for i in range(1, nfolds+1):
    ## cv index
    # Index labels of gatrain are used as row positions into Xtrain_events / ytrain.
    inTr=gatrain.index[gatrain.fold!=i] 
    inTe=gatrain.index[gatrain.fold==i]
    ## train data
    xtr = Xtrain_events[inTr]
    ytr = ytrain[inTr]
    ## validation data
    xval = Xtrain_events[inTe]
    yval = ytrain[inTe]
    ## object to store predictions
    pred = np.zeros((xval.shape[0], 12))
    for j in range(nbags):
        model = nn_model()
        ## training
        fit = model.fit_generator(generator = batch_generator(xtr,ytr, 200, True),
                                  nb_epoch = 80,
                                  steps_per_epoch = 5,
                                  verbose = 0)
        ## prediction
        pred += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
    ## average predictions
    pred /= nbags
    p_group[inTe] = pred
    score = log_loss(yval, pred)
    print('Fold ', i, '-', score, '\n')

# Overall out-of-fold score. The labels for this section live in `ytrain`
# (the name `Y` is not defined anywhere in this section).
score = log_loss(ytrain, p_group)
print('Total score', score)


In [None]:
# Out-of-fold predictions for the devices with events, indexed by device_id.
pred_train_events=pd.DataFrame(p_group,index=gatrain.device_id[gatrain.has_events==1]) 

pred_train_events.to_csv('pred_train_keras2_events.csv')

## Loading train data and prediction variable for noevents 

In [None]:
Xtrain=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_all_brand_model.npz')) 
# Ids of all devices that appear in the events table.
s=events.device_id.unique() 
# Flag training devices that have at least one event (1) or none (0).
gatrain['has_events']=gatrain.device_id.apply(lambda x:1 if x in s else 0) 
# NOTE(review): an earlier cell already restricted gatrain to has_events==1; if the
# notebook is run top to bottom this filter leaves no rows — confirm gatrain is
# reloaded before this cell.
gatrain=gatrain[gatrain['has_events']==0]
# Encode the 'group' target as integer class ids.
ytrain = gatrain['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain) 
# Accumulator sized by all rows of Xtrain, 12 target groups.
p_group = np.zeros((Xtrain.shape[0], 12))

## Prediction on training set for noevents

In [None]:
# Out-of-fold prediction for the no-events model: for each fold id, train on the
# other folds and predict the held-out rows.
# NOTE(review): nn_model() as last defined sizes its input layer from
# Xtrain_events.shape[1] — confirm that matches the width of Xtrain used here.
for i in range(1, nfolds+1):
    ## cv index
    # Index labels of gatrain are used as row positions into Xtrain / ytrain.
    inTr=gatrain.index[gatrain.fold!=i] 
    inTe=gatrain.index[gatrain.fold==i]
    ## train data
    xtr = Xtrain[inTr]
    ytr = ytrain[inTr]
    ## validation data
    xval = Xtrain[inTe]
    yval = ytrain[inTe]
    ## object to store predictions
    pred = np.zeros((xval.shape[0], 12))
    for j in range(nbags):
        model = nn_model()
        ## training
        # 40000/400 is a hard-coded number of batches per epoch.
        fit = model.fit_generator(generator = batch_generator(xtr,ytr, 400, True),
                                  nb_epoch = 30,
                                  steps_per_epoch = 40000/400,
                                  verbose = 0)
        ## prediction
        pred += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
    ## average predictions
    pred /= nbags
    p_group[inTe] = pred
    score = log_loss(yval, pred)
    print('Fold ', i, '-', score, '\n')

# Overall out-of-fold score.
score = log_loss(ytrain, p_group)
print('Total score', score)

In [None]:
#subsetting predictions which have no events in them
p_group=p_group[gatrain.index[gatrain.has_events==0]]
pred_train_noevents=pd.DataFrame(p_group,index=gatrain.device_id[gatrain.has_events==0]) 

pred_train_noevents.to_csv('pred_train_keras2_noevents.csv')

## Now predicting on test

## Loading train data,test data and prediction variable for events data

In [None]:
import scipy 
# Feature matrices restricted to devices with events (per the file names).
Xtrain=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_events.npz'))  
Xtest=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtest_events.npz')) 


In [None]:

# Ids of all devices that appear in the events table.
s=events.device_id.unique() 
# Flag training devices that have at least one event (1) or none (0).
gatrain['has_events']=gatrain.device_id.apply(lambda x:1 if x in s else 0) 
# NOTE(review): a preceding cell restricted gatrain to has_events==0; confirm gatrain
# is reloaded before this filter, otherwise no rows remain.
gatrain=gatrain[gatrain.has_events==1]
# Encode the 'group' target as integer class ids.
ytrain = gatrain['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain)

## Prediction on testing set for events data

In [None]:
# Fresh accumulator sized for the events test matrix. Without it, p_group still
# holds the training-set predictions from the previous section and the += below
# would mix (or fail on) mismatched shapes.
p_group = np.zeros((Xtest.shape[0], 12))

# Train nbags models on all events training rows and sum their test predictions.
# (The loop header previously read "or i in ...", which is a syntax error.)
for i in range(nbags):
    ## cv index

    ## train data
    xtr = Xtrain
    ytr = ytrain
    ## validation data
    xval = Xtest

    ## object to store predictions


    model = nn_model()
    ## training
    fit = model.fit_generator(generator = batch_generator(xtr,ytr, 200, True),
                                  nb_epoch = 80,
                                  samples_per_epoch = 5,
                                  verbose = 0)
    ## prediction
    p_group += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
## average predictions
p_group /= nbags
  

In [None]:
# Flag test devices with events, keep only those, and save their predictions
# indexed by device_id. p_group is expected to be row-aligned with the kept devices.
gatest['has_events']=gatest.device_id.apply(lambda x:1 if x in s else 0) 
gatest=gatest[gatest.has_events==1]
pred_test_events=pd.DataFrame(p_group,index=gatest.device_id) 

pred_test_events.to_csv('pred_test_keras2_events.csv')

## Loading train data and prediction variable for noevents

In [None]:
import scipy 
# Brand/model feature matrices covering all devices (per the file names).
Xtrain=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_all_brand_model.npz'))  
Xtest=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtest_all_brand_model.npz')) 

In [None]:
# CSR format so rows can be selected by index arrays in the following cells.
Xtest=Xtest.tocsr() 
Xtrain=Xtrain.tocsr()

In [None]:
# Encode the 'group' target as integer class ids.
# NOTE(review): gatrain was last restricted to has_events==1 while Xtrain here is the
# all-device brand/model matrix — confirm ytrain and Xtrain have the same number of rows.
ytrain = gatrain['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain)

In [None]:
#predicting on no events data so only retaining those values
gatest['has_events']=gatest.device_id.apply(lambda x:1 if x in s else 0) 
Xtest=Xtest[gatest.index[gatest['has_events']==0],:]
gatest=gatest[gatest.has_events==0] 
nbags = 10

p_group = np.zeros((gatest.shape[0], 12))

## Prediction on testing set for noevents data

In [None]:
# Train nbags models on the full training matrix and sum their test predictions.
# NOTE(review): the training arguments (batch 200, 80 epochs, 5 per epoch) equal those
# of the events model, while the no-events CV loop used batch 400 / 30 epochs — confirm.
for i in range(nbags):
    ## cv index

    ## train data
    xtr = Xtrain
    ytr = ytrain
    ## validation data
    xval = Xtest
    
    ## object to store predictions

    
    model = nn_model()
    ## training
    fit = model.fit_generator(generator = batch_generator(xtr,ytr, 200, True),
                                  nb_epoch = 80,
                                  samples_per_epoch = 5,
                                  verbose = 0)
    ## prediction
    p_group += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
    ## average predictions
# Turn the sum over bags into a mean.
p_group /= nbags
  


In [None]:
# Test predictions for the no-events devices, indexed by device_id.
pred_test_noevents=pd.DataFrame(p_group,index=gatest.device_id) 

pred_test_noevents.to_csv('pred_test_keras2_noevents.csv')

![title](Documents/input/keras2.png)

## 3rd keras Model

## Loading train data,test data and prediction variable

In [None]:
import scipy 
# Full feature matrices for the third Keras model.
Xtrain=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_all.npz'))  
Xtest=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtest_all.npz')) 

In [None]:
# Encode the 'group' target as integer class ids.
ytrain = gatrain['group']
label_group = LabelEncoder()
ytrain = label_group.fit_transform(ytrain)  
# Attach fold ids; the merged frame gets a fresh 0..n-1 index used for row selection.
# NOTE(review): gatrain was already merged with folds_5 (and filtered by has_events) in
# earlier cells — confirm it is reloaded before this cell so 'fold' is not duplicated.
gatrain=gatrain.merge(folds_5,on='device_id')
# Largest fold id. folds_5 is a whole DataFrame (device_id + fold), so np.max(folds_5)
# does not give the fold count; reduce the 'fold' column explicitly.
nfolds = int(folds_5['fold'].max())
nbags = 5 
# Out-of-fold class probabilities: one row per training row, 12 target groups.
p_group = np.zeros((Xtrain.shape[0], 12))

## Prediction on training set

In [None]:
# Out-of-fold prediction for the third Keras model: for each fold id, train on the
# other folds and predict the held-out rows.
for i in range(1, nfolds+1):
    ## cv index
    # Index labels of gatrain are used as row positions into Xtrain / ytrain.
    inTr=gatrain.index[gatrain.fold!=i] 
    inTe=gatrain.index[gatrain.fold==i]
    ## train data
    xtr = Xtrain[inTr]
    ytr = ytrain[inTr]
    ## validation data
    xval = Xtrain[inTe]
    yval = ytrain[inTe]
    ## object to store predictions
    pred = np.zeros((xval.shape[0], 12))
    for j in range(nbags):
        model = nn_model()
        ## training
        fit = model.fit_generator(generator = batch_generator(xtr,ytr, 200, True),
                                  nb_epoch = 200,
                                  steps_per_epoch = 5,
                                  verbose = 0)
        ## prediction
        pred += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
    ## average predictions
    pred /= nbags
    p_group[inTe] = pred
    score = log_loss(yval, pred)
    print('Fold ', i, '-', score, '\n')

# Overall out-of-fold score.
score = log_loss(ytrain, p_group)
print('Total score', score)

In [None]:
# Out-of-fold training predictions, indexed by device_id.
pred_train_keras3=pd.DataFrame(p_group,index=gatrain.device_id) 

pred_train_keras3.to_csv('pred_train_keras3.csv')

In [None]:
# Reset the accumulator for the test-set predictions (one row per test row, 12 groups).
p_group=np.zeros((Xtest.shape[0],12))

## Prediction on testing set

In [None]:
nbags=10
# Train nbags models on the full training set and sum their test predictions.
for i in range(nbags):
    ## cv index

    ## train data
    xtr = Xtrain
    ytr = ytrain
    ## validation data
    xval = Xtest
    
    ## object to store predictions

    
    model = nn_model()
    ## training
    fit = model.fit_generator(generator = batch_generator(xtr,ytr, 200, True),
                                  nb_epoch = 200,
                                  samples_per_epoch = 5,
                                  verbose = 0)
    ## prediction
    p_group += model.predict_generator(generator = batch_generatorp(xval, 800, False), steps = xval.shape[0]/800)
    ## average predictions
# Turn the sum over bags into a mean.
p_group /= nbags

In [None]:
# Test predictions indexed by device_id.
# NOTE(review): gatest was narrowed to no-events devices in an earlier section while
# p_group here has one row per row of Xtest — confirm gatest is the full test list.
pred_test_keras3=pd.DataFrame(p_group,index=gatest.device_id) 

pred_test_keras3.to_csv('pred_test_keras3.csv')

![title](Documents/input/keras3.png)

## 4th Keras model

## Loading train data,test data

In [None]:
import scipy 
Xtrain=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_all.npz'))  
Xtest=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtest_all.npz')) 

## New feature Model frequency

In [None]:
# Occurrence count of every device model in the phone table (index = model).
# NOTE(review): the next line relies on to_frame() naming the count column
# 'device_model', which depends on the pandas version — confirm.
model_freq = phone["device_model"].value_counts().to_frame()
# The counts themselves are label-encoded, so devices whose models are equally
# frequent share one code.
mf_encoder = LabelEncoder().fit(model_freq.device_model) 
# creating new feature model_frequency
model_freq['model_freq']=mf_encoder.transform(model_freq['device_model'])

# merging with train and test
gatrain=gatrain.merge(model_freq, how='left', left_on="device_model", right_index=True)
gatest=gatest.merge(model_freq, how='left', left_on="device_model", right_index=True)
gatest["model_freq"]=gatest["model_freq"].fillna(1) # fill not found frequencies with 1
# One-hot sparse matrix: row = device row number, column = frequency code.
# No explicit shape is passed, so each matrix's width is inferred from the largest
# code present in that frame — train and test widths may differ.

Xtr_model_freq = csr_matrix((np.ones(gatrain.shape[0]),
                       (gatrain.trainrow, gatrain["model_freq"])))
Xte_model_freq = csr_matrix((np.ones(gatest.shape[0]),
                       (gatest.testrow, gatest["model_freq"])))

print('Model frequency features: train shape {}, test shape {}'.format(Xtr_model_freq.shape, Xte_model_freq.shape))

## New feature brand frequency

In [None]:
# Occurrence count of every phone brand in the phone table (index = brand).
# NOTE(review): the next line relies on to_frame() naming the count column
# 'phone_brand', which depends on the pandas version — confirm.
brand_freq = phone["phone_brand"].value_counts().to_frame()
# The counts themselves are label-encoded, so equally frequent brands share one code.
bf_encoder = LabelEncoder().fit(brand_freq.phone_brand)
# creating new feature brand_frequency
brand_freq['brand_freq']=bf_encoder.transform(brand_freq['phone_brand'])

# merging with train and test
gatrain=gatrain.merge(brand_freq, how='left', left_on="phone_brand", right_index=True)
gatest=gatest.merge(brand_freq, how='left', left_on="phone_brand", right_index=True)
gatest["brand_freq"]=gatest["brand_freq"].fillna(1) # fill not found frequencies with 1
# One-hot sparse matrix: row = device row number, column = frequency code.
# No explicit shape is passed, so each matrix's width is inferred from the largest
# code present in that frame — train and test widths may differ.
Xtr_brand_freq = csr_matrix((np.ones(gatrain.shape[0]),
                       (gatrain.trainrow, gatrain.brand_freq)))

Xte_brand_freq = csr_matrix((np.ones(gatest.shape[0]),
                       (gatest.testrow, gatest.brand_freq)))

print('Brand frequency features: train shape {}, test shape {}'.format(Xtr_brand_freq.shape, Xte_brand_freq.shape))

## Number of events feature

In [None]:
# creating number of events feature and scaling it to 0 to 1 range 
events_cout = (events.groupby('device_id')['timestamp'].agg(['size'])
                    .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())
events_cout.size = (np.log((events_cout['size'])))
events_cout.size = events_cout.size/events_cout.size.max()
#sparse matrix of events feature
d = events_cout.dropna(subset=['trainrow'])
Xtr_eventsize = csr_matrix((d.iloc[:,1], (d.trainrow, np.zeros(d.shape[0]))),
                      shape=(gatrain.shape[0],1))

d = events_cout.dropna(subset=['testrow'])
Xte_eventsize = csr_matrix((d.iloc[:,1], (d.testrow, np.zeros(d.shape[0]))),
                      shape=(gatest.shape[0],1))
print('Labels data: train shape {}, test shape {}'.format(Xtr_eventsize.shape, Xte_eventsize.shape))

In [None]:
# Re-index the three frames by their id columns; the merges in the next cell join
# on these indexes.
events=events.set_index('event_id')
gatrain=gatrain.set_index('device_id') 
gatest=gatest.set_index('device_id')

## Hour feature bag of words

In [None]:
# #find hour from timestamp 
events['hour'] = events.timestamp.apply(lambda x: x.hour) 
#counting number of times events occur at an hour
events_cout_hourofday = (events.groupby(['device_id','hour'])['hour'].agg(['size'])
                    .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())
d = events_cout_hourofday.dropna(subset=['trainrow']) 
#creating sparse matrix of hour feature
# separate train and test subset and create sparse matrixes
Xtr_event_on_hourofday = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.hour)),
                      shape=(gatrain.shape[0],d.hour.nunique()))

d = events_cout_hourofday.dropna(subset=['testrow'])
Xte_event_on_hourofday = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.hour)),
                      shape=(gatest.shape[0],d.hour.nunique()))
print('Labels data: train shape {}, test shape {}'.format(Xtr_event_on_hourofday.shape, Xte_event_on_hourofday.shape))
# #find hour from timestamp 
events['hour'] = events.timestamp.apply(lambda x: x.hour) 
#counting number of times events occur at an hour
events_cout_hourofday = (events.groupby(['device_id','hour'])['hour'].agg(['size'])
                    .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())
d = events_cout_hourofday.dropna(subset=['trainrow']) 
#creating sparse matrix of hour feature
# separate train and test subset and create sparse matrixes
Xtr_event_on_hourofday = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.hour)),
                      shape=(gatrain.shape[0],d.hour.nunique()))

d = events_cout_hourofday.dropna(subset=['testrow'])
Xte_event_on_hourofday = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.hour)),
                      shape=(gatest.shape[0],d.hour.nunique()))
print('Labels data: train shape {}, test shape {}'.format(Xtr_event_on_hourofday.shape, Xte_event_on_hourofday.shape))


## tf-idf feature

In [None]:
# For every app, join all of its label ids into one space-separated string.
app_lab = applabels.groupby("app_id")["label_id"].apply(
    lambda x: " ".join(str(s) for s in x))

#joining all applabels together and creating a feature
print("# Read App Events") 
appevents['app_lab'] = np.nan
# Look up each app's label string, then join those strings per event.
appevents["app_lab"] = appevents["app_id"].map(app_lab)
# Note: appevents is rebound from a DataFrame to a Series indexed by event_id.
appevents = appevents.groupby("event_id")["app_lab"].apply(
    lambda x: " ".join(str(s) for s in x))

In [None]:
# Bring event_id back as a column so it can be mapped to the per-event label strings.
events=events.reset_index()
events['app_lab']=np.nan
events["app_lab"] = events["event_id"].map(appevents) 
#all app labels are joined together for one device
# Note: events is rebound from a DataFrame to a Series indexed by device_id;
# events without a mapped string contribute the text "nan" via str().
events = events.groupby("device_id")["app_lab"].apply(
    lambda x: " ".join(str(s) for s in x))

In [None]:
def _tfidf_bag(train, test, columns):
    """Fit one TF-IDF vocabulary on train+test and return the two row blocks.

    The given columns are cast to text and joined with spaces into one document
    per row; the vectorizer is fitted on the stacked frame so both parts share
    the same columns.

    Returns (train_matrix, test_matrix), split at len(train).
    """
    df = pd.concat((train, test), axis=0, ignore_index=True)
    split_len = len(train)

    # TF-IDF Feature
    tfv = TfidfVectorizer(min_df=1)
    # The builtin `str` replaces the `np.str` alias, which no longer exists in
    # current NumPy releases.
    docs = df[columns].astype(str).apply(
        lambda x: " ".join(s for s in x), axis=1).fillna("Missing")
    df_tfv = tfv.fit_transform(docs)

    return df_tfv[:split_len, :], df_tfv[split_len:, :]


def get_hash_data(train, test):
    """TF-IDF bag over brand, model and app-label text; returns (train, test) matrices."""
    return _tfidf_bag(train, test, ["phone_brand", "device_model", "app_lab"])


def get_hash_data2(train, test):
    """TF-IDF bag over brand and model text only; returns (train, test) matrices."""
    return _tfidf_bag(train, test, ["phone_brand", "device_model"])

In [None]:
# Row-number helpers for train, test and the stacked train+test frame.
trainrow = np.arange(gatrain.shape[0])
testrow = np.arange(gatest.shape[0])
superrow= np.arange(gatrain.shape[0]+ gatest.shape[0])


#bags for all data
train_bag, test_bag = get_hash_data(gatrain,gatest)

#bags only brand and model:
train_bag2, test_bag2 = get_hash_data2(gatrain,gatest)

# gatrain and gatest are intentionally NOT deleted here: the following cells still
# read gatrain.group, gatest.shape and the has_events flags, so deleting them made
# the rest of the notebook fail with NameError.


## Concat all features

In [None]:
# Full feature set: base matrix + brand/model frequency + event count + hour of day
# + TF-IDF bag over brand, model and app labels.
Xtrain = hstack((Xtrain, Xtr_brand_freq, Xtr_model_freq,Xtr_eventsize,Xtr_event_on_hourofday,
                 train_bag), format='csr')
Xtest =  hstack((Xtest, Xte_brand_freq, Xte_model_freq,Xte_eventsize,Xte_event_on_hourofday,
                 test_bag,), format='csr')
Xtrain_bm=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtrain_all_brand_model.npz')) 
Xtest_bm=scipy.sparse.load_npz(os.path.join(RAW_DATA_DIR,'Xtest_all_brand_model.npz'))

# "No events" feature set: only brand/model derived features.
Xtrain_ne = hstack((Xtrain_bm,Xtr_brand_freq, Xtr_model_freq, train_bag2), format='csr')
Xtest_ne =  hstack((Xtest_bm,Xte_brand_freq, Xte_model_freq, test_bag2), format='csr')

print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

# Reduce dimensionality
# Keep only the columns that have at least one non-zero entry in the training
# matrix; the same column selection is applied to the test matrix.
indices = np.nonzero(Xtrain)
columns_non_unique = indices[1]
unique_columns = sorted(set(columns_non_unique))
Xtrain=Xtrain.tocsc()[:,unique_columns]
Xtest=Xtest.tocsc()[:,unique_columns]

print('All features after dimensionality reduction: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))


In [None]:
# Integer class ids for the 'group' target and the number of distinct classes.
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)

##Keras stuff
# One-hot version of y, as needed by a categorical_crossentropy loss.
dummy_y = np_utils.to_categorical(y)

## model for training

In [None]:
def baseline_model(num_columns):
    """Build and compile the fourth Keras network for `num_columns` input features.

    Layout: input dropout -> Dense(75) + PReLU -> Dense(50, tanh) + PReLU ->
    softmax over 12 classes, with dropout between the stages. Compiled with
    categorical cross-entropy (one-hot targets) and the adadelta optimizer.
    Returns the compiled model.
    """
    stack = [
        # Dropout applied directly to the input features.
        Dropout(0.4, input_shape=(num_columns,)),
        Dense(75),
        PReLU(),
        Dropout(0.30),
        Dense(50, init='normal', activation='tanh'),
        PReLU(),
        Dropout(0.20),
        Dense(12, init='normal', activation='softmax'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model

# Fold assignment table for the 10-fold split (per the file name).
folds= pd.read_csv(os.path.join(RAW_DATA_DIR,'folds_10.csv')) 
# Prediction stores with 2*nclasses columns. The visible part of the loop below
# fills columns 0:12 with logistic-regression probabilities; the second half is
# presumably for the Keras model — TODO confirm.
pred = np.zeros((y.shape[0],nclasses*2))
pred_test = np.zeros((gatest.shape[0],nclasses*2))
# Number of distinct fold ids.
n_folds=len(folds["fold"].unique())

In [None]:
# Ids of all devices that appear in the events table.
# NOTE(review): earlier cells rebind `events` to a Series indexed by device_id and move
# device_id into the index of gatrain / gatest — confirm device_id is available as a
# column here.
s=events.device_id.unique() 
# Flag train and test devices that have at least one event (1) or none (0).
gatrain['has_events']=gatrain.device_id.apply(lambda x:1 if x in s else 0)  
gatest['has_events']=gatest.device_id.apply(lambda x:1 if x in s else 0) 
# Attach the fold assignment to every training device.
gatrain=gatrain.merge(folds,on='device_id')

In [None]:
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order (and any repeats) of ``lst1`` is kept. ``lst2`` is converted to
    a set first so each membership check is a hash lookup.
    """
    lookup = set(lst2)
    common = []
    for item in lst1:
        if item in lookup:
            common.append(item)
    return common

## Predicting on train and test using Logistic Regression and Keras

In [None]:
# Cross-validated stacking loop. For every fold it fits two logistic
# regressions and two Keras networks (one on the no-events feature set, one on
# the full feature set), stores out-of-fold predictions in `pred` and
# accumulates test predictions in `pred_test`.
# Column layout of pred / pred_test: 0:12 logistic regression, 12:24 Keras.
# NOTE(review): row labels of gatrain/gatest are used as row positions in the
# feature matrices and in y — assumes both frames carry a default 0..n-1 index.
for fold_id in range(1, n_folds + 1):
    # Row labels for this split: every other fold trains, this fold validates.
    inTr = gatrain.index[gatrain.fold != fold_id]
    inTe = gatrain.index[gatrain.fold == fold_id]

    # Complete train / validation row lists for the fold. The code below reads
    # `train_id` and `valid_id`, which were never assigned in this cell, so
    # they are derived here from the fold split on every iteration.
    train_id = list(inTr)
    valid_id = list(inTe)

    # With no events
    train_id_ne = intersection(train_id, list(gatrain.index[gatrain["has_events"] == 0]))
    valid_id_ne = intersection(valid_id, list(gatrain.index[gatrain["has_events"] == 0]))
    test_id_ne = list(gatest.index[gatest['has_events'] == 0])

    # With events: Training using only common features
    train_id_we = intersection(train_id, list(gatrain.index[gatrain["has_events"] == 1]))
    valid_id_we = intersection(valid_id, list(gatrain.index[gatrain["has_events"] == 1]))
    test_id_we = list(gatest.index[gatest['has_events'] == 1])

    # First, train on all data, but only no-events feature. Validate with no events:
    Xtr, Ytr = Xtrain_ne[train_id, :], y[train_id]
    Xva, Yva = Xtrain_ne[valid_id_ne, :], y[valid_id_ne]

    # Logistic regression >
    clf1 = LogisticRegression(C=0.06, multi_class='multinomial', solver='lbfgs')  # 2.38715733092
    # Fitting logistic regression 1
    clf1.fit(Xtr, Ytr)

    # Predicting only in those with no events!
    pred[valid_id_ne, 0:12] = clf1.predict_proba(Xva)
    # Test predictions are summed over folds here and averaged afterwards.
    pred_test[test_id_ne, 0:12] = pred_test[test_id_ne, 0:12] + clf1.predict_proba(Xtest_ne[test_id_ne, :])

    score_val = log_loss(Yva, pred[valid_id_ne, 0:12])
    print("No-events: Logistic logloss for fold {} is {}".format(fold_id, score_val))

    # 2.- After, train only rows with events
    Xtr, Ytr = Xtrain[train_id_we, :], y[train_id_we]
    Xva, Yva = Xtrain[valid_id_we, :], y[valid_id_we]

    clf2 = LogisticRegression(C=0.016, multi_class='multinomial', solver='lbfgs')  # 1.99914889909
    clf2.fit(Xtr, Ytr)

    # Predicting only in those with events!
    pred[valid_id_we, 0:12] = clf2.predict_proba(Xva)
    pred_test[test_id_we, 0:12] = pred_test[test_id_we, 0:12] + clf2.predict_proba(Xtest[test_id_we, :])

    score_val = log_loss(Yva, pred[valid_id_we, 0:12])
    print("With-events: Logistic logloss for fold {} is {}".format(fold_id, score_val))

    # Score the whole validation fold (no-events and with-events rows together).
    Xva, Yva = Xtrain[valid_id, :], y[valid_id]
    score_val = log_loss(Yva, pred[valid_id, 0:12])
    print("Total: Logistic logloss for fold {} is {}".format(fold_id, score_val))

    ## Fitting Keras! ------------------------------------------------------------------>
    # First, train on all data, but only no-events feature. Validate with no events:
    Xtr, Ytr_dum = Xtrain_ne[train_id, :], dummy_y[train_id]
    Xva, Yva_dum = Xtrain_ne[valid_id_ne, :], dummy_y[valid_id_ne]

    model = baseline_model(Xtr.shape[1])
    fit = model.fit_generator(generator=batch_generator(Xtr, Ytr_dum, 381, True),
                              nb_epoch=20,
                              steps_per_epoch=Xtr.shape[0]/381, verbose=2,
                              validation_data=(Xva.todense(), Yva_dum)
                              )

    # evaluate the model
    # pred has 24 columns, so the Keras block is 12:24 (the old 12:25 slice
    # was silently clipped to the same range).
    pred[valid_id_ne, 12:24] = model.predict_generator(generator=batch_generatorp(Xva, 400, False),
                                                       steps=Xva.shape[0]/400)
    pred_test[test_id_ne, 12:24] = pred_test[test_id_ne, 12:24] + \
                                   model.predict_generator(
                                       generator=batch_generatorp(Xtest_ne[test_id_ne, :], 400, False),
                                       steps=Xtest_ne[test_id_ne, :].shape[0]/400)

    # 2.- After, train all data (keras)
    Xtr, Ytr_dum = Xtrain[train_id, :], dummy_y[train_id]
    Xva, Yva_dum = Xtrain[valid_id_we, :], dummy_y[valid_id_we]

    model = baseline_model(Xtr.shape[1])
    fit = model.fit_generator(generator=batch_generator(Xtr, Ytr_dum, 381, True),
                              nb_epoch=20,
                              steps_per_epoch=Xtr.shape[0]/381, verbose=2,
                              validation_data=(Xva.todense(), Yva_dum)
                              )

    # evaluate the model, and predict only with events:
    pred[valid_id_we, 12:24] = model.predict_generator(generator=batch_generatorp(Xva, 400, False),
                                                       steps=Xva.shape[0]/400)
    pred_test[test_id_we, 12:24] = pred_test[test_id_we, 12:24] + \
                                   model.predict_generator(generator=batch_generatorp(Xtest[test_id_we, :], 400, False),
                                                           steps=Xtest[test_id_we, :].shape[0]/400)

    # Keras score over the whole validation fold.
    Xva, Yva = Xtrain[valid_id, :], y[valid_id]
    score_val = log_loss(Yva, pred[valid_id, 12:24])
    print("Total: Keras logloss for fold {} is {}".format(fold_id, score_val))

In [None]:
## Averaging predictions for all folds in the test set
# NOTE: this divides in place, so running the cell twice divides twice.
pred_test /= float(n_folds)
# Test rows without events, computed here instead of relying on the variable
# left behind by the last iteration of the cross-validation loop.
test_id_ne = list(gatest.index[gatest['has_events'] == 0])
# Start from the Keras block (columns 12:24). The copy matters: a plain slice
# is a view, and the assignment below would then overwrite pred_test itself.
pred_test_mix = pred_test[:, 12:24].copy()
# For devices without events use the logistic-regression block (columns 0:12).
pred_test_mix[test_id_ne, 0:12] = pred_test[test_id_ne, 0:12]
pred_test_keras4 = pd.DataFrame(pred_test_mix, index=gatest.device_id)

pred_test_keras4.to_csv('pred_test_cv10_keras4.csv')

In [None]:
 
# Out-of-fold train predictions: Keras block (columns 12:24) for every row,
# replaced by the logistic-regression block (columns 0:12) for no-event rows.
# All no-event training rows are selected here. The `train_id_ne` left over
# from the loop only covers the training part of the last fold, so no-event
# rows validated in that fold were never replaced.
train_rows_ne = list(gatrain.index[gatrain['has_events'] == 0])
# Copy so that the assignment does not write through a view into `pred`.
pred_mix = pred[:, 12:24].copy()
pred_mix[train_rows_ne, 0:12] = pred[train_rows_ne, 0:12]
pred_train_keras4 = pd.DataFrame(pred_mix, index=gatrain.device_id)

pred_train_keras4.to_csv('pred_train_cv10_keras4.csv')
pred_test_keras4.to_csv('pred_test_cv10_keras4.csv')

In [None]:
 
# The frame has one column per class and keeps device_id in its index, so only
# the class names are assigned as column labels (prepending 'device_id' made
# the label list one longer than the number of columns).
pred_test_keras4.columns = list(targetencoder.classes_)
# Write the index as the leading 'device_id' column; index=False would drop it.
pred_test_keras4.to_csv('pred_test_cv10_keras4.csv', index=True, index_label='device_id')

![title](Documents/input/keras4.png)