# GENERAL FEATURES FOR KERAS MODELS

In [1]:
import os
import sys
from os import path
import numpy as np
import pandas as pd
from scipy import sparse, io
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


In [2]:
# Directory holding the raw competition CSVs. Set the RAW_DATA_DIR environment
# variable to point somewhere else; the original local path stays the default.
RAW_DATA_DIR = os.environ.get('RAW_DATA_DIR', 'C:\\Users\\RISHABH\\Documents\\input')

## LOADING DATA

In [15]:
def _raw_csv(filename, **read_kwargs):
    """Read one CSV file from RAW_DATA_DIR, forwarding keyword options to pandas.read_csv."""
    return pd.read_csv(os.path.join(RAW_DATA_DIR, filename), **read_kwargs)


# Train / test device tables.
gatrain = _raw_csv('gender_age_train.csv')
gatest = _raw_csv('gender_age_test.csv')

# Keep only the first phone record per device so later joins on device_id
# cannot multiply rows.
phone = _raw_csv('phone_brand_device_model.csv').drop_duplicates('device_id', keep='first')

# Event log; the timestamp column is parsed into datetimes while reading.
events = _raw_csv('events.csv', parse_dates=['timestamp'], infer_datetime_format=True)

# App usage per event; the two flag columns are read as booleans.
appevents = _raw_csv('app_events.csv', dtype={'is_installed': bool, 'is_active': bool})

# App -> label mapping and the label category table.
applabels = _raw_csv('app_labels.csv')
labelcat = _raw_csv('label_categories.csv')

## CREATE HAS_EVENTS FEATURES

In [10]:
# Distinct device ids that appear at least once in the events log.
s = events.device_id.unique()
# Create the has_events feature (1 = device has events, 0 = it does not) for
# train and test. `isin` performs one vectorised membership test per frame;
# the previous `x in s` inside `.apply` rescanned the whole NumPy array for
# every row.
gatrain['has_events'] = gatrain.device_id.isin(s).astype('int64')
gatest['has_events'] = gatest.device_id.isin(s).astype('int64')

In [16]:
# Tag every device with its running row number (trainrow / testrow), which is
# later used as the row coordinate of the sparse feature matrices, then attach
# the phone brand / model columns by joining on device_id.
gatrain = (gatrain
           .assign(trainrow=np.arange(len(gatrain)))
           .merge(phone, on='device_id'))
gatest = (gatest
          .assign(testrow=np.arange(len(gatest)))
          .merge(phone, on='device_id'))

## BOW OF BRAND

In [13]:
import pickle

# Fit a single encoder on the brands of train and test together so both
# matrices share the same brand -> column mapping.
brandencoder = LabelEncoder()
brandencoder.fit(np.append(gatrain.phone_brand.values, gatest.phone_brand.values))
# Persist the fitted encoder; the context manager closes the file handle.
with open('brandencoder.sav', 'wb') as fh:
    pickle.dump(brandencoder, fh)

# Store the encoded labels in a separate 'brand' column. The raw 'phone_brand'
# strings must stay intact: the model features concatenate them with
# device_model via `.str.cat`, which fails on an integer column. Keeping the
# raw column also makes this cell safe to re-run.
gatrain['brand'] = brandencoder.transform(gatrain['phone_brand'])
gatest['brand'] = brandencoder.transform(gatest['phone_brand'])
# Column coordinates must be integers.
row = gatrain['brand'].astype(int)
row2 = gatest['brand'].astype(int)

# One-hot sparse matrices: entry (device row, brand label) = 1. The shape is
# given explicitly so train and test always have the same number of columns,
# even when the highest brand label occurs in only one of them.
nbrands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, row)),
                       shape=(gatrain.shape[0], nbrands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, row2)),
                       shape=(gatest.shape[0], nbrands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


## BOW OF MODEL

In [17]:
# The encoded key is brand name + model name concatenated, built from every
# record in the phone table.
m = phone.phone_brand.str.cat(phone.device_model)

modelencoder = LabelEncoder().fit(m)
# Persist the fitted encoder; the context manager closes the file handle.
with open('modelencoder.sav', 'wb') as fh:
    pickle.dump(modelencoder, fh)

# Convert brand+model strings to integer labels.
# NOTE: `.str.cat` requires 'phone_brand' to hold the raw brand strings here.
gatrain['model'] = modelencoder.transform(gatrain['phone_brand'].str.cat(gatrain.device_model))
gatest['model'] = modelencoder.transform(gatest['phone_brand'].str.cat(gatest.device_model))
# Column coordinates must be integers.
row = gatrain['model'].astype(int)
row2 = gatest['model'].astype(int)

# One-hot sparse matrices: entry (device row, model label) = 1. The shape is
# given explicitly so train and test always have the same number of columns.
nmodels = len(modelencoder.classes_)
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, row)),
                       shape=(gatrain.shape[0], nmodels))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, row2)),
                       shape=(gatest.shape[0], nmodels))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))


Model features: train shape (74645, 1667), test shape (112071, 1667)


## TRAIN AND TEST DATASET FOR NOEVENTS FEATURE

In [50]:
# Index values of the train devices with and without events. They are used
# directly as matrix row positions, which assumes gatrain's index matches the
# row order of Xtr_brand / Xtr_model.
train_event_rows = gatrain.index[gatrain.has_events == 1]
train_noevent_rows = gatrain.index[gatrain.has_events == 0]

# Brand + model features side by side: all devices, then each subset.
Xtrain_all_brand_model = hstack((Xtr_brand, Xtr_model))
Xtrain_events_brand_model = hstack(
    [features[train_event_rows, :] for features in (Xtr_brand, Xtr_model)])
Xtrain_noevents_brand_model = hstack(
    [features[train_noevent_rows, :] for features in (Xtr_brand, Xtr_model)])

In [13]:
# Index values of the test devices with and without events. They are used
# directly as matrix row positions, which assumes gatest's index matches the
# row order of Xte_brand / Xte_model.
test_event_rows = gatest.index[gatest.has_events == 1]
test_noevent_rows = gatest.index[gatest.has_events == 0]

# Brand + model features side by side: all devices, then each subset.
Xtest_all_brand_model = hstack((Xte_brand, Xte_model))
Xtest_events_brand_model = hstack(
    [features[test_event_rows, :] for features in (Xte_brand, Xte_model)])
Xte_noevents_brand_model = hstack(
    [features[test_noevent_rows, :] for features in (Xte_brand, Xte_model)])

## BOW FOR APPS

In [19]:
# Index the event log by event_id and both device tables by device_id so the
# joins in the following cells can match on the index.
events = events.set_index('event_id')
gatrain, gatest = (frame.set_index('device_id') for frame in (gatrain, gatest))

In [20]:
# Encode every app_id seen in app_events as an integer column index.
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
# Persist the fitted encoder; the context manager closes the file handle.
with open('appencoder.sav', 'wb') as fh:
    pickle.dump(appencoder, fh)
napps = len(appencoder.classes_)

# One row per (device_id, app) pair:
#   1. look up the device behind each app event (events is indexed by event_id),
#   2. count events per (device_id, app),
#   3. left-join the train / test row numbers (both frames are indexed by
#      device_id); a device missing from a split gets NaN in that column.
deviceapps = (appevents.merge(events[['device_id']], how='left', left_on='event_id', right_index=True)
                       .groupby(['device_id', 'app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())

# Sparse bag-of-apps matrices: entry (device row, app) = 1 when the device has
# the app. The left joins upcast the row numbers to float (because of NaN), so
# they are cast back to int after dropping the devices outside each split.
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow.astype(int), d.app)),
                     shape=(gatrain.shape[0], napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow.astype(int), d.app)),
                     shape=(gatest.shape[0], napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))



Apps data: train shape (74645, 19237), test shape (112071, 19237)


## BOW FOR LABELS

In [22]:
# Keep only the labels of apps that occur in app_events. `.copy()` makes the
# filtered frame independent, so the column assignments below do not write to
# a slice of the original (SettingWithCopyWarning).
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
# Map app ids to the same integer codes used for the app features.
applabels['app'] = appencoder.transform(applabels.app_id)

# Encode label ids as integer column indices.
labelencoder = LabelEncoder().fit(applabels.label_id)
# Persist the fitted encoder; the context manager closes the file handle.
with open('labelcoder.sav', 'wb') as fh:
    pickle.dump(labelencoder, fh)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

# One row per (device_id, label) pair: expand each device's apps to their
# labels, count per pair, then left-join the train / test row numbers
# (NaN when the device is not in that split).
devicelabels = (deviceapps[['device_id', 'app']]
                .merge(applabels[['app', 'label']])
                .groupby(['device_id', 'label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())

# Sparse bag-of-labels matrices: entry (device row, label) = 1. Row numbers
# are cast back to int because the left joins upcast them to float.
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow.astype(int), d.label)),
                       shape=(gatrain.shape[0], nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow.astype(int), d.label)),
                       shape=(gatest.shape[0], nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


In [16]:
# Report the shapes of the train and test app feature matrices.
app_shapes = (Xtr_app.shape, Xte_app.shape)
print('Apps data: train shape {}, test shape {}'.format(*app_shapes))


Apps data: train shape (74645, 19237), test shape (112071, 19237)


## TRAIN AND TEST FOR ALL DATA

In [17]:
# Concatenate the four feature groups column-wise (brand, model, app, label)
# into one CSR matrix per split.
train_parts = [Xtr_brand, Xtr_model, Xtr_app, Xtr_label]
test_parts = [Xte_brand, Xte_model, Xte_app, Xte_label]
Xtrain_all = hstack(train_parts, format='csr')
Xtest_all = hstack(test_parts, format='csr')

In [18]:
# Display the stacked train matrix (shape, dtype and stored-element count).
Xtrain_all

<74645x21527 sparse matrix of type '<class 'numpy.float64'>'
	with 2707712 stored elements in Compressed Sparse Row format>

In [62]:
# Move the index back into a regular column on both device tables, leaving
# each frame with a fresh 0..n-1 index.
gatest, gatrain = (frame.reset_index() for frame in (gatest, gatrain))

In [None]:
# Rows of the full feature matrices that belong to devices with events.
# The index values are used as matrix row positions, so this relies on the
# frames having the fresh 0..n-1 index produced by reset_index().
Xtrain_events = Xtrain_all[gatrain.index[gatrain.has_events == 1], :]
# Slice the full test matrix; the earlier version sliced Xtest_events from
# itself before it was ever defined, which raises NameError.
Xtest_events = Xtest_all[gatest.index[gatest.has_events == 1], :]

In [None]:
# Display the index values of the train devices flagged has_events == 1.
gatrain.index[gatrain.has_events==1]