In [18]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD,Adagrad
from keras.layers.advanced_activations import PReLU
from sklearn.cross_validation import train_test_split
from keras.callbacks import EarlyStopping,TensorBoard
from statistics import mean
from keras.models import load_model

In [3]:
# Directory that holds the input CSV files; change it here to relocate the data.
DATA_DIR = "data"

# Train / test device tables, indexed by device_id.
gatrain = pd.read_csv(os.path.join(DATA_DIR, "gender_age_train.csv"), index_col='device_id')
gatest = pd.read_csv(os.path.join(DATA_DIR, "gender_age_test.csv"), index_col='device_id')
# Phone brand / model per device (de-duplicated and indexed in a later cell).
phone = pd.read_csv(os.path.join(DATA_DIR, "phone_brand_device_model.csv"))
# App -> label mapping and the label category table.
app_label = pd.read_csv(os.path.join(DATA_DIR, "app_labels.csv"))
label_cat = pd.read_csv(os.path.join(DATA_DIR, "label_categories.csv"))
# Apps seen per event; is_active is read as a boolean column.
app_events = pd.read_csv(os.path.join(DATA_DIR, "app_events.csv"), dtype={'is_active': bool})
# Events indexed by event_id, with timestamp parsed to datetime on load.
events = pd.read_csv(os.path.join(DATA_DIR, "events.csv"), parse_dates=['timestamp'], index_col='event_id')

In [4]:
# Keep the first row per device_id and make device_id the index.
# The guard keeps the cell idempotent: once device_id is the index, re-running
# the cell would otherwise raise KeyError because the column no longer exists.
if phone.index.name != 'device_id':
    phone = phone.drop_duplicates('device_id', keep='first').set_index('device_id')

In [27]:
def _indicator_matrix(rows, cols, shape):
    """Return a CSR matrix of the given shape with a 1 at each (rows[i], cols[i])."""
    return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=shape)


def _join_tokens(values):
    """Join values into one space-separated string, prefixing each with '0'.

    The prefix presumably exists so that single-character values survive the
    default tokenizer of the scikit-learn text vectorizers, which only keeps
    tokens of two or more characters.
    """
    return " ".join("0" + str(v) for v in values)


def _average_saved_models(path_prefix, features, n_classes, n_models=5):
    """Average the class probabilities predicted by a set of saved Keras models.

    Models are read with ``load_model`` from ``path_prefix + '1'`` up to
    ``path_prefix + str(n_models)``. Returns an array of shape
    ``(features.shape[0], n_classes)`` holding the element-wise mean of their
    ``predict_proba(features)`` outputs.
    """
    total = np.zeros((features.shape[0], n_classes))
    for fold in range(1, n_models + 1):
        # Models are loaded one at a time instead of being collected in a list first.
        total += load_model(path_prefix + str(fold)).predict_proba(features)
    return total / n_models


def pipeline(gatrain,gatest,phone,app_label,label_cat,app_events,events,file):
    """Build test-set features, score them with the saved models and write a CSV.

    Test devices are split into those that appear in ``events`` and those that
    do not. Devices without events are scored on one-hot phone brand / model
    features by the models under ``saved_models/noevents``; devices with events
    are scored on the full feature set by the two model families under
    ``saved_models/events`` whose averaged predictions are blended 50/50.
    Both prediction tables are concatenated and written to ``file``.

    Parameters
    ----------
    gatrain : DataFrame indexed by device_id with a ``group`` column. Used to
        fit the target encoder and every text vectorizer / scaler.
    gatest : DataFrame indexed by device_id (the devices to predict).
    phone : DataFrame indexed by device_id with ``phone_brand`` and
        ``device_model`` columns.
    app_label : DataFrame with ``app_id`` and ``label_id`` columns.
    label_cat : unused; kept so existing calls keep working.
    app_events : DataFrame with ``event_id``, ``app_id`` and ``is_active``.
    events : DataFrame indexed by event_id with ``device_id``, a datetime
        ``timestamp``, ``latitude`` and ``longitude``.
    file : str, path of the CSV to write (its directory is created if missing).

    Side effects
    ------------
    Columns are added in place to the caller's frames: ``trainrow``/``testrow``,
    ``brand`` and ``model`` on ``gatrain``/``gatest``; ``brand`` and ``model``
    on ``phone``; ``app`` on ``app_events``; ``hour``, ``hourbin``, ``day`` and
    ``apps_active`` on ``events``. Progress messages are printed.
    """
    start = datetime.now()

    # Fail fast: make sure the output directory exists before minutes of compute.
    out_dir = os.path.dirname(file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Split devices by whether they have at least one event. Explicit copies
    # make the column assignments below operate on independent frames.
    devices_with_events = events["device_id"].values
    test_has_events = np.in1d(gatest.index, devices_with_events)
    gatrain_events = gatrain[np.in1d(gatrain.index, devices_with_events)].copy()
    gatest_events = gatest[test_has_events].copy()
    gatest_noevents = gatest[~test_has_events].copy()

    ##############################DATA Engineering##########################################
    # Row positions on the caller's full frames (kept for backward compatibility).
    gatrain['trainrow'] = np.arange(gatrain.shape[0])
    gatest['testrow'] = np.arange(gatest.shape[0])
    # Row positions inside each test subset index the sparse matrices built below.
    gatest_events['testrow'] = np.arange(gatest_events.shape[0])
    gatest_noevents['testrow'] = np.arange(gatest_noevents.shape[0])

    # Integer codes for brand and for the brand+model concatenation.
    brandencoder = LabelEncoder().fit(phone.phone_brand)
    phone['brand'] = brandencoder.transform(phone['phone_brand'])
    nbrand = len(brandencoder.classes_)
    brand_model = phone.phone_brand.str.cat(phone.device_model)
    modelencoder = LabelEncoder().fit(brand_model)
    phone['model'] = modelencoder.transform(brand_model)
    nmodel = len(modelencoder.classes_)

    # One row per (device, app) pair, tagged with the test row of the device
    # (NaN for devices that are not test devices with events).
    appencoder = LabelEncoder().fit(app_events['app_id'])
    app_events['app'] = appencoder.transform(app_events['app_id'])
    napps = len(appencoder.classes_)
    deviceapps = (app_events.merge(events[['device_id']], how='left', left_on='event_id', right_index=True)
                            .groupby(['device_id', 'app'])['app'].agg(['size'])
                            .merge(gatest_events[['testrow']], how='left', left_index=True, right_index=True)
                            .reset_index())

    # One row per (device, app label) pair, restricted to apps seen in app_events.
    app_label = app_label.loc[app_label.app_id.isin(app_events.app_id.unique())].copy()
    app_label['app'] = appencoder.transform(app_label.app_id)
    labelencoder = LabelEncoder().fit(app_label.label_id)
    app_label['label'] = labelencoder.transform(app_label.label_id)
    nlabels = len(labelencoder.classes_)
    devicelabels = (deviceapps[['device_id', 'app']]
                    .merge(app_label[['app', 'label']])
                    .groupby(['device_id', 'label'])['app'].agg(['size'])
                    .merge(gatest_events[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())

    # Per-device "documents" of event hours, 6-hour bins (hour 0 falls in bin 4)
    # and weekdays (Sunday=0 ... Saturday=6).
    events['hour'] = events['timestamp'].dt.hour
    events['hourbin'] = [1 if 1 <= h <= 6 else 2 if 7 <= h <= 12 else 3 if 13 <= h <= 18 else 4
                         for h in events['hour']]
    events['hour'] = events['hour'].astype(str)
    events['hourbin'] = events['hourbin'].astype(str)
    hourjoin = events.groupby("device_id")["hour"].apply(_join_tokens)
    hourbinjoin = events.groupby("device_id")["hourbin"].apply(_join_tokens)
    events['day'] = events['timestamp'].dt.day_name().map(
        {'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6})
    daysjoin = events.groupby("device_id")["day"].apply(_join_tokens)

    # Median location per device, plus a 10-way KMeans cluster id of that location.
    # NOTE(review): the clustering is refit on every call; it presumably has to
    # reproduce the clusters used when the saved models were trained -- confirm.
    median_lat = events.groupby("device_id")["latitude"].agg('median')
    median_lon = events.groupby("device_id")["longitude"].agg('median')
    com = pd.concat([median_lat, median_lon], axis=1)
    kmeans = KMeans(n_clusters=10, random_state=0).fit(com)
    clustered_geo_features = pd.Series(kmeans.labels_, index=median_lon.index)

    # Per-device string of the is_active flags of every app in every event;
    # events without an app_events entry map to NaN and are skipped.
    apps = app_events.groupby("event_id")["is_active"].apply(lambda x: " ".join(str(s) for s in x))
    events["apps_active"] = events.index.map(apps)
    active_apps_events = events.groupby("device_id")["apps_active"].apply(
        lambda x: " ".join(str(s) for s in x if str(s)!='nan'))
    print("Data Preparation complete,time elapsed:",datetime.now()-start)

    ######################without_events###############################################
    # Encoded phone columns on the caller's full frames (kept for backward compatibility).
    gatrain['model'] = phone['model']
    gatest['model'] = phone['model']
    gatrain['brand'] = phone['brand']
    gatest['brand'] = phone['brand']

    targetencoder = LabelEncoder().fit(gatrain.group)
    nclasses = len(targetencoder.classes_)

    gatest_noevents['model'] = phone['model']
    gatest_noevents['brand'] = phone['brand']
    n_noevents = gatest_noevents.shape[0]
    # Widths are pinned to the encoder sizes, as in the with-events branch, so
    # the matrix width does not depend on which codes occur in this subset.
    # NOTE(review): assumes the saved no-events models expect nbrand + nmodel
    # input columns -- confirm against the training notebook.
    xtest_noevents = hstack((
        _indicator_matrix(gatest_noevents.testrow, gatest_noevents.brand, (n_noevents, nbrand)),
        _indicator_matrix(gatest_noevents.testrow, gatest_noevents.model, (n_noevents, nmodel)),
    ), format='csr')
    avg_pred1 = _average_saved_models('saved_models/noevents/nn', xtest_noevents, nclasses)
    print("Finished evaluating for  device id's without events,time elasped:",datetime.now()-start)

    ######################with_events###############################################
    n_events = gatest_events.shape[0]
    gatest_events['brand'] = phone['brand']
    gatest_events['model'] = phone['model']
    Xte_events_brand = _indicator_matrix(gatest_events.testrow, gatest_events.brand, (n_events, nbrand))
    Xte_events_model = _indicator_matrix(gatest_events.testrow, gatest_events.model, (n_events, nmodel))
    d = deviceapps.dropna(subset=['testrow'])
    Xte_events_app = _indicator_matrix(d.testrow, d.app, (n_events, napps))
    d = devicelabels.dropna(subset=['testrow'])
    Xte_events_labels = _indicator_matrix(d.testrow, d.label, (n_events, nlabels))

    # Attach every per-device feature to both frames: transformers are fitted
    # on the training devices and applied to the test devices.
    per_device_features = [
        ("hourjoin", hourjoin),
        ("hourbinjoin", hourbinjoin),
        ("daysjoin", daysjoin),
        ("latitude", median_lat),
        ("longitude", median_lon),
        ("locationbin", clustered_geo_features),
        ("apps_active", active_apps_events),
    ]
    for column, per_device in per_device_features:
        gatrain_events[column] = gatrain_events.index.map(per_device)
        gatest_events[column] = gatest_events.index.map(per_device)

    def _test_features(transformer, column, as_column_vector=False):
        """Fit transformer on the train column and return the transformed test column."""
        train_values = gatrain_events[column].values
        test_values = gatest_events[column].values
        if as_column_vector:
            train_values = train_values.reshape(-1, 1)
            test_values = test_values.reshape(-1, 1)
        return transformer.fit(train_values).transform(test_values)

    X_te_hourjoin_tfidf = _test_features(TfidfVectorizer(), 'hourjoin')
    X_te_hourbinjoin_onehot = _test_features(CountVectorizer(binary=True), 'hourbinjoin')
    X_te_daysjoin_tfidf = _test_features(TfidfVectorizer(), 'daysjoin')
    X_te_event_lat = _test_features(StandardScaler(), 'latitude', as_column_vector=True)
    X_te_event_lon = _test_features(StandardScaler(), 'longitude', as_column_vector=True)
    X_te_clus = _test_features(OneHotEncoder(), 'locationbin', as_column_vector=True)
    X_te_active = _test_features(TfidfVectorizer(), 'apps_active')

    # Column order must stay exactly as below: the saved models consume it positionally.
    X_test_events = hstack((Xte_events_brand, Xte_events_model, Xte_events_labels,
                            X_te_hourjoin_tfidf, X_te_hourbinjoin_onehot, X_te_daysjoin_tfidf,
                            X_te_event_lat, X_te_event_lon, Xte_events_app,
                            X_te_active, X_te_clus), format='csr')

    avg_pred2 = _average_saved_models('saved_models/events/nn1/nn1', X_test_events, nclasses)
    avg_pred3 = _average_saved_models('saved_models/events/nn2/nn2', X_test_events, nclasses)
    print("Finished evaluating for  device id's with events,time elasped:",datetime.now()-start)

    ######################Ensemble###############################################
    test1 = 1 * avg_pred1
    test2 = (0.5 * avg_pred2) + (0.5 * avg_pred3)
    # Class names come from the encoder fitted on the gatrain argument above.
    pred_1 = pd.DataFrame(test1, index=gatest_noevents.index, columns=targetencoder.classes_)
    pred_2 = pd.DataFrame(test2, index=gatest_events.index, columns=targetencoder.classes_)
    final_pred = pd.concat([pred_1, pred_2], axis=0)
    final_pred.to_csv(file, index=True)
    print("Saved predictions file in " + file)

In [28]:
# Path of the CSV that receives the ensembled test-set predictions.
file = 'results/final.csv'

# Run feature engineering, model scoring and ensembling end to end.
pipeline(
    gatrain, gatest, phone,
    app_label, label_cat,
    app_events, events,
    file,
)

Data Preparation complete,time elapsed: 0:02:22.839516
Finished evaluating for  device id's without events,time elasped: 0:03:22.465286
Finished evaluating for  device id's with events,time elasped: 0:06:34.382918
Saved predictions file inresults/final.csv
