# Kaggle competition

In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
%pylab inline
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import subprocess

Populating the interactive namespace from numpy and matplotlib


In [2]:
def train_model(train_file, params) :
    
    err = subprocess.call('vw --oaa 550 --random_seed 7 -b 26 %s -k --cache_file cache.tmp %s \
-f kaggle_data/my_vw_model.vw' % (params, train_file) )
    
    return err

In [3]:
def get_predict(file_name) :
    err = subprocess.call('vw -i kaggle_data/my_vw_model.vw -t -d %s -p kaggle_data/my_vw_pred.csv --random_seed 7' % file_name)

    if err != 255 :
        return pd.read_csv('kaggle_data/my_vw_pred.csv', header=None)
    else :
        print ("prediction error")
        return None

In [4]:
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

# Additional features

In [5]:
def load_data(file_name, user_id=True) :
    
    site_ids = list(['site%d'%i for i in range(1,11)])
    # secs_names = list(['secs%d'%i for i in range(1,11)]) 
    # add_names = list(['year', 'month', 'd_week', 'hour', 'w_end', 'period', 'sess_dur', 'utc' ])
    add_names = list(['utc', 'w_end', 'period' ])
       
    new_names = ['sess_len'] + site_ids + add_names
    
    if (user_id) :
        new_names.append('user_id')
    
    # print (new_names)
    
    data = pd.read_csv(file_name, index_col='session_id').fillna(0)
    N,_ = data.shape
    
    for j in range (1,11) :
        data['site%d'%j] = data['site%d'%j].astype(int)
        data['time%d'%j] = pd.to_datetime( data['time%d'%j] )
        data['secs%d'%j] = data['time%d'%j].apply( lambda d: (d - datetime.datetime(2013,1,1)).total_seconds() ).astype(int) 
        
    #data['month'] =  data.time1.apply ( lambda d: d.month )
    #data['year'] =   data.time1.apply ( lambda d: d.year)
        
    data['hour']  =  data.time1.apply ( lambda d: d.hour )
    data['d_week'] = data.time1.apply ( lambda d: d.dayofweek+1 )
    data['w_end'] =  data.d_week.apply( lambda w: 1 if w==1 or w==6  else 2)
    data['period'] = data.hour.apply  ( lambda h: 1 if h==7 else 2 if h >= 8 and h <=18 else 3  )
    data['utc'] =    data.secs1.apply  ( lambda s: s/(60*60)).astype(int)
    
    # calculate durations in secs
    ses_len = np.zeros(N, dtype=int)
    # ses_dur = np.zeros(N, dtype=int)
    n_sids = 1
    for i in range(N) : #!!!!!!!!!!!!!!!!!!!!!! N
        n_sids = 10
        # start_time = data.loc[i+1,'secs1']
        # end_time = data.loc[i+1,'secs10']
        for j in range (0,20,2) :
            if data.iloc[i,j] == 0 :
                n_sids = int(j/2)
                # end_time = data.loc[i+1, 'secs%d'%(n_sids)]
                break
        ses_len[i] = n_sids
        
        # duration = end_time - start_time
        # if (duration > 0) :
        #    ses_dur[i] = duration
        # else :
        #    ses_dur[i] = 1
         
    data['sess_len'] = ses_len.astype(int)
    # data['sess_dur'] = ses_dur.astype(int)
    
    return data[new_names]

In [10]:
def prepare_vm_file_3(x, y, out_file):
    
    vm_file = open(out_file, 'w')
    
    N = x.shape[0]
    num_lines = 0
    
    for i in range(N):
        if y is not None:
            out_line = str(y[i])
        else:
            out_line = str(1)
            
        session_list = list()
        n_sids = x.iloc[i, 0]
        
        for j in range(n_sids) :
            session_list.append( str(x.iloc[i, j+1]) ) #  + ':' + str(x.iloc[i, 11+j]) )
       
        pos = 11   
        out_line = out_line +' | ' + ' '.join(session_list)
        out_line = out_line +' | ' + str(x.iloc[i, pos+0]) # utc
        out_line = out_line +' | ' + str(x.iloc[i, pos+1]) # w_end
        out_line = out_line +' | ' + str(x.iloc[i, pos+2]) # period
        
        #out_line = out_line +' | ' + str(x.iloc[i, pos+3]) # w_end
        #out_line = out_line +' | ' + str(x.iloc[i, pos+4]) # period
        #out_line = out_line +' | ' + str(x.iloc[i, pos+5]) # duration in secs
        #out_line = out_line +' | ' + str(x.iloc[i, pos+6]) # utc in hours
        
        out_line = out_line + '\n'
        
        # print (out_line)

        vm_file.write(out_line)
        num_lines += 1
    
    # print (N, num_lines)
    vm_file.close()

In [11]:
def check_score_3(train_file, test_file, y_train, y_test, 
                params_str='--loss_function logistic -l 0.8 --decay_learning_rate 1.0' ) :  
    
    err = train_model(train_file, params_str)
    
    if (err == 255) :
        print ("modeling error!")
        return -1, -1

    y_pred_train = get_predict(train_file).values.ravel()
    
    y_pred_test =  get_predict(test_file).values.ravel()
    
    return accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)

In [8]:
%%time
train_df = load_data('kaggle_data/train_sessions.csv')
train_df.to_csv('kaggle_data/train_df.csv', index_label='session_id', float_format='%d')

Wall time: 5min 29s


In [12]:
%%time
user_encoder = LabelEncoder()
train_labels = user_encoder.fit_transform(train_df['user_id']) + 1

X_train, X_valid, y_train, y_valid = train_test_split(train_df, train_labels, 
                                                      test_size=0.3, random_state=7, stratify=train_labels)

prepare_vm_file_3(X_train, y_train, 'kaggle_data/my_train.vw')
prepare_vm_file_3(X_valid, y_valid, 'kaggle_data/my_valid.vw')

Wall time: 4min 6s


In [15]:
%%time
func_list = ['logistic', 'squared', 'hinge', 'quantile']

for func in func_list :
    t0, t1 = check_score_3('kaggle_data/my_train.vw', 'kaggle_data/my_valid.vw', 
                           y_train, y_valid,
                           params_str='--passes 20 --loss_function ' + func + ' -l 0.8 --decay_learning_rate 1.0'
                          )
    print('%s train score: %.4f  test score: %.4f' % (func, t0, t1))

logistic train score: 0.8071  test score: 0.5469
squared train score: 0.7814  test score: 0.5339
hinge train score: 0.7634  test score: 0.5248
quantile train score: 0.7634  test score: 0.5248
Wall time: 9min 36s


In [16]:
%%time
l_rate = ['0.65', '0.7', '0.75']

for func in l_rate :
    t0, t1 = check_score_3('kaggle_data/my_train.vw', 'kaggle_data/my_valid.vw', 
                           y_train, y_valid,
                           params_str='--passes 20 --loss_function logistic  -l ' + func + ' --decay_learning_rate 1.0'
                          )
    print('%s train score: %.4f  test score: %.4f' % (func, t0, t1))

0.65 train score: 0.7854  test score: 0.5433
0.7 train score: 0.8028  test score: 0.5467
0.75 train score: 0.8053  test score: 0.5473
Wall time: 20min 37s


In [20]:
%%time
l_rate = ['0.85', '0.9', '0.95']

for func in l_rate :
    t0, t1 = check_score_3('kaggle_data/my_train.vw', 'kaggle_data/my_valid.vw', 
                           y_train, y_valid,
                           params_str='--passes 20 --loss_function logistic  -l ' + func + ' --decay_learning_rate 1.0'
                          )
    print('%s train score: %.4f  test score: %.4f' % (func, t0, t1))

0.85 train score: 0.8088  test score: 0.5469
0.9 train score: 0.8007  test score: 0.5458
0.95 train score: 0.7999  test score: 0.5450
Wall time: 19min 22s


In [13]:
%%time
l_passes = ['10', '15', '25']

for func in l_passes :
    t0, t1 = check_score_3('kaggle_data/my_train.vw', 'kaggle_data/my_valid.vw', 
                           y_train, y_valid,
                           params_str='--loss_function logistic  -l 0.75 --passes ' + func + ' --decay_learning_rate 1.0'
                          )
    print('%s train score: %.4f  test score: %.4f' % (func, t0, t1))

10 train score: 0.7824  test score: 0.5413
15 train score: 0.7962  test score: 0.5451
25 train score: 0.8108  test score: 0.5468
Wall time: 19min 45s


In [14]:
%%time
test_df = load_data('kaggle_data/test_sessions.csv', user_id=False)
test_df.to_csv('kaggle_data/test_df.csv', index_label='session_id', float_format='%d')

Wall time: 2min 14s


In [15]:
def prep_submission_3(params_str) :
        
    train_data = pd.read_csv('kaggle_data/train_df.csv', index_col='session_id')
    test_data = pd.read_csv('kaggle_data/test_df.csv', index_col='session_id')
    
    print (".csv loaded -> ", end=" ")
    
    user_encoder = LabelEncoder()
    train_labels = user_encoder.fit_transform(train_data['user_id']) + 1
    
    prepare_vm_file_3(train_data, train_labels, 'kaggle_data/my_full_train.vw')
    
    prepare_vm_file_3(test_data, None, 'kaggle_data/my_full_test.vw')
    
    print (".vw prepared -> ", end=" ")
    
    err = train_model('kaggle_data/my_full_train.vw', params_str)
    
    if (err == 255) :
        print ("modeling error!")
        return
    else :
        print ("model trained -> ", end=" ")
    
    y_pred = get_predict('kaggle_data/my_full_test.vw')
    
    print ("predict done -> ", end=" ")
    
    y_subm = user_encoder.inverse_transform( y_pred - 1 )
    
    write_to_submission_file(y_subm, 'kaggle_data/sokolov_submission.csv')
    
    print ("result file: kaggle_data/sokolov_submission.csv")

In [16]:
%%time
prep_submission_3( '--loss_function logistic --passes 25 -l 0.75 --decay_learning_rate 1.0' )

.csv loaded ->  .vw prepared ->  model trained ->  predict done ->  result file: kaggle_data/sokolov_submission.csv
Wall time: 14min 47s
