In [2]:
from __future__ import division, print_function
# suppress all Anaconda warnings
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import numpy as np
import itertools
from time import time
import pandas as pd
import tqdm
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from subprocess import Popen,PIPE,STDOUT
from sklearn.preprocessing import LabelEncoder

In [53]:
def train_vw_model(train_vw_file, model_filename, num_classes=10,
                   bit_precision=28,l2=1e-8, passes=3,
                   seed=17, quiet=True,add_args=None):
    """Train a one-against-all model by running the ``vw`` command-line tool.

    Parameters
    ----------
    train_vw_file : str
        Path of the training file handed to ``vw``.
    model_filename : str
        Path passed to ``-f`` (where ``vw`` stores the model).
    num_classes : int
        Value passed to ``--oaa``.
    bit_precision : int
        Value passed to ``-b``.
    l2 : float
        Value passed to ``--l2``.
    passes : int
        When greater than 1, ``-k --passes=<passes> --cache_file <cache>`` is
        appended; the cache path is the model path with its extension
        replaced by ``.cache``.
    seed : int
        Value passed to ``--random_seed``.
    quiet : bool
        Append ``--quiet`` when true.
    add_args : list of str or None
        Extra command-line fragments, joined with spaces and appended as-is.

    Returns
    -------
    The captured standard output of the ``vw`` process, as returned by
    ``Popen.communicate()``.
    """
    init_time = time()
    vw_call_string = ('vw --oaa {num_classes} {train_vw_file} ' +
                      '-f {model_filename} -b {bit_precision} --random_seed {seed}').format(
                      num_classes=num_classes, train_vw_file=train_vw_file,
                      model_filename=model_filename, bit_precision=bit_precision, seed=seed)

    if passes > 1:
        # Swap only the file extension. str.replace('.vw', '.cache') would also
        # rewrite ".vw" inside directory names, and would yield the model path
        # itself when the model file does not contain ".vw".
        cache_filename = os.path.splitext(model_filename)[0] + '.cache'
        vw_call_string += ' -k --passes={} --cache_file {}'.format(passes,
                                                                    cache_filename)
    if quiet:
        vw_call_string += ' --quiet'

    vw_call_string += " --l2 {}".format(l2)

    if add_args:
        vw_call_string += " {}".format(" ".join(add_args))

    proc = Popen(vw_call_string,stdout=PIPE,shell=True)
    out, _ = proc.communicate()
    # stderr is not piped, so communicate() always returns None for it; the
    # exit status is the only reliable success indicator.
    if proc.returncode == 0:
        print('Success. Elapsed: {} sec.'.format(round(time() - init_time, 2)))
    else:
        print('Failed.')
    return out

def test_vw_model(model_filename, test_vw_file, prediction_filename,
                  true_labels, seed=17, quiet=True):
    init_time = time()
    vw_call_string = ('vw -t -i {model_filename} {test_vw_file} ' + 
                       '-p {prediction_filename} --random_seed {seed}').format(
                       model_filename=model_filename, test_vw_file=test_vw_file, 
                       prediction_filename=prediction_filename, seed=seed)
    if quiet:
        vw_call_string += ' --quiet'
        
    proc = Popen(vw_call_string,stdout=PIPE,shell=True)
    out,err=proc.communicate()
        
    if not err: # the call resulted OK
        vw_pred = np.loadtxt(prediction_filename)
        accuracy = accuracy_score(true_labels, vw_pred)
        print("Accuracy: {}%. Elapsed: {} sec.".format(
            round(100 * accuracy, 2), 
            round(time() - init_time, 2)))
        return accuracy
    else:
        print('Failed.')

In [None]:
# List the contents of the data directory (IPython shell escape).
# NOTE(review): this path is spelled "machinleaning/ml/..." while PATH_TO_DATA
# in other cells uses "machinlearning/..." - confirm which one is intended.
! ls /tmp/working/machinleaning/ml/6/ident/data/capstone_user/

In [14]:
# Change this to your own data path
PATH_TO_DATA = '/tmp/working/machinlearning/6/ident/kaggle_data/'

In [20]:
# Read the three pickled objects (train matrix, test matrix, train target)
# from PATH_TO_DATA.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
train_matrix_path = os.path.join(PATH_TO_DATA, 'X_train_sparse.pkl')
test_matrix_path = os.path.join(PATH_TO_DATA, 'X_test_sparse.pkl')
target_path = os.path.join(PATH_TO_DATA, 'train_target.pkl')

with open(train_matrix_path, 'rb') as pkl_file:
    X_train_sparse = pickle.load(pkl_file)

with open(test_matrix_path, 'rb') as pkl_file:
    X_test_sparse = pickle.load(pkl_file)

with open(target_path, 'rb') as pkl_file:
    y = pickle.load(pkl_file)

In [None]:
import multiprocessing

def nonzero(x,mark=None):
    """Format the non-zero entries of ``x`` as one text line.

    The line has the form ``"<label> | sites <col>:<value> <col>:<value> ..."``,
    where ``<label>`` is ``mark`` or ``1`` when ``mark`` is None.

    Parameters
    ----------
    x : sparse matrix
        Presumably a single row of a scipy sparse matrix (callers pass
        ``X_sparse[i,:]``); only the column part of ``x.nonzero()`` is used.
    mark : object, optional
        Label placed at the start of the line.

    Returns
    -------
    str
    """
    column_ids = x.nonzero()[1]
    column_vals = x[:, column_ids].data

    tokens = []
    for col, val in zip(column_ids, column_vals):
        tokens.append(str(col) + ':' + str(val))
    feature_text = ' '.join(tokens)

    # Fall back to the constant label 1 when no label was supplied.
    label = 1 if mark is None else mark
    return "{0} | sites {1}".format(label, feature_text)

def sparse_matrix_to_vw(X_sparse, y=None, out_file='tmp.vw', processes=4):
    """Write every row of ``X_sparse`` to ``out_file``, one line per row.

    Each row is formatted by ``nonzero`` in a multiprocessing pool.

    Parameters
    ----------
    X_sparse : sparse matrix
        Indexed as ``X_sparse[i,:]`` for ``i`` in ``range(X_sparse.shape[0])``.
    y : indexable, optional
        When given, ``y[i]`` is passed to ``nonzero`` as the label of row ``i``;
        otherwise ``nonzero`` uses its default label.
    out_file : str
        Path of the text file to (over)write.
    processes : int
        Number of worker processes in the pool (default 4, the previously
        hard-coded value).
    """
    n_rows = X_sparse.shape[0]
    pool = multiprocessing.Pool(processes=processes)
    try:
        if y is not None:
            features = pool.starmap(
                nonzero,
                ((X_sparse[i,:], y[i]) for i in tqdm.tqdm_notebook(range(n_rows))))
        else:
            features = pool.map(
                nonzero,
                (X_sparse[i,:] for i in tqdm.tqdm_notebook(range(n_rows))))
    finally:
        # Release the worker processes even if a task raised.
        pool.close()
        pool.join()

    with open(out_file,'w') as f:
        f.writelines(line + '\n' for line in features)

In [12]:
# Encode the target as integers and add 1
# (presumably because vw --oaa expects labels starting at 1 - TODO confirm).
class_encoder = LabelEncoder().fit(y)

y_for_vw = class_encoder.transform(y) + 1

# Hold out 30% of the rows for validation, stratified by the encoded label.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, 
                                                     random_state=17, stratify=y_for_vw)

# Change this to your own data path
PATH_TO_DATA = '/tmp/working/machinlearning/6/ident/data/capstone_user/' 

# File locations for the VW inputs, models and predictions.
train_part_vw = os.path.join(PATH_TO_DATA, 'train_part.vw')
valid_vw = os.path.join(PATH_TO_DATA, 'valid.vw')
train_vw = os.path.join(PATH_TO_DATA, 'train.vw')
test_vw = os.path.join(PATH_TO_DATA, 'test.vw')
model = os.path.join(PATH_TO_DATA, 'vw_model_full.vw')
model_part = os.path.join(PATH_TO_DATA, 'vw_model_part.vw')
pred = os.path.join(PATH_TO_DATA, 'vw_pred.csv')

# NOTE: these two calls run before the definitions that follow in this cell,
# so they use whichever train_vw_model / test_vw_model is already defined.
train_vw_model(train_part_vw,model_part,num_classes=400)

test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)

def train_vw_model(train_vw_file, model_filename, num_classes=10,
                   bit_precision=28,l2=1e-8, passes=1,
                   seed=17, quiet=True, add_args=None):
    """Train a one-against-all model by running the ``vw`` command-line tool.

    This definition replaces any ``train_vw_model`` defined earlier in the
    notebook, so it accepts ``add_args`` as well; other cells call
    ``train_vw_model(..., add_args=[...])``.

    Parameters
    ----------
    train_vw_file : str
        Path of the training file handed to ``vw``.
    model_filename : str
        Path passed to ``-f`` (where ``vw`` stores the model).
    num_classes : int
        Value passed to ``--oaa``.
    bit_precision : int
        Value passed to ``-b``.
    l2 : float
        Value passed to ``--l2``.
    passes : int
        When greater than 1, ``-k --passes=<passes> --cache_file <cache>`` is
        appended; the cache path is the model path with its extension
        replaced by ``.cache``.
    seed : int
        Value passed to ``--random_seed``.
    quiet : bool
        Append ``--quiet`` when true.
    add_args : list of str or None
        Extra command-line fragments, joined with spaces and appended as-is.

    Returns
    -------
    The captured standard output of the ``vw`` process, as returned by
    ``Popen.communicate()``.
    """
    init_time = time()
    vw_call_string = ('vw --oaa {num_classes} {train_vw_file} ' +
                      '-f {model_filename} -b {bit_precision} --random_seed {seed}').format(
                      num_classes=num_classes, train_vw_file=train_vw_file,
                      model_filename=model_filename, bit_precision=bit_precision, seed=seed)

    if passes > 1:
        # Swap only the file extension. str.replace('.vw', '.cache') would also
        # rewrite ".vw" inside directory names, and would yield the model path
        # itself when the model file does not contain ".vw".
        cache_filename = os.path.splitext(model_filename)[0] + '.cache'
        vw_call_string += ' -k --passes={} --cache_file {}'.format(passes,
                                                                    cache_filename)
    if quiet:
        vw_call_string += ' --quiet'

    vw_call_string += " --l2 {}".format(l2)

    if add_args:
        vw_call_string += " {}".format(" ".join(add_args))

    proc = Popen(vw_call_string,stdout=PIPE,shell=True)
    out, _ = proc.communicate()
    # stderr is not piped, so communicate() always returns None for it; the
    # exit status is the only reliable success indicator.
    if proc.returncode == 0:
        print('Success. Elapsed: {} sec.'.format(round(time() - init_time, 2)))
    else:
        print('Failed.')
    return out

def test_vw_model(model_filename, test_vw_file, prediction_filename,
                  true_labels, seed=17, quiet=True):
    init_time = time()
    vw_call_string = ('vw -t -i {model_filename} {test_vw_file} ' + 
                       '-p {prediction_filename} --random_seed {seed}').format(
                       model_filename=model_filename, test_vw_file=test_vw_file, 
                       prediction_filename=prediction_filename, seed=seed)
    if quiet:
        vw_call_string += ' --quiet'
        
    proc = Popen(vw_call_string,stdout=PIPE,shell=True)
    out,err=proc.communicate()
        
    if not err: # the call resulted OK
        vw_pred = np.loadtxt(prediction_filename)
        print("Accuracy: {}%. Elapsed: {} sec.".format(
            round(100 * accuracy_score(true_labels, vw_pred), 2), 
            round(time() - init_time, 2)))
    else:
        print('Failed.')

In [14]:
# Encode the target as integers and add 1
# (presumably because vw --oaa expects labels starting at 1 - TODO confirm).
class_encoder = LabelEncoder().fit(y)
y_for_vw = class_encoder.transform(y) + 1

In [15]:
# Hold out 30% of the rows for validation, stratified by the encoded label;
# random_state is fixed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, 
                                                     random_state=17, stratify=y_for_vw)

In [16]:
# File locations (under PATH_TO_DATA) for the VW inputs, models and predictions.
train_part_vw = os.path.join(PATH_TO_DATA, 'train_part.vw')
valid_vw = os.path.join(PATH_TO_DATA, 'valid.vw')
train_vw = os.path.join(PATH_TO_DATA, 'train.vw')
test_vw = os.path.join(PATH_TO_DATA, 'test.vw')
model = os.path.join(PATH_TO_DATA, 'vw_model_full.vw')
model_part = os.path.join(PATH_TO_DATA, 'vw_model_part.vw')
pred = os.path.join(PATH_TO_DATA, 'vw_pred.csv')

In [19]:
# Train on the training part with l2=1e-5, then score on the validation file.
train_vw_model(train_part_vw,model_part,l2=1e-5,num_classes=400)

test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)

Success. Elapsed: 21.39 sec.
Accuracy: 4.59%. Elapsed: 1.23 sec.


In [30]:
# Draw 121 random integers in [1, 1000) and keep the distinct values.
# NOTE(review): no random seed is set in this cell, so the result changes between runs.
index = np.unique(np.random.randint(1,1000,size=121))

In [31]:
# Number of distinct values that were drawn.
len(index)

113

In [49]:
# 30 evenly spaced l2 values between 1e-12 and 1e-5.
# NOTE(review): l2_linspace is not referenced in the other visible cells.
l2_linspace = np.linspace(1e-12,1e-5,30)

In [47]:
# Train on the training part with l2=1e-8, then score on the validation file.
train_vw_model(train_part_vw,model_part,l2=1e-8,num_classes=400)
test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)

Success. Elapsed: 26.51 sec.
Accuracy: 34.94%. Elapsed: 1.16 sec.


0.34939275684744153

In [54]:
# Train with l2=1e-12 and the hinge loss, then keep the validation result in acc.
# Requires a train_vw_model that accepts add_args.
train_vw_model(train_part_vw,model_part,l2=1e-12,num_classes=400,add_args=['--loss_function hinge'])
acc=test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)

Success. Elapsed: 11.31 sec.
Accuracy: 32.53%. Elapsed: 1.09 sec.


In [58]:
# Train with l2=1e-12 and the logistic loss, then keep the validation result in acc.
# Requires a train_vw_model that accepts add_args.
train_vw_model(train_part_vw,model_part,l2=1e-12,num_classes=400,add_args=['--loss_function logistic'])
acc=test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)

Success. Elapsed: 79.51 sec.
Accuracy: 31.26%. Elapsed: 1.24 sec.


In [60]:
from itertools import product

In [66]:
# Pair each (l2, loss function) grid point with the entry of acc_list at the
# same position and print them side by side.
# NOTE: acc_list is filled by the grid-search cell that appears further down
# in the notebook, so that cell must have been run first.
# NOTE(review): the pairing is purely positional, and this grid is built with
# np.linspace(1e-12,1e-5,5); confirm it matches the grid used to fill acc_list.
it1=product(\
                                     np.linspace(1e-12,1e-5,5)\
                                     ,['squared', 'classic', 'hinge', 'logistic'])
it2=acc_list
it3=zip(it1,it2)

# product() yields (l2, loss function) in that order; unpack with matching
# names so l2_ and loss_f hold what their names say.
for (l2_, loss_f),name_ in tqdm.tqdm(it3):
    print(l2_, loss_f,name_)


0it [00:00, ?it/s][A
20it [00:00, 1892.95it/s]

1e-12 squared (9.9999999999999998e-13, 0.34952040555818958)
1e-12 classic (9.9999999999999998e-13, 0.3500127648710748)
1e-12 hinge (9.9999999999999998e-13, 0.32526715051606553)
1e-12 logistic (9.9999999999999998e-13, 0.3126299281520114)
2.50000075e-06 squared (2.5000007500000002e-06, 0.15148254859768773)
2.50000075e-06 classic (2.5000007500000002e-06, 0.16120208614464424)
2.50000075e-06 hinge (2.5000007500000002e-06, 0.32160180896458662)
2.50000075e-06 logistic (2.5000007500000002e-06, 0.024617965644261278)
5.0000005e-06 squared (5.000000500000001e-06, 0.061836682592362961)
5.0000005e-06 classic (5.000000500000001e-06, 0.056438965680732339)
5.0000005e-06 hinge (5.000000500000001e-06, 0.31547467084868158)
5.0000005e-06 logistic (5.000000500000001e-06, 0.024380903752872097)
7.50000025e-06 squared (7.5000002500000009e-06, 0.032331594879463146)
7.50000025e-06 classic (7.5000002500000009e-06, 0.029578029833327253)
7.50000025e-06 hinge (7.5000002500000009e-06, 0.30641161238557207)
7.50000025

[A

In [64]:
# Grid search: for every (l2, loss function) combination, train on the training
# part, score on the validation file, and collect the result.
# NOTE: each acc_list entry is (l2_, acc); the loss function is not stored.
# Requires a train_vw_model that accepts add_args and a test_vw_model that
# returns the accuracy.
acc_list = []
for l2_,loss_f in tqdm.tqdm_notebook(product(\
                                     np.linspace(1e-12,1e-8,5)\
                                     ,['squared', 'classic', 'hinge', 'logistic'])):
    train_vw_model(train_part_vw,model_part,l2=l2_,num_classes=400,add_args=['--loss_function {}'.format(loss_f)])
    acc=test_vw_model(model_part,valid_vw,pred,y_valid,quiet=False)
    acc_list.append((l2_,acc))

Success. Elapsed: 26.95 sec.
Accuracy: 34.95%. Elapsed: 1.13 sec.
Success. Elapsed: 18.78 sec.
Accuracy: 35.0%. Elapsed: 1.1 sec.
Success. Elapsed: 11.33 sec.
Accuracy: 32.53%. Elapsed: 1.06 sec.
Success. Elapsed: 77.69 sec.
Accuracy: 31.26%. Elapsed: 1.22 sec.
Success. Elapsed: 34.8 sec.
Accuracy: 15.15%. Elapsed: 1.21 sec.
Success. Elapsed: 25.15 sec.
Accuracy: 16.12%. Elapsed: 1.19 sec.
Success. Elapsed: 14.48 sec.
Accuracy: 32.16%. Elapsed: 1.07 sec.
Success. Elapsed: 64.23 sec.
Accuracy: 2.46%. Elapsed: 1.27 sec.
Success. Elapsed: 46.18 sec.
Accuracy: 6.18%. Elapsed: 1.24 sec.
Success. Elapsed: 38.75 sec.
Accuracy: 5.64%. Elapsed: 1.26 sec.
Success. Elapsed: 17.49 sec.
Accuracy: 31.55%. Elapsed: 1.09 sec.
Success. Elapsed: 68.35 sec.
Accuracy: 2.44%. Elapsed: 1.23 sec.
Success. Elapsed: 53.0 sec.
Accuracy: 3.23%. Elapsed: 1.21 sec.
Success. Elapsed: 47.88 sec.
Accuracy: 2.96%. Elapsed: 1.23 sec.
Success. Elapsed: 20.47 sec.
Accuracy: 30.64%. Elapsed: 1.11 sec.
Success. Elapsed: 70