In [1]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import pickle

In [2]:
import os, os.path
import operator
from collections import Counter

In [3]:
def prepare_train_set(dirname, session_length):
    """Build a session table and a site-frequency dictionary from per-user CSV logs.

    Every ``*.csv`` file found under ``dirname`` (searched recursively) is read
    as a headerless table with the columns ``usr, timestamp, site``.  The visits
    of each file are cut into consecutive, non-overlapping sessions of at most
    ``session_length`` sites; a shorter final session is right-padded with 0.

    Directories and files are visited in sorted order, so the row order and the
    ids given to equally frequent sites do not depend on the order in which the
    file system happens to list its entries.  Files that yield no rows are
    skipped, because the user id is taken from the first row of each file.

    Parameters
    ----------
    dirname : str
        Root directory that contains the per-user CSV files.
    session_length : int
        Maximum number of sites per session (must be positive).

    Returns
    -------
    sessions : pandas.DataFrame
        Columns ``s0 .. s{session_length - 1}`` hold site ids (0 = padding) and
        ``id`` holds the user id.  Exact duplicate rows are dropped, so the
        index keeps gaps where duplicates were removed.
    all_sites_dict : dict
        ``{site_name: (site_id, frequency)}``.  Ids start at 1 and follow
        descending frequency; frequencies are counted over all visits, i.e.
        before duplicate sessions are dropped.
    """
    user_ids = []
    user_sessions = []
    site_counter = Counter()
    n_sess = 0

    for root, dirs, files in os.walk(dirname):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for fname in sorted(files):
            fullpath = os.path.join(root, fname)
            if os.path.splitext(fullpath)[1] != '.csv':
                continue
            udata = pd.read_csv(fullpath, header=None, names=['usr', 'timestamp', 'site'])
            if udata.empty:
                continue  # no first row to take the user id from
            sites = list(udata['site'])
            site_counter.update(sites)
            # consecutive chunks of session_length sites; the last one may be shorter
            sess_list = [sites[i:i + session_length]
                         for i in range(0, len(sites), session_length)]
            n_sess += len(sess_list)
            user_sessions.append(sess_list)
            user_ids.append(udata.iloc[0, 0])

    # most frequent site first; sorted() is stable, so ties keep first-seen order
    all_sites_frq = sorted(site_counter.items(), key=operator.itemgetter(1), reverse=True)

    # site name -> (id, frequency); ids start from 1 because 0 is the padding value
    all_sites_dict = {site: (site_id, frq)
                      for site_id, (site, frq) in enumerate(all_sites_frq, start=1)}

    # one row per session: session_length site ids followed by the user id
    dt = np.zeros((n_sess, session_length + 1), dtype='int')
    row = 0
    for usr, sessions in zip(user_ids, user_sessions):
        for sess in sessions:
            dt[row, :len(sess)] = [all_sites_dict[site][0] for site in sess]
            dt[row, session_length] = usr
            row += 1

    cols = ['s' + str(x) for x in range(session_length)] + ['id']

    return pd.DataFrame(dt, columns=cols).drop_duplicates(), all_sites_dict
    

In [4]:
# Build the session table and the site dictionary for the 3-user toy data, 10 sites per session.
train_data_toy, site_freq_3users = prepare_train_set('capstone_websites_data/3users_toy', 10)

In [5]:
train_data_toy.head()

Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,id
0,3,2,2,10,2,1,8,6,7,9,1
1,3,1,1,1,0,0,0,0,0,0,1
2,3,2,5,5,2,0,0,0,0,0,2
3,4,1,2,1,2,1,1,6,11,4,3
4,4,1,2,0,0,0,0,0,0,0,3


In [6]:
print (type(site_freq_3users))

<class 'dict'>


In [15]:
# Persist the toy site dictionary {site: (id, frequency)} as a pickle file.
with open('capstone_websites_data/site_freq_3users.pkl', 'wb') as site_freq_3users_pkl:
    pickle.dump(site_freq_3users, site_freq_3users_pkl)

In [6]:
# The site frequencies (second element of each tuple) must match exactly; the ids may differ.
# site_freq_3users maps name -> (id, frequency), so sort its items by that value (i.e. by id);
# iterating the dict itself would only yield the site names.
sorted(site_freq_3users.items(), key=operator.itemgetter(1), reverse=False)


[('google.com', (1, 9)),
 ('oracle.com', (2, 8)),
 ('meduza.io', (3, 3)),
 ('vk.com', (4, 3)),
 ('mail.google.com', (5, 2)),
 ('football.kulichki.ru', (6, 2)),
 ('accounts.google.com', (7, 1)),
 ('plus.google.com', (8, 1)),
 ('geo.mozilla.org', (9, 1)),
 ('yandex.ru', (10, 1)),
 ('apis.google.com', (11, 1))]

In [7]:
# Count how often each site id occurs in the session columns (the user-id column is excluded).
site_ids, site_freqs = np.unique(train_data_toy.values[:, :-1], return_counts=True)
# Id 0 is the padding of short sessions: drop it explicitly instead of assuming
# that it is the most frequent value.
print (sorted(site_freqs[site_ids != 0].tolist(), reverse=True))

[9, 8, 3, 3, 2, 2, 1, 1, 1, 1, 1]


In [7]:
# Build the session table and the site dictionary for the 10-user data, 10 sites per session.
train_data_10users, site_freq_10users = prepare_train_set('capstone_websites_data/10users', 10)

In [8]:
print (type(site_freq_10users))

<class 'dict'>


In [17]:
train_data_10users.head()

Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,id
0,401,478,478,5,174,193,584,134,3,134,31
1,134,3,134,134,204,416,701,192,257,31,31
2,134,3505,221,55,55,3,55,55,416,5,31
3,293,334,898,55,4448,55,55,55,55,199,31
4,123,346,937,55,3629,259,3721,211,2372,694,31


In [9]:
# Persist the 10-user site dictionary {site: (id, frequency)} as a pickle file.
with open('capstone_websites_data/site_freq_10users.pkl', 'wb') as site_freq_10users_pkl:
    pickle.dump(site_freq_10users, site_freq_10users_pkl )

In [10]:
# Save the 10-user session table as CSV; the DataFrame index is written as the 'session_id' column.
train_data_10users.to_csv('capstone_websites_data/train_data_10users.csv', 
                        index_label='session_id', float_format='%d')

In [71]:
len(train_data_10users)


13084

In [60]:
len(site_freq_10users)


4913

In [11]:
%%time 
# Build the session table and the site dictionary for the 150-user data, 10 sites per session.
train_data_150users, site_freq_150users = prepare_train_set('capstone_websites_data/150users', 10)

Wall time: 2.72 s


In [12]:
# Persist the 150-user site dictionary {site: (id, frequency)} as a pickle file.
with open('capstone_websites_data/site_freq_150users.pkl', 'wb') as site_freq_150users_pkl:
    pickle.dump(site_freq_150users, site_freq_150users_pkl )

In [19]:
print ( len(train_data_150users) )


130786


In [20]:
print (len(site_freq_150users))


27797


In [30]:
# site_freq_150users maps name -> (id, frequency) and prepare_train_set assigns ids
# in order of descending frequency, so the ten smallest ids are the ten most popular
# sites.  (A dict cannot be sliced directly, hence the explicit sort by id.)
top10_popular = sorted(site_freq_150users, key=lambda site: site_freq_150users[site][0])[:10]
answ = ' '.join(top10_popular)
print (answ)

www.google.fr www.google.com www.facebook.com apis.google.com s.youtube.com clients1.google.com mail.google.com plus.google.com safebrowsing-cache.google.com www.youtube.com


In [21]:
# Export both session tables as CSV; the DataFrame index is written as the 'session_id' column.
for frame, csv_path in [
        (train_data_10users, 'capstone_websites_data/train_data_10users.csv'),
        (train_data_150users, 'capstone_websites_data/train_data_150users.csv')]:
    frame.to_csv(csv_path, index_label='session_id', float_format='%d')
# The tables can be loaded back with pd.read_csv(csv_path, index_col='session_id').

In [6]:
def sparse_array(sess, maxlen):
    """Turn a 2-D array of site ids into a sparse "one entry per visit" matrix.

    The array is flattened row by row and every value is used as a column
    index with a stored value of 1; each run of ``maxlen`` consecutive values
    forms one matrix row.  Repeated ids inside a row are kept as separate
    stored entries (they add up when the matrix is densified).  Column 0 is
    sliced off before returning, so column ``j`` of the result corresponds to
    the value ``j + 1`` in ``sess``.

    Parameters
    ----------
    sess : numpy.ndarray
        Integer array of column indices, one session per row.
    maxlen : int
        Number of values per row of ``sess``.

    Returns
    -------
    scipy.sparse CSR matrix of integer dtype.
    """
    flat_ids = sess.ravel()
    total = len(flat_ids)
    ones = np.ones(total, dtype=int)
    # row i owns the stored entries [i * maxlen, (i + 1) * maxlen)
    row_starts = np.arange(0, total + 1, maxlen, dtype=int)

    with_padding = csr_matrix((ones, flat_ids, row_starts), dtype=int)
    return with_padding[:, 1:]

In [8]:
# Sparse matrix for the toy data: all columns except the last one (the user id) are site ids.
X_sparse_3users =  sparse_array(train_data_toy.iloc[:, :-1].values, 10)

In [10]:
train_data_toy.head()

Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,id
0,3,2,2,10,2,1,8,6,7,9,1
1,3,1,1,1,0,0,0,0,0,0,1
2,3,2,5,5,2,0,0,0,0,0,2
3,4,1,2,1,2,1,1,6,11,4,3
4,4,1,2,0,0,0,0,0,0,0,3


In [9]:
print (X_sparse_3users)

  (0, 2)	1
  (0, 1)	1
  (0, 1)	1
  (0, 9)	1
  (0, 1)	1
  (0, 0)	1
  (0, 7)	1
  (0, 5)	1
  (0, 6)	1
  (0, 8)	1
  (1, 2)	1
  (1, 0)	1
  (1, 0)	1
  (1, 0)	1
  (2, 2)	1
  (2, 1)	1
  (2, 4)	1
  (2, 4)	1
  (2, 1)	1
  (3, 3)	1
  (3, 0)	1
  (3, 1)	1
  (3, 0)	1
  (3, 1)	1
  (3, 0)	1
  (3, 0)	1
  (3, 5)	1
  (3, 10)	1
  (3, 3)	1
  (4, 3)	1
  (4, 0)	1
  (4, 1)	1


In [53]:
def sparse_matrix_to_vw(csr, y=None, out_file='tmp.vw'):
    """Write the rows of a CSR matrix to ``out_file`` as ``label |text ids`` lines.

    Each matrix row becomes one line ``<label> |text <id> <id> ...``.  The ids
    are the row's stored column indices plus 1, written in stored order with
    one token per stored entry; the values in ``csr.data`` are not consulted.
    The ``+ 1`` presumably undoes the column shift introduced by
    ``sparse_array`` (which drops column 0), so that tokens equal site ids.

    Parameters
    ----------
    csr : scipy.sparse CSR matrix
        Only its ``indices`` and ``indptr`` arrays are read.
    y : sequence or None
        Optional labels indexed by row number.  A missing or falsy label
        (``None``, ``0``) is written as ``0``.
    out_file : str
        Path of the text file to create; an existing file is overwritten.

    Returns
    -------
    None
    """
    col_index = csr.indices
    row_point = csr.indptr

    with open(out_file, 'w') as vw_file:
        for i in range(len(row_point) - 1):
            # stored column indices of row i, shifted by one
            tokens = [str(site) for site in col_index[row_point[i]:row_point[i + 1]] + 1]
            label = y[i] if y is not None else None
            vw_file.write(str(label or '0') + ' |text ' + ' '.join(tokens) + '\n')

In [54]:
# Run the converter on the toy data: the sparse session matrix as features,
# the user ids (last column) as labels.
sparse_matrix_to_vw(sparse_array(train_data_toy.iloc[:, :-1].values, 10), train_data_toy.iloc[:, -1].values )

1 |text 3 2 2 10 2 1 8 6 7 9

1 |text 3 1 1 1

2 |text 3 2 5 5 2

3 |text 4 1 2 1 2 1 1 6 11 4

3 |text 4 1 2



In [52]:
# Split each session table into the site-id columns (features) and the
# trailing user-id column (target).
X_10users = train_data_10users.iloc[:, :-1].values
y_10users = train_data_10users.iloc[:, -1].values

X_150users = train_data_150users.iloc[:, :-1].values
y_150users = train_data_150users.iloc[:, -1].values

In [53]:
# Sparse matrix for the 10-user sessions (10 site ids per row).
X_sparse_10users  = sparse_array( X_10users, 10)

In [54]:
X_sparse_10users.shape

(13084, 4913)

In [55]:
# Sparse matrix for the 150-user sessions (10 site ids per row).
X_sparse_150users = sparse_array( X_150users, 10)

In [56]:
X_sparse_150users.shape

(130786, 27797)

In [46]:
# Persist the sparse matrices, the target vectors and the site dictionaries,
# one pickle file per object.
pickle_targets = [
    ('capstone_websites_data/X_sparse_10users.pkl', X_sparse_10users),
    ('capstone_websites_data/y_10users.pkl', y_10users),
    ('capstone_websites_data/X_sparse_150users.pkl', X_sparse_150users),
    ('capstone_websites_data/y_150users.pkl', y_150users),
    ('capstone_websites_data/site_freq_3users.pkl', site_freq_3users),
    ('capstone_websites_data/site_freq_10users.pkl', site_freq_10users),
    ('capstone_websites_data/site_freq_150users.pkl', site_freq_150users),
]
for pkl_path, obj in pickle_targets:
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(obj, pkl_file)

In [58]:
# Sanity check: one matrix column per site in the dictionary
# (sparse_array drops column 0, which is the padding id).
assert X_sparse_10users.shape[1] == len(site_freq_10users)

In [59]:
# Sanity check: one matrix column per site in the dictionary
# (sparse_array drops column 0, which is the padding id).
assert X_sparse_150users.shape[1] == len(site_freq_150users)