### Example 5-3. Feature hashing for word features

In [20]:
word_list = [100,1000,120]

In [21]:
def hash_features(word_list, m):
    output = [0] * m
    for word in word_list:
        index = int(word) % m
        output[index] += 1
    return output

In [29]:
100 % 17

15

In [30]:
1000 % 17

14

In [31]:
120 % 17

1

In [28]:
hash_features(word_list, 17)

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]

In [32]:
hash_features(word_list, 5)

[3, 0, 0, 0, 0]

### Example 5-4. Signed feature hashing

In [38]:
word_list = [100,100,120,111,157]

In [39]:
def hash_features(word_list, m):
    output = [0] * m
    for word in word_list:
        index = int(word) % m
        sign_bit = int(word) % 2
        if (sign_bit == 0):
            output[index] -= 1
        else:
            output[index] += 1
    return output

In [41]:
hash_features(word_list, 4)

[-3, 1, 0, 1]

### Example 5-5. Feature hashing (a.k.a. “the hashing trick”)

In [1]:
import pandas as pd
import json

In [2]:
f = open('/Users/liouscott/Documents/scott/cathay-ml/cathay-dsml-data/20190506/yelp_academic_dataset_review.json')

In [3]:
js = []
for i in range(10000):
    js.append(json.loads(f.readline()))
f.close()
review_df = pd.DataFrame(js)

In [4]:
review_df

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
1,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
3,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
4,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ
5,eU_713ec6fTGNO4BegRaww,0,2013-01-20 13:25:59,0,fdiNeiN_hoCxCMy2wTRW9g,4.0,I'll be the first to admit that I was not exci...,0,w31MKYsNFMrjhWxxAb5wIw
6,3fw2X5bZYeW9xCz_zGhOHg,5,2016-05-07 01:21:02,4,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5,jlu4CztcSxrKx56ba1a5AQ
7,zvO-PJCpNk4fgAVUnExYAA,1,2010-10-05 19:12:35,1,8e9HxxLjjqc9ez5ezzN7iQ,1.0,This place has gone down hill. Clearly they h...,3,d6xvYpyzcfbF_AZ8vMB7QA
8,b2jN2mm9Wf3RcrZCgfo1cg,0,2015-01-18 14:04:18,0,qrffudO73zsslZbe8B9D3Q,2.0,I was really looking forward to visiting after...,1,sG_h0dIzTKWa3Q6fmb4u-g
9,oxwGyA17NL6c5t1Etg5WgQ,1,2012-02-29 21:52:43,0,RS_GTIT6836bCaPy637kNQ,3.0,It's a giant Best Buy with 66 registers. I do...,1,nMeCE5-xsdleyxYuNZ_7rA


In [5]:
m = len(review_df.business_id.unique())

In [6]:
m

4618

In [7]:
from sklearn.feature_extraction import FeatureHasher

In [8]:
h = FeatureHasher(n_features=m, input_type='string')

In [9]:
f = h.transform(review_df['business_id'])

In [10]:
review_df['business_id'].unique().tolist()[0:5]

['ujmEBvifdJM6h6RLv4wQIg',
 'NZnhc2sEQy3RmzKTZnqtwQ',
 'WTqjgwHlXbSFevF32_DJVw',
 'ikCg8xy5JIg_NGPx-MSIDA',
 'b1b1eb3uo-w561D0ZfCEiQ']

In [11]:
f.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
from sys import getsizeof

In [13]:
print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))

Our pandas Series, in bytes:  790104


In [14]:
print('Our hashed numpy array, in bytes: ', getsizeof(f))

Our hashed numpy array, in bytes:  56


In [19]:
len(f.toarray()[1].tolist())

4618