In [1]:
import numpy as np

## Tokenizer

### NLTK word tokenize, sentence -> list of words

In [2]:
import nltk

In [3]:
# Split a plain-English sentence into a list of tokens with NLTK's word
# tokenizer and print the resulting list.
sentence = "The quick brown fox jumps over the lazy dog."
words = nltk.word_tokenize(sentence)
print(words)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


### Examples

#### SQL text

In [4]:
# The same tokenizer applied to a multi-line SQL statement; the statement is
# assembled from one string per clause, joined with newlines.
sql_text = "\n".join([
    "SELECT LAT_N, CITY, TEMP_F",
    "FROM STATS, STATION",
    "WHERE MONTH = 7",
    "AND STATS.ID = STATION.ID",
    "ORDER BY TEMP_F;",
])

words = nltk.word_tokenize(sql_text)
print(words)

['SELECT', 'LAT_N', ',', 'CITY', ',', 'TEMP_F', 'FROM', 'STATS', ',', 'STATION', 'WHERE', 'MONTH', '=', '7', 'AND', 'STATS.ID', '=', 'STATION.ID', 'ORDER', 'BY', 'TEMP_F', ';']


## Encoding

### Label encoding

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Learn the label vocabulary from integer labels (fit() returns the encoder).
le = LabelEncoder().fit([1, 2, 2, 6])
print("fit([1 2 2 6])")
print("classes:", le.classes_)

# Forward direction: labels -> integer codes.
trans_res = le.transform([1, 1, 2, 6])
print("transform([1 1 2 6])", trans_res)

# Backward direction: integer codes -> original labels.
inv_trans_res = le.inverse_transform([0, 0, 1, 2])
print("inverse_transform([0 0 1 2])", inv_trans_res)

fit([1 2 2 6])
classes: [1 2 6]
transform([1 1 2 6]) [0 0 1 2]
inverse_transform([0 0 1 2]) [1 1 2 6]


#### String

In [7]:
# Tokens of the example sentence, used here as string labels.
sentence = [
    'The', 'quick', 'brown', 'fox', 'jumps',
    'over', 'the', 'lazy', 'dog', '.',
]

# Learn the vocabulary from the tokens (fit() returns the encoder).
le = LabelEncoder().fit(sentence)
print("fit(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'])")
print("classes:", le.classes_)

# Forward direction: tokens -> integer codes.
trans_res = le.transform(['quick', 'brown', 'fox'])
print("transform(['quick', 'brown', 'fox'])", trans_res)

# Backward direction: integer codes -> tokens.
inv_trans_res = le.inverse_transform([8, 2, 4])
print("inverse_transform([8 2 4])", inv_trans_res)

fit(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'])
classes: ['.' 'The' 'brown' 'dog' 'fox' 'jumps' 'lazy' 'over' 'quick' 'the']
transform(['quick', 'brown', 'fox']) [8 2 4]
inverse_transform([8 2 4]) ['quick' 'brown' 'fox']


#### Missing values

Transforming a label that was not seen during `fit` raises `ValueError: y contains previously unseen labels: [3]`.

In [8]:
le = LabelEncoder()

le.fit([1, 2, 2, 6])

# The label 3 was not among the fitted labels, so transform() rejects it.
# The error is the point of this cell: it is caught and printed so that the
# notebook still runs from top to bottom.
try:
    trans_res = le.transform([1, 1, 3, 6])
except ValueError as err:
    print("ValueError:", err)

LabelEncoder()

### Incremental label encoding

In [9]:
import numpy as np

class IncrementalLabelEncoder:
    """Label encoder whose vocabulary can grow over successive batches.

    Every label that has not been seen before receives the next free integer
    code, so codes handed out by earlier ``partial_fit`` calls never change.
    """

    def __init__(self):
        # Maps label -> integer code, in order of first assignment.
        self.dict = {}

    @property
    def classes_(self):
        """Known labels as a list, ordered by the code assigned to them."""
        return list(self.dict.keys())

    def partial_fit(self, y):
        """Add the labels of ``y`` that are not yet known to the vocabulary.

        New labels within one batch are numbered in the sorted order produced
        by ``np.unique``; labels that are already known keep their code.

        Args:
            y: Array-like of labels accepted by ``np.unique``.

        Returns:
            The encoder itself, so that calls can be chained.
        """
        # .tolist() turns NumPy scalars (np.str_, np.int64, ...) into native
        # Python objects, so the dictionary keys and ``classes_`` do not leak
        # NumPy types. Native and NumPy scalars hash and compare equal, so
        # lookups with either kind keep working.
        for elem in np.unique(y).tolist():
            if elem not in self.dict:
                self.dict[elem] = len(self.dict)
        return self

    def transform(self, y):
        """Map every label in ``y`` to its integer code.

        Args:
            y: Iterable of labels.

        Returns:
            List of integer codes, in the order of ``y``.

        Raises:
            KeyError: If ``y`` contains labels that were never passed to
                ``partial_fit``; the message lists all of them.
        """
        # Materialise once so that generators can be scanned a second time
        # when building the error message.
        labels = list(y)
        try:
            return [self.dict[elem] for elem in labels]
        except KeyError:
            unseen = list(dict.fromkeys(
                elem for elem in labels if elem not in self.dict
            ))
            raise KeyError(
                f"y contains previously unseen labels: {unseen}"
            ) from None

    def inverse_transform(self, y):
        """Map every integer code in ``y`` back to its label.

        Args:
            y: Iterable of integer codes produced by ``transform``.

        Returns:
            List of labels, in the order of ``y``.

        Raises:
            KeyError: If ``y`` contains codes that were never assigned; the
                message lists all of them.
        """
        code_to_label = {code: label for label, code in self.dict.items()}
        codes = list(y)
        try:
            return [code_to_label[code] for code in codes]
        except KeyError:
            unknown = list(dict.fromkeys(
                code for code in codes if code not in code_to_label
            ))
            raise KeyError(f"y contains unknown codes: {unknown}") from None

In [10]:
sentences = [
    'The quick brown fox jumps over the lazy dog.',
    'The lazy dog was caught on camera.',
]

# One token list per sentence.
sentence_words = list(map(nltk.word_tokenize, sentences))

ile = IncrementalLabelEncoder()

# Feed the sentences one at a time; the vocabulary grows with each batch.
for words in sentence_words:
    ile.partial_fit(words)

print("classes:", ile.classes_)

trans_res = ile.transform(['quick', 'brown', 'fox'])
print("transform(['quick', 'brown', 'fox'])", trans_res)

classes: ['.', 'The', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'over', 'quick', 'the', 'camera', 'caught', 'on', 'was']
transform(['quick', 'brown', 'fox']) [8, 2, 4]


### One-hot encoding

#### Sklearn

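`OneHotEncoder` should be used together with `LabelEncoder`: the features are label-encoded first, and the resulting integer codes are then one-hot encoded.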

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
X = np.arange(15).reshape(3, 5)
print("X\n", X)

# Label-encode each column independently. fit_transform() refits the encoder
# on every call, so afterwards `le` only remembers the last column.
le = LabelEncoder()
X_le = np.apply_along_axis(le.fit_transform, 0, X)
print("X label encoded\n", X_le)

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current
# keyword first and fall back to the legacy one on older releases, where the
# unknown keyword raises TypeError.
try:
    oe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    oe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_oe = oe.fit_transform(X_le)
print("X one-hot encoded\n", X_oe)

X
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
X label encoded
 [[0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]]
X one-hot encoded
 [[1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]]


### Random vector encoding

#### Label to random vector map

In [15]:
import numpy as np

def init_label_rand_vec_map(length, vec_dim, rng=None):
    """Create a table of random vectors with one row per label.

    Args:
        length: Number of rows (labels) in the table.
        vec_dim: Number of components in each vector.
        rng: Optional source of randomness for reproducible tables. Either an
            integer seed or a ``numpy.random.Generator``. When omitted, the
            values come from NumPy's global random state, exactly as before
            this parameter existed.

    Returns:
        Float array of shape ``(length, vec_dim)`` with values drawn
        uniformly from ``[0, 1)``.
    """
    shape = (length, vec_dim)
    if rng is None:
        # Legacy path: honours np.random.seed() set by the caller.
        return np.random.random(shape)
    # default_rng() passes an existing Generator through unchanged and turns
    # an integer seed into a fresh Generator.
    return np.random.default_rng(rng).random(shape)