## Practice 3 - Manual Function for One-hot encoding of text
### Strictly for internal use within Singapore Polytechnic. Do not disclose!

In [1]:
# Toy corpus: four short sentences used throughout this practice.
documents = [
    "Students are learning NLP.",
    "NLP workshop is interesting",
    "Students are studying math",
    "Math is foundation of NLP",
]

# Normalise every sentence: lowercase it, then drop full stops.
processed_docs = list(map(lambda text: text.lower().replace(".", ""), documents))
processed_docs

['students are learning nlp',
 'nlp workshop is interesting',
 'students are studying math',
 'math is foundation of nlp']

In [21]:
# Build the vocabulary: map each distinct word to an integer index,
# numbered from 0 in order of first appearance across the corpus.
vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # len(vocab) is the next unused index. setdefault() only inserts when
        # the word is new, so no separate counter is needed (the previous
        # counter was named `id`, which shadowed the builtin id()).
        vocab.setdefault(word, len(vocab))
print(vocab)

{'students': 0, 'are': 1, 'learning': 2, 'nlp': 3, 'workshop': 4, 'is': 5, 'interesting': 6, 'studying': 7, 'math': 8, 'foundation': 9, 'of': 10}


In [25]:
# Get the one-hot representation of any string based on a vocabulary.
def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of ``somestring``.

    Parameters
    ----------
    somestring : str
        Text to encode. It is split on whitespace only; no lowercasing or
        punctuation stripping is done here, so the caller must normalise the
        text the same way the vocabulary was built.
    vocabulary : dict, optional
        Mapping of word -> integer index. When omitted, the notebook-level
        ``vocab`` dictionary is used.

    Returns
    -------
    list of list of int
        One row per word, each of length ``len(vocabulary)``. A word found in
        the vocabulary has a single 1 at its index; a word that is not in the
        vocabulary yields a row of all zeros.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab

    size = len(vocabulary)  # loop-invariant, so computed once
    onehot_encoded = []
    for word in somestring.split():
        row = [0] * size
        index = vocabulary.get(word)
        if index is not None:
            row[index] = 1
        onehot_encoded.append(row)
    return onehot_encoded

In [26]:
# One-hot representation for a text taken from our own corpus: every word is
# in the vocabulary, so each row contains exactly one 1.
sample_doc = processed_docs[1]
print(sample_doc)
get_onehot_vector(sample_doc)

nlp workshop is interesting


[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [29]:
# One-hot representation for an arbitrary sentence, using the vocabulary built
# above. "harder" and "than" are not in that vocabulary, so their rows are all
# zeros.
get_onehot_vector("nlp is harder than math")

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]

## One-hot encoding using scikit-learn

In [48]:
# Sample sentence for the scikit-learn demo; it deliberately repeats words.
S1 = 'can i eat the pizza you can eat the pizza'
# Whitespace tokenisation: one list entry per word, duplicates kept.
values = str.split(S1)

In [55]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

print("The data: ", values)

# Label Encoding: learn the set of distinct words, then replace every word by
# its integer label (repeated words receive the same label).
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print("\nLabel Encoded:", integer_encoded)

The data:  ['can', 'i', 'eat', 'the', 'pizza', 'you', 'can', 'eat', 'the', 'pizza']

Label Encoded: [0 2 1 4 3 5 0 1 4 3]


In [58]:
# One-hot Encoding
# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the current name first and fall back
# to the old one so the cell runs on both old and new installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects a 2-D input, so the 1-D label array is reshaped into a
# single column (one row per word).
onehot_encoded = onehot_encoder.fit_transform(integer_encoded.reshape(-1, 1))
print("The data: ", values)
print("\nOne-hot Encoded:\n", onehot_encoded)

The data:  ['can', 'i', 'eat', 'the', 'pizza', 'you', 'can', 'eat', 'the', 'pizza']

One-hot Encoded:
 [[1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]]
