# Make computers understand text data by representing text by numbers

In [41]:
import numpy as np
import pandas as pd

#Index-Based Encoding

In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [43]:
corpus = ["i cant wait to get out of lockdown", "the uk is soon going to be free soon", "linkedin is social media"]

In [44]:
token_docs = [doc.split() for doc in corpus]
all_tokens = set([word for sentence in token_docs for word in sentence])
word_to_idx = {token:idx+1 for idx, token in enumerate(all_tokens)}


In [45]:
word_to_idx

{'be': 3,
 'cant': 9,
 'free': 15,
 'get': 6,
 'going': 2,
 'i': 1,
 'is': 7,
 'linkedin': 10,
 'lockdown': 8,
 'media': 12,
 'of': 14,
 'out': 18,
 'social': 11,
 'soon': 5,
 'the': 13,
 'to': 16,
 'uk': 17,
 'wait': 4}

In [46]:
X = np.array([[word_to_idx[token] for token in token_doc] for token_doc in token_docs], dtype=object)

In [47]:
X_padded = pad_sequences(X, padding="post")
X_padded

array([[ 1,  9,  4, 16,  6, 18, 14,  8,  0],
       [13, 17,  7,  5,  2, 16,  3, 15,  5],
       [10,  7, 11, 12,  0,  0,  0,  0,  0]], dtype=int32)

In [48]:
X_df = pd.DataFrame(X_padded) 

In [49]:
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,9,4,16,6,18,14,8,0
1,13,17,7,5,2,16,3,15,5
2,10,7,11,12,0,0,0,0,0


#One-Hot Encoding

In [50]:
from sklearn.preprocessing import OneHotEncoder

In [51]:
X_padded

array([[ 1,  9,  4, 16,  6, 18, 14,  8,  0],
       [13, 17,  7,  5,  2, 16,  3, 15,  5],
       [10,  7, 11, 12,  0,  0,  0,  0,  0]], dtype=int32)

In [52]:
token_ids = X_padded.reshape(-1, 1)

In [53]:
one_hot = OneHotEncoder()
X = one_hot.fit_transform(token_ids)

In [54]:
X_df = pd.DataFrame(X.toarray())
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Bag Of Words (BOW)

In [55]:
from sklearn.feature_extraction.text import CountVectorizer


In [56]:
bow = CountVectorizer()
bow.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [57]:
X = bow.transform(corpus)
X_df = pd.DataFrame(X.toarray(), columns=sorted(bow.vocabulary_))
X_df

Unnamed: 0,be,cant,free,get,going,is,linkedin,lockdown,media,of,out,social,soon,the,to,uk,wait
0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,1,0,1
1,1,0,1,0,1,1,0,0,0,0,0,0,2,1,1,1,0
2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0


#Term Frequency — Inverse Document Frequency (TF-IDF)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
tfidf = TfidfVectorizer()
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [60]:
X = tfidf.transform(corpus)
X_df = pd.DataFrame(X.toarray(), columns=sorted(tfidf.vocabulary_))
X_df

Unnamed: 0,be,cant,free,get,going,is,linkedin,lockdown,media,of,out,social,soon,the,to,uk,wait
0,0.0,0.389888,0.0,0.389888,0.0,0.0,0.0,0.389888,0.0,0.389888,0.389888,0.0,0.0,0.0,0.29652,0.0,0.389888
1,0.313777,0.0,0.313777,0.0,0.313777,0.238636,0.0,0.0,0.0,0.0,0.0,0.0,0.627555,0.313777,0.238636,0.313777,0.0
2,0.0,0.0,0.0,0.0,0.0,0.40204,0.528635,0.0,0.528635,0.0,0.0,0.528635,0.0,0.0,0.0,0.0,0.0
