# One hot encoding


In [0]:
from sklearn.preprocessing import OneHotEncoder


In [0]:
data=[['b'],['a'],['c']]
#data should be a list of a list

In [0]:
enc.fit(data)
x=enc.transform(data).toarray()
print(x)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


# Bag of words

In [0]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.', 'Is this the first document?']

In [0]:
print(corpus)

['This is the first document.', 'This document is the second document.', 'And this is the third one.', 'Is this the first document?']


In [0]:
from sklearn.feature_extraction.text import CountVectorizer


In [0]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [0]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary)
print(len(vocabulary))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
9


## Difference between fit and fit_transform

In [0]:
new_vec = vectorizer.transform(['This is the last document']).toarray()
print(new_vec)
print("\n")
new_vec = vectorizer.fit_transform(['This is the last document']).toarray()
print(new_vec[0])
print(len(new_vec[0]))

[[0 1 0 1 0 0 1 0 1]]


[1 1 1 1 1]
5


## Character - level

In [0]:
vectorizer = CountVectorizer(analyzer = 'char')
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[4 1 0 0 1 1 2 1 2 3 1 1 1 1 3 4 1]
 [5 1 0 0 3 3 4 0 2 2 2 3 3 0 3 4 2]
 [5 1 0 1 0 2 2 0 3 3 0 2 1 1 2 3 0]
 [4 0 1 0 1 1 2 1 2 3 1 1 1 1 3 4 1]]


In [0]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary)
print(len(vocabulary))

[' ', '.', '?', 'a', 'c', 'd', 'e', 'f', 'h', 'i', 'm', 'n', 'o', 'r', 's', 't', 'u']
17


# TDM 

In [0]:
vectorizer = CountVectorizer(ngram_range = (1,1))
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [0]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary)
print(len(vocabulary))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
9


# TF-IDF

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
#corpus1=['This is a sample. ','This is another sample']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [0]:
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


# TDM+SVD

In [0]:
from scipy.linalg import svd

from sklearn.feature_extraction.text import CountVectorizer

In [0]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.', 'Is this the first document?']

In [0]:
vectorizer = CountVectorizer(ngram_range = (1,1))
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [0]:
U, s, VT = svd(X)

In [0]:
print(U)
print(s)
print(VT)

[[-4.97749218e-01  9.95291395e-02 -4.92280069e-01 -7.07106781e-01]
 [-5.95251327e-01  4.17856560e-01  6.86346671e-01 -5.55111512e-17]
 [-3.87514244e-01 -8.97543200e-01  2.10354260e-01 -2.77555756e-17]
 [-4.97749218e-01  9.95291395e-02 -4.92280069e-01  7.07106781e-01]]
[4.27960622 1.98443654 1.32173457 0.        ]
[[-0.09054904 -0.51079491 -0.23261449 -0.46225375 -0.09054904 -0.13909021
  -0.46225375 -0.09054904 -0.46225375]
 [-0.45229121  0.52144343  0.10030972 -0.14141463 -0.45229121  0.21056685
  -0.14141463 -0.45229121 -0.14141463]
 [ 0.15915015  0.29365443 -0.74490004 -0.06647266  0.15915015  0.51927723
  -0.06647266  0.15915015 -0.06647266]
 [-0.43230491 -0.20809863 -0.21096578  0.8063548   0.00662025 -0.00286714
  -0.1936452   0.00662025 -0.1936452 ]
 [-0.3157711   0.17153177  0.23225887 -0.13459688  0.85978087  0.0607271
  -0.13459688 -0.14021913 -0.13459688]
 [ 0.04265433 -0.4383959   0.37355634  0.02161318 -0.05374694  0.81195224
   0.02161318 -0.05374694  0.02161318]
 [-0.432

# TDM+Non-negative matrix factorization (NMF)

In [0]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.', 'Is this the first document?']

In [0]:
vectorizer = CountVectorizer(ngram_range = (1,1))
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [0]:
model = NMF(n_components=2, init='random', random_state=0)

In [0]:
W = model.fit_transform(X)
H = model.components_

In [0]:
print(W)

[[0.5107616  0.12369935]
 [0.67199958 0.        ]
 [0.         1.43183731]
 [0.5107616  0.12369935]]


In [0]:
print(H)

[[0.         2.43031889 1.04228305 1.6481337  0.         0.69040703
  1.6481337  0.         1.6481337 ]
 [0.68813151 0.         0.05560176 0.7069408  0.68813151 0.
  0.7069408  0.68813151 0.7069408 ]]


# TF-IDF+SVD

In [0]:
from scipy.linalg import svd
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.', 'Is this the first document?']

In [0]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [0]:
U, s, VT = svd(X)

In [0]:
print(U)
print(s)
print(VT)

[[-5.85321573e-01  1.45097929e-01 -3.69249573e-01 -7.07106781e-01]
 [-4.86814567e-01  2.00045615e-01  8.50290144e-01  5.55111512e-17]
 [-2.78942449e-01 -9.58057897e-01  6.56976138e-02 -6.93889390e-17]
 [-5.85321573e-01  1.45097929e-01 -3.69249573e-01  7.07106781e-01]]
[1.63851261e+00 9.27224412e-01 6.74930598e-01 1.54149002e-16]
[[-0.08713773 -0.5399429  -0.41458797 -0.4033972  -0.08713773 -0.16003631
  -0.4033972  -0.08713773 -0.4033972 ]
 [-0.52886928  0.29538437  0.18161358 -0.09513381 -0.52886928  0.11621145
  -0.09513381 -0.52886928 -0.09513381]
 [ 0.04982324  0.35224219 -0.63494022 -0.04013985  0.04982324  0.67859831
  -0.04013985  0.04982324 -0.04013985]
 [ 0.83081502  0.14524846  0.19854795 -0.15921035 -0.29078366  0.06382728
  -0.15921035 -0.29078366 -0.15921035]
 [-0.0931515   0.20959157  0.31005025 -0.42922999  0.73568913  0.1106676
  -0.14778122 -0.26431087 -0.14778122]
 [-0.02742269 -0.60413046  0.38212981  0.11772467 -0.02845521  0.68688418
   0.02194105 -0.02845521  0.02

# TF-IDF+NMF

In [0]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
corpus = ['This is the first document.','This document is the second document.','And this is the third one.', 'Is this the first document?']

In [0]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [0]:
model = NMF(n_components=2, init='random', random_state=0)

In [0]:
W = model.fit_transform(X)
H = model.components_

In [0]:
print(W)

[[0.00796847 0.80665869]
 [0.         0.68047828]
 [0.59254741 0.        ]
 [0.00796847 0.80665869]]


In [0]:
print(H)

[[0.86349791 0.         0.00691154 0.45133488 0.86349791 0.
  0.45133488 0.86349791 0.45133488]
 [0.         0.69474199 0.53053238 0.45630378 0.         0.20773533
  0.45630378 0.         0.45630378]]


 # N-gram

In [0]:
vectorizer = CountVectorizer(ngram_range = (1,2))
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]
 [0 0 2 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]
 [0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]


In [0]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary)
print(len(vocabulary))

['and', 'and this', 'document', 'document is', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second document', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this document', 'this is', 'this the']
22


## char-level

In [0]:
vectorizer = CountVectorizer(ngram_range = (1,2), analyzer = 'char')
X = vectorizer.fit_transform(corpus).toarray()
print(X)

[[4 1 1 1 0 0 1 1 0 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1 2 1 1 3 1 2 1 1 1 0 0 1
  1 1 0 1 0 1 3 2 0 1 4 1 1 0 2 1 1]
 [5 2 0 1 0 1 1 1 0 0 0 3 1 2 3 1 2 4 1 0 1 2 0 0 2 1 1 2 0 2 2 2 3 1 0 2
  3 2 1 0 0 0 3 2 1 0 4 1 1 0 2 2 2]
 [5 0 0 1 1 0 3 1 0 1 1 0 0 0 2 2 0 2 1 1 0 0 0 0 3 1 2 3 1 2 0 0 2 1 1 0
  1 0 1 1 1 0 2 2 0 0 3 0 0 0 3 0 0]
 [4 1 1 0 0 0 2 0 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1 2 1 1 3 1 2 1 1 1 0 0 1
  1 1 0 1 0 1 3 2 0 1 4 1 0 1 2 1 1]]


In [0]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary)
print(len(vocabulary))

[' ', ' d', ' f', ' i', ' o', ' s', ' t', '.', '?', 'a', 'an', 'c', 'co', 'cu', 'd', 'd ', 'do', 'e', 'e ', 'e.', 'ec', 'en', 'f', 'fi', 'h', 'he', 'hi', 'i', 'ir', 'is', 'm', 'me', 'n', 'nd', 'ne', 'nt', 'o', 'oc', 'on', 'r', 'rd', 'rs', 's', 's ', 'se', 'st', 't', 't ', 't.', 't?', 'th', 'u', 'um']
53


# word2vec

In [0]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [0]:
data = 'This is the first document. This is the second second document. And the third one. Is this the first document?'

In [0]:
tokenized_data = []
for d in data:
    tokenized_data.append(word_tokenize(data))

In [0]:
model = Word2Vec(tokenized_data, size=5, window=5, min_count=1, workers=4)

In [0]:
model.wv.vocab

{'.': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a828>,
 '?': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a9b0>,
 'And': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a898>,
 'Is': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a940>,
 'This': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a6d8>,
 'document': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a710>,
 'first': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a780>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a7b8>,
 'one': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a908>,
 'second': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a860>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a748>,
 'third': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a8d0>,
 'this': <gensim.models.keyedvectors.Vocab at 0x7f3359f6a978>}

In [0]:
model.wv["This"]

array([ 0.4664309 ,  0.61515117, -0.16935559,  0.87751323, -0.54263115],
      dtype=float32)

In [0]:
model.wv["this is"]
#model.infer_vector(["this is"])

KeyError: ignored

In [0]:
model.wv["Th"]

KeyError: ignored

# Fasttext

In [0]:
from gensim.models import FastText

In [0]:
model = FastText(size=4, window=3, min_count=1) 
model.build_vocab(sentences=tokenized_data)
model.train(sentences=tokenized_data, total_examples=len(tokenized_data), epochs=10)

In [0]:
model.wv["This"]

array([-0.09336837,  1.106698  , -0.9921883 ,  0.3000509 ], dtype=float32)

In [0]:
model.wv["This is"]

array([-0.11472034,  1.1140894 , -0.97730035,  0.22226869], dtype=float32)

In [0]:
model.wv["Th"]

array([-0.1979229 ,  0.7348152 , -0.7987169 ,  0.00813393], dtype=float32)

# Doc2vec

In [0]:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
data = ['This is the first document.',' This is the second second document.','And the third one.',' Is this the first document?']

In [0]:
corpus = []
for i in range(len(data)):
    corpus.append(word_tokenize(data[i]))

In [0]:
corpus

[['This', 'is', 'the', 'first', 'document', '.'],
 ['This', 'is', 'the', 'second', 'second', 'document', '.'],
 ['And', 'the', 'third', 'one', '.'],
 ['Is', 'this', 'the', 'first', 'document', '?']]

In [0]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [0]:
model.infer_vector(["this is"])

array([0.0348769 , 0.07072723, 0.08233352, 0.0162783 , 0.04638267],
      dtype=float32)

In [0]:
model.infer_vector(["this"])

array([-0.06487391, -0.03116014,  0.09166626,  0.05343613,  0.06687652],
      dtype=float32)

In [0]:
model.wv.vocab

{'.': <gensim.models.keyedvectors.Vocab at 0x7f335c5edd68>,
 '?': <gensim.models.keyedvectors.Vocab at 0x7f335c5edef0>,
 'And': <gensim.models.keyedvectors.Vocab at 0x7f335c5eddd8>,
 'Is': <gensim.models.keyedvectors.Vocab at 0x7f335c5ede80>,
 'This': <gensim.models.keyedvectors.Vocab at 0x7f335c5edc50>,
 'document': <gensim.models.keyedvectors.Vocab at 0x7f335c5edd30>,
 'first': <gensim.models.keyedvectors.Vocab at 0x7f335c5edcf8>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7f335c5edc88>,
 'one': <gensim.models.keyedvectors.Vocab at 0x7f335c5ede48>,
 'second': <gensim.models.keyedvectors.Vocab at 0x7f335c5edda0>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f335c5edcc0>,
 'third': <gensim.models.keyedvectors.Vocab at 0x7f335c5ede10>,
 'this': <gensim.models.keyedvectors.Vocab at 0x7f335c5edeb8>}