# Representations

Adapted from Blueprints for Text Analytics

In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
sentences = ["It was the best of times,", 
             "it was the worst of times,", 
             "it was the age of wisdom,", 
             "it was the age of foolishness."]

In [3]:
tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

In [4]:
print (tokenized_sentences)

[['It', 'was', 'the', 'best', 'of', 'times,'], ['it', 'was', 'the', 'worst', 'of', 'times,'], ['it', 'was', 'the', 'age', 'of', 'wisdom,'], ['it', 'was', 'the', 'age', 'of', 'foolishness.']]


In [5]:
vocabulary = set([w for s in tokenized_sentences for w in s])

In [6]:
print (vocabulary)

{'it', 'the', 'best', 'It', 'age', 'of', 'wisdom,', 'foolishness.', 'times,', 'was', 'worst'}


In [7]:
# Display the "dictionary"
[[w, i] for i,w in enumerate(vocabulary)]

[['it', 0],
 ['the', 1],
 ['best', 2],
 ['It', 3],
 ['age', 4],
 ['of', 5],
 ['wisdom,', 6],
 ['foolishness.', 7],
 ['times,', 8],
 ['was', 9],
 ['worst', 10]]

## Problems with the initial tokenization
- capitalization
- periods and commas

In [8]:
matches = re.findall(r'[A-Za-z]+', "It was the best of times,")
print(matches)

['It', 'was', 'the', 'best', 'of', 'times']


In [9]:
tokenized_sentences = [re.findall(r'[A-Za-z]+',sentence) for sentence in sentences]

In [10]:
print (tokenized_sentences)

[['It', 'was', 'the', 'best', 'of', 'times'], ['it', 'was', 'the', 'worst', 'of', 'times'], ['it', 'was', 'the', 'age', 'of', 'wisdom'], ['it', 'was', 'the', 'age', 'of', 'foolishness']]


In [11]:
tokenized_sentences = [re.findall(r'\w+',sentence.lower()) for sentence in sentences]
print (tokenized_sentences)

[['it', 'was', 'the', 'best', 'of', 'times'], ['it', 'was', 'the', 'worst', 'of', 'times'], ['it', 'was', 'the', 'age', 'of', 'wisdom'], ['it', 'was', 'the', 'age', 'of', 'foolishness']]


In [12]:
vocabulary = set([w for s in tokenized_sentences for w in s])
print (vocabulary)

{'it', 'the', 'best', 'age', 'of', 'times', 'was', 'wisdom', 'worst', 'foolishness'}


In [13]:
# Display the "dictionary"
[[w, i] for i,w in enumerate(vocabulary)]

[['it', 0],
 ['the', 1],
 ['best', 2],
 ['age', 3],
 ['of', 4],
 ['times', 5],
 ['was', 6],
 ['wisdom', 7],
 ['worst', 8],
 ['foolishness', 9]]

## One hot encoding

In [14]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[1, 1, 1, 0, 1, 1, 1, 0, 0, 0]: It was the best of times,
[1, 1, 0, 0, 1, 1, 1, 0, 1, 0]: it was the worst of times,
[1, 1, 0, 1, 1, 0, 1, 1, 0, 0]: it was the age of wisdom,
[1, 1, 0, 1, 1, 0, 1, 0, 0, 1]: it was the age of foolishness.


In [15]:
# Encoding a new document
onehot_encode("the age of wisdom is the best of times".split())

[0, 1, 1, 1, 1, 1, 0, 1, 0, 0]

In [16]:
# out of vocabulary
onehot_encode("John likes to watch movies Mary likes movies too".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [17]:
# document term matrix
df = pd.DataFrame(onehot,columns=vocabulary)
df

Unnamed: 0,it,the,best,age,of,times,was,wisdom,worst,foolishness
0,1,1,1,0,1,1,1,0,0,0
1,1,1,0,0,1,1,1,0,1,0
2,1,1,0,1,1,0,1,1,0,0
3,1,1,0,1,1,0,1,0,0,1


In [18]:
# Cosine Similarity 
dist_out = 1-pairwise_distances(df, metric="cosine")
dist_out

array([[1.        , 0.83333333, 0.66666667, 0.66666667],
       [0.83333333, 1.        , 0.66666667, 0.66666667],
       [0.66666667, 0.66666667, 1.        , 0.83333333],
       [0.66666667, 0.66666667, 0.83333333, 1.        ]])

In [19]:
np.dot(df, np.transpose(df))

array([[6, 5, 4, 4],
       [5, 6, 4, 4],
       [4, 4, 6, 5],
       [4, 4, 5, 6]])

Use sci-kit learn to perform one-hot encoding.
Warning:  this doesn't use the onehotencoder that's used for categorical data

In [20]:
lb = MultiLabelBinarizer()
lb.fit([vocabulary])
lb.transform(tokenized_sentences)

array([[0, 1, 0, 1, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

## Count Vectorizer
Two phases:
- Learn the vocabulary
- transform documents to vectors

In [21]:

cv = CountVectorizer()

In [22]:
cv.fit(sentences)

CountVectorizer()

In [23]:
print (cv.get_feature_names())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [24]:
dt = cv.transform(sentences)
dt

<4x10 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [25]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,best,foolishness,it,of,the,times,was,wisdom,worst
0,0,1,0,1,1,1,1,1,0,0
1,0,0,0,1,1,1,1,1,0,1
2,1,0,0,1,1,1,0,1,1,0
3,1,0,1,1,1,1,0,1,0,0


In [26]:

cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [27]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3
0,1.0,0.833333,0.666667,0.666667
1,0.833333,1.0,0.666667,0.666667
2,0.666667,0.666667,1.0,0.833333
3,0.666667,0.666667,0.833333,1.0


In [28]:
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                              "Mary also likes to watch football games and basketball games.",
                              "They both like games and movies."]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,"It was the best of times,"
1,"it was the worst of times,"
2,"it was the age of wisdom,"
3,it was the age of foolishness.
4,John likes to watch movies. Mary likes movies ...
5,Mary also likes to watch football games and ba...
6,They both like games and movies.


In [29]:
cv.fit(more_sentences)
print(cv.get_feature_names())
dt = cv.transform(more_sentences)
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names())

['age', 'also', 'and', 'basketball', 'best', 'both', 'foolishness', 'football', 'games', 'it', 'john', 'like', 'likes', 'mary', 'movies', 'of', 'the', 'they', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']


Unnamed: 0,age,also,and,basketball,best,both,foolishness,football,games,it,...,of,the,they,times,to,too,was,watch,wisdom,worst
0,0,0,0,0,1,0,0,0,0,1,...,1,1,0,1,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,1,0,1,0,0,1,0,0,1
2,1,0,0,0,0,0,0,0,0,1,...,1,1,0,0,0,0,1,0,1,0
3,1,0,0,0,0,0,1,0,0,1,...,1,1,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
5,0,1,1,1,0,0,0,1,2,0,...,0,0,0,0,1,0,0,1,0,0
6,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [30]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.833333,0.666667,0.666667,0.0,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.40032,0.226455
5,0.0,0.0,0.0,0.0,0.40032,1.0,0.353553
6,0.0,0.0,0.0,0.0,0.226455,0.353553,1.0


## TF - IDF
### Term Frequency
${tf} (t,d)={\frac {f_{t,d}}{\sum _{t'\in d}{f_{t',d}}}}$

$f_{t,d}$ is the raw count of a particular term in the document.

### Inverse Document Frequency
Measure of how much information a given word provides.  Punishes common words.

$ {idf}(t, D) =  \log \frac{N}{|\{d \in D: t \in d\}|} $

$ N $ total number of documents in the corpus

$ |\{d \in D: t \in d\}| $ number of documents in which the term appears

In [31]:

tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [32]:
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,and,basketball,best,both,foolishness,football,games,it,...,of,the,they,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.0,0.0,0.55841,0.0,0.0,0.0,0.0,0.343991,...,0.343991,0.343991,0.0,0.463528,0.0,0.0,0.343991,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343991,...,0.343991,0.343991,0.0,0.463528,0.0,0.0,0.343991,0.0,0.0,0.55841
2,0.463528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343991,...,0.343991,0.343991,0.0,0.0,0.0,0.0,0.343991,0.0,0.55841,0.0
3,0.463528,0.0,0.0,0.0,0.0,0.0,0.55841,0.0,0.0,0.343991,...,0.343991,0.343991,0.0,0.0,0.0,0.0,0.343991,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.268196,0.323094,0.0,0.268196,0.0,0.0
5,0.0,0.329665,0.273651,0.329665,0.0,0.0,0.0,0.329665,0.547301,0.0,...,0.0,0.0,0.0,0.0,0.273651,0.0,0.0,0.273651,0.0,0.0
6,0.0,0.0,0.368759,0.0,0.0,0.444241,0.0,0.0,0.368759,0.0,...,0.0,0.0,0.444241,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.688178,0.47332,0.47332,0.0,0.0,0.0
1,0.688178,1.0,0.47332,0.47332,0.0,0.0,0.0
2,0.47332,0.47332,1.0,0.688178,0.0,0.0,0.0
3,0.47332,0.47332,0.688178,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.36696,0.197799
5,0.0,0.0,0.0,0.0,0.36696,1.0,0.302733
6,0.0,0.0,0.0,0.0,0.197799,0.302733,1.0


## TF-IDF with n-grams

In [34]:
tfidf = TfidfVectorizer(ngram_range=(1,2))
dt = tfidf.fit_transform(more_sentences)
print(dt.shape)

(7, 55)


In [35]:
pd.DataFrame(dt.toarray(), columns=tfidf.get_feature_names())

Unnamed: 0,age,age of,also,also likes,and,and basketball,and movies,basketball,basketball games,best,...,to watch,too,was,was the,watch,watch football,watch movies,wisdom,worst,worst of
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.387639,...,0.0,0.0,0.238793,0.238793,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.238793,0.238793,0.0,0.0,0.0,0.0,0.387639,0.387639
2,0.329565,0.329565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.244575,0.244575,0.0,0.0,0.0,0.397025,0.0,0.0
3,0.329565,0.329565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.244575,0.244575,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.201577,0.242839,0.0,0.0,0.201577,0.0,0.242839,0.0,0.0,0.0
5,0.0,0.0,0.240643,0.240643,0.199754,0.240643,0.0,0.240643,0.240643,0.0,...,0.199754,0.0,0.0,0.0,0.199754,0.240643,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.265756,0.0,0.320155,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
tfidf = TfidfVectorizer(ngram_range=(1,3))
dt = tfidf.fit_transform(more_sentences)
print(dt.shape)
pd.DataFrame(dt.toarray(), columns=tfidf.get_feature_names())

(7, 84)


Unnamed: 0,age,age of,age of foolishness,age of wisdom,also,also likes,also likes to,and,and basketball,and basketball games,...,was the worst,watch,watch football,watch football games,watch movies,watch movies mary,wisdom,worst,worst of,worst of times
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.315685,0.0,0.0,0.0,0.0,0.0,0.0,0.315685,0.315685,0.315685
2,0.275147,0.275147,0.0,0.331468,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.331468,0.0,0.0,0.0
3,0.275147,0.275147,0.331468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.170702,0.0,0.0,0.205644,0.205644,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.20017,0.20017,0.20017,0.166158,0.20017,0.20017,...,0.0,0.166158,0.20017,0.20017,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223807,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tfidf = TfidfVectorizer(ngram_range=(2,3), min_df=2)
dt = tfidf.fit_transform(more_sentences)
print(dt.shape)
pd.DataFrame(dt.toarray(), columns=tfidf.get_feature_names())

(7, 12)


Unnamed: 0,age of,games and,it was,it was the,likes to,likes to watch,of times,the age,the age of,to watch,was the,was the age
0,0.0,0.0,0.455688,0.455688,0.0,0.0,0.61404,0.0,0.0,0.0,0.455688,0.0
1,0.0,0.0,0.455688,0.455688,0.0,0.0,0.61404,0.0,0.0,0.0,0.455688,0.0
2,0.420621,0.0,0.312149,0.312149,0.0,0.0,0.0,0.420621,0.420621,0.0,0.312149,0.420621
3,0.420621,0.0,0.312149,0.312149,0.0,0.0,0.0,0.420621,0.420621,0.0,0.312149,0.420621
4,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0
5,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.5,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
