# Text Similarity Measures

### TextBlob Demo

###### Tokenization

In [1]:
from textblob import TextBlob

In [3]:
my_text = TextBlob("We're moving from NLTK to TextBlob. How fun!")

print(my_text.words)

['We', "'re", 'moving', 'from', 'NLTK', 'to', 'TextBlob', 'How', 'fun']


###### Spell Check

In [9]:
blob = TextBlob("I'm graat at writig.")

print(blob.correct())

I'm great at writing.


### Text Format for Analysis: Count Vectorizer

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
corpus = ['This is the first document.',
          'This is the second document.', 
          'And the third one. one is fun.' 
         ]

In [35]:
cv = CountVectorizer()
X = cv.fit_transform(corpus)

pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,1,0,1
2,1,0,0,1,1,2,0,1,1,0


### Document Similarity

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
corpus = ['The weather is hot under the sun',
          'I make my hot chocolate with milk',
          'One hot encoding',
          'I will have a chai latte with milk', 
          'There is a hot sale today']

In [48]:
# create the document-term matrix with count vectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(corpus).toarray()

df = pd.DataFrame(X, columns=cv.get_feature_names())
df

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0,0,0,1,0,0,0,0,1,0,1
1,0,1,0,1,0,1,1,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,1,0


In [49]:
# calculate the cosine similarity between all combinations of documents
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
pairs = list(combinations(range(len(corpus)),2))
combos = [(corpus[a_index], corpus[b_index]) for (a_index, b_index) in pairs]
results = [cosine_similarity([X[a_index]], [X[b_index]]) for (a_index, b_index) in pairs]

sorted(zip(results, combos), reverse=True)

[(array([[0.40824829]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.40824829]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.35355339]]),
  ('I make my hot chocolate with milk', 'One hot encoding')),
 (array([[0.33333333]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('The weather is hot under the sun', 'I make my hot chocolate with milk')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]

### (TF-IDF)  Term Frequency-Inverse Document Frequency

### Count Vectorizer vs TF-IDF Vectorizer

In [70]:
corpus = ['This is the first document.',
          'This is the second document.', 
          'And the third one. one is fun.' 
         ]

In [73]:
# original Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

pd.DataFrame(X, columns=cv.get_feature_names())

Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,1,0,1
2,1,0,0,1,1,2,0,1,1,0


In [77]:
# new TF-IDF Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
cv_tfidf = TfidfVectorizer()
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()

pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names())

Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0.0,0.450145,0.591887,0.0,0.349578,0.0,0.0,0.349578,0.0,0.450145
1,0.0,0.450145,0.0,0.0,0.349578,0.0,0.591887,0.349578,0.0,0.450145
2,0.36043,0.0,0.0,0.36043,0.212876,0.72086,0.0,0.212876,0.36043,0.0


#### Document Similarity: Example with TF-IDF

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['The weather is hot under the sun',
'I make my hot chocolate with milk',
'One hot encoding',
'I will have a chai latte with milk',
'There is a hot sale today']

In [96]:
# create the document-term matrix with TF-IDF vectorizer
cv_tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()

df_tfidf = pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names())
df_tfidf

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0.0,0.0,0.0,0.370086,0.0,0.0,0.0,0.0,0.6569,0.0,0.6569
1,0.0,0.580423,0.0,0.327,0.0,0.580423,0.468282,0.0,0.0,0.0,0.0
2,0.0,0.0,0.871247,0.490845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.614189,0.0,0.0,0.0,0.614189,0.0,0.495524,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.370086,0.0,0.0,0.0,0.6569,0.0,0.6569,0.0


In [89]:
pairs = list(combinations(range(len(corpus)),2))
combos = [(corpus[a_index], corpus[b_index]) for (a_index, b_index) in pairs]

In [97]:
# calculate the cosine similarity for all pairs of phrases and sort by most similar
results_tfidf = [cosine_similarity(X_tfidf[a_index], X_tfidf[b_index]) for (a_index, b_index) in pairs]

sorted(zip(results_tfidf, combos), reverse=True)

ValueError: Expected 2D array, got 1D array instead:
array=[0.         0.         0.         0.37008621 0.         0.
 0.         0.         0.65690037 0.         0.65690037].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.