# TextBlob

---
# 1. Tokenization

In [3]:
from textblob import TextBlob

# Wrap the raw sentence in a TextBlob; its `.words` attribute is the word-level
# tokenization (a WordList), shown as the cell's output.
sentence = "We're moving from NLTK to TextBlob. How fun!"
my_text = TextBlob(sentence)
my_text.words

WordList(['We', "'re", 'moving', 'from', 'NLTK', 'to', 'TextBlob', 'How', 'fun'])

# 2. Spell Check

In [4]:
# Build a blob from a deliberately misspelled sentence, then print the result of
# TextBlob's `correct()` spelling correction.
misspelled = "I'm graat at speling."
blob = TextBlob(misspelled)
corrected = blob.correct()
print(corrected)

I'm great at spelling.


---
# Cosine Similarity

In [7]:
from numpy import dot
from numpy.linalg import norm


def cosine(v1, v2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))``.

    Parameters
    ----------
    v1, v2 : sequences of numbers (lists or arrays) with the same length.

    Returns
    -------
    The cosine of the angle between ``v1`` and ``v2``.

    Notes
    -----
    There is no guard for zero vectors: a zero norm makes the denominator 0,
    so the result is not a meaningful similarity in that case.
    """
    return dot(v1, v2) / (norm(v1) * norm(v2))


cosine([1, 1, 1, 0], [1, 1, 0, 1])

0.6666666666666667

---
# Count Vectorizer

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is the first document.',
          'This is the second document.',
          'And the third one. One is fun.']

# Learn the vocabulary from the corpus and build the document-term count matrix
# (one row per document, one column per vocabulary term).
cv = CountVectorizer()
X = cv.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (available since 1.0) returns the same column labels.
pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())


Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,1,0,1
2,1,0,0,1,1,2,0,1,1,0


# TF-IDF

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['This is the first document.',
          'This is the second document.',
          'And the third one. One is fun.']

# Same corpus as the count-vectorizer example, but weighted with TF-IDF
# instead of raw term counts.
cv_tfidf = TfidfVectorizer()
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (available since 1.0) returns the same column labels.
pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names_out())


Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0.0,0.450145,0.591887,0.0,0.349578,0.0,0.0,0.349578,0.0,0.450145
1,0.0,0.450145,0.0,0.0,0.349578,0.0,0.591887,0.349578,0.0,0.450145
2,0.36043,0.0,0.0,0.36043,0.212876,0.72086,0.0,0.212876,0.36043,0.0


# Count Vectorizer VS TF-IDF

In [20]:
from itertools import combinations

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = ['The weather is hot under the sun',
          'One hot encoding',
          'I will have a chai latte with milk',
          'There is a hot sale today']

# create the document-term matrix with count vectorizer (English stop words removed)
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(corpus).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (available since 1.0) returns the same column labels.
dt = pd.DataFrame(X, columns=cv.get_feature_names_out())

# list every unordered pair of document indices (len(corpus) choose 2)
# together with the corresponding pair of phrases
pairs = list(combinations(range(len(corpus)), 2))
combos = [(corpus[a_index], corpus[b_index]) for (a_index, b_index) in pairs]

# calculate the cosine similarity for all pairs of phrases and sort by most similar;
# each entry of `results` is the 1x1 array returned by cosine_similarity
results = [cosine_similarity([X[a_index]], [X[b_index]]) for (a_index, b_index) in pairs]
sorted(zip(results, combos), reverse=True)

[(array([[0.40824829]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.40824829]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.33333333]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the document-term matrix with TF-IDF vectorizer (English stop words removed);
# `corpus` and `pd` come from the previous cell
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (available since 1.0) returns the same column labels.
dt_tfidf = pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names_out())
dt_tfidf

Unnamed: 0,chai,encoding,hot,latte,milk,sale,sun,today,weather
0,0.0,0.0,0.411378,0.0,0.0,0.0,0.644503,0.0,0.644503
1,0.0,0.842926,0.538029,0.0,0.0,0.0,0.0,0.0,0.0
2,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
3,0.0,0.0,0.411378,0.0,0.0,0.644503,0.0,0.644503,0.0
