# Sklearn Text
Demo of the text-processing functionality in scikit-learn (`CountVectorizer` and `TfidfVectorizer`)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

### Create messages
List of 3 messages

In [None]:
# Example corpus: three short messages used by every cell in this notebook
list_message = [
    "Call me soon",
    "CALL to win",
    "Pick me up soon",
]

### Process text: using sklearn CountVectorizer

In [None]:
# Create instance of count vectorizer
vectorizer = CountVectorizer()
# Learn the vocabulary from the messages and count word occurrences
# (the result is converted to a dense array below via .toarray())
Xfit = vectorizer.fit_transform(list_message)
# Generate feature matrix (transpose so the sample axis is along columns:
# one row per word, one column per message)
X = Xfit.toarray().T
print("X: \n{}".format(X))
# List the words in the vocabulary as a numpy array of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method
# only on releases that predate the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    words = np.asarray(vectorizer.get_feature_names_out(), dtype=str)
else:
    words = np.array(vectorizer.get_feature_names())
print("Words: {}".format(words))

### Process text: using sklearn TfidfVectorizer

In [None]:
# Create instance of tf-idf vectorizer
vectorizer_tfidf = TfidfVectorizer()
# Learn the vocabulary from the messages and compute the tf-idf weights
# (not raw word counts, unlike CountVectorizer)
Xfit_tfidf = vectorizer_tfidf.fit_transform(list_message)
# Generate feature matrix (transpose so the sample axis is along columns:
# one row per word, one column per message)
X_tfidf = Xfit_tfidf.toarray().T
print("X_tfidf: \n{}".format(X_tfidf))
# List the words in the vocabulary as a numpy array of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method
# only on releases that predate the new one.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    words_tfidf = np.asarray(vectorizer_tfidf.get_feature_names_out(), dtype=str)
else:
    words_tfidf = np.array(vectorizer_tfidf.get_feature_names())
print("Words: {}".format(words_tfidf))

### How is the tf-idf matrix calculated?

In [None]:
# The term-frequency matrix is simply X, the output of CountVectorizer
# (rows are words, columns are documents).
print("Term Frequency: \n{}".format(X))
# Document frequency: for each word (row), count the documents (columns)
# in which it occurs at least once. keepdims=True keeps df as a column
# vector so it lines up with the rows of X.
df = (X > 0).sum(axis=1, keepdims=True)
print("Document Frequency: \n{}".format(df))

In [None]:
# Inverse document frequency with add-one smoothing:
#   idf = ln((1 + ndoc) / (1 + df)) + 1
# The +1 inside the ratio avoids division by zero; the trailing +1 keeps
# words that occur in every document from getting a weight of zero.
ndoc = len(list_message)
idf = 1 + np.log(np.divide(ndoc + 1, df + 1))
print("idf: \n{}".format(idf))

In [None]:
# tf-idf: weight every term count by the idf of its word
# (idf is a column vector, so it broadcasts across the document columns)
tfidf = X*idf
# Scale so each column (document) has Euclidean length 1
tfidf_norm = np.sqrt(np.sum(np.square(tfidf), axis=0, keepdims=False))
# A document containing no vocabulary words has norm 0; dividing by it would
# produce NaNs. Use 1 instead so such a column stays all zeros.
tfidf_norm = np.where(tfidf_norm == 0, 1.0, tfidf_norm)
tfidf = tfidf/tfidf_norm
# Show the sklearn result next to the hand-computed one for comparison
print("X_tfidf using sklearn: \n{}".format(X_tfidf))
print("Computed tfidf \n{}".format(tfidf))