# Sklearn Text
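A demo of the text-processing functionality in scikit-learn: word counts (`CountVectorizer`) and TF-IDF weights (`TfidfVectorizer`), each visualized as a word cloud.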

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

### Create messages
List of 3 messages

In [None]:
# Toy dataset: three short text messages used by every cell below
list_message = [
    "Call me soon",
    "CALL to win",
    "Pick me up soon",
]

### Process text: using sklearn CountVectorizer

In [None]:
# Create instance of count vectorizer
vectorizer = CountVectorizer()
# Learn the vocabulary from the messages and count each word per message
Xfit = vectorizer.fit_transform(list_message)
# Generate dense feature matrix (transpose so sample axis is in column
# direction: one row per vocabulary word, one column per message)
X = Xfit.toarray().T
print("X: \n{}".format(X))
# List words in vocabulary (as a numpy array of strings).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on older releases.
# dtype=str keeps the same string dtype the original list-based call produced.
if hasattr(vectorizer, "get_feature_names_out"):
    words = np.asarray(vectorizer.get_feature_names_out(), dtype=str)
else:  # scikit-learn < 1.0
    words = np.asarray(vectorizer.get_feature_names(), dtype=str)
print("words: {}".format(words))

In [None]:
# Total occurrences of each word across all messages.
# Rows of X are words and columns are messages, so summing along axis 1
# adds up the per-message counts for every word.
word_count = X.sum(axis=1)
for word, total in zip(words, word_count):
    print("{}  \t count: {}".format(word, total))

### Word Cloud
Create a word cloud using the "frequencies" approach (`generate_from_frequencies`), with each word's total count as its frequency.

In [None]:
# Map every vocabulary word (key) to its total count (value) by pairing the
# two parallel arrays element by element
dict_freq = dict(zip(words, word_count))
print("dictionary: {}".format(dict_freq))

In [None]:
# Configure the word cloud first, then fill it from the word -> count mapping
# (generate_from_frequencies populates the instance it is called on)
wc = WordCloud(
    background_color="white",
    width=1000,
    height=600,
    random_state=11,
)
wc.generate_from_frequencies(dict_freq)
# Render the generated cloud as an image
plt.figure()
plt.imshow(wc)
plt.show()

### Process text: using sklearn TfidfVectorizer

In [None]:
# Create instance of tf-idf vectorizer
vectorizer_tfidf = TfidfVectorizer()
# Learn the vocabulary from the messages and compute tf-idf weights per message
Xfit_tfidf = vectorizer_tfidf.fit_transform(list_message)
# Generate dense feature matrix (transpose so sample axis is in column
# direction: one row per vocabulary word, one column per message)
X_tfidf = Xfit_tfidf.toarray().T
print("X_tfidf: \n{}".format(X_tfidf))
# List words in vocabulary (as a numpy array of strings).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on older releases.
# dtype=str keeps the same string dtype the original list-based call produced.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    words_tfidf = np.asarray(vectorizer_tfidf.get_feature_names_out(), dtype=str)
else:  # scikit-learn < 1.0
    words_tfidf = np.asarray(vectorizer_tfidf.get_feature_names(), dtype=str)
print("Words: {}".format(words_tfidf))

### Generate Word Cloud
Use the TF-IDF data: each word's weight is its TF-IDF score summed over all messages.

In [None]:
# Overall tf-idf weight of each word: rows of X_tfidf are words and columns
# are messages, so summing along axis 1 adds up the per-message weights.
word_weight_tfidf = X_tfidf.sum(axis=1)
for word, weight in zip(words_tfidf, word_weight_tfidf):
    print("{}  \t weight: {}".format(word, weight))

In [None]:
# Map every vocabulary word (key) to its summed tf-idf weight (value) by
# pairing the two parallel arrays element by element
dict_freq_tfidf = dict(zip(words_tfidf, word_weight_tfidf))
print("dictionary: {}".format(dict_freq_tfidf))

In [None]:
# Configure the word cloud first, then fill it from the word -> tf-idf weight
# mapping (generate_from_frequencies populates the instance it is called on)
wc = WordCloud(
    background_color="white",
    width=1000,
    height=600,
    random_state=11,
)
wc.generate_from_frequencies(dict_freq_tfidf)
# Render the generated cloud as an image
plt.figure()
plt.imshow(wc)
plt.show()