## Text

In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import re
import nltk

%matplotlib inline

In [2]:
# Make sure NLTK's 'stopwords' corpus is available locally; it is read later
# via nltk.corpus.stopwords when building the stop-word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jbslanka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Detect whether we are running in the Google Colaboratory environment.
# If so, use an https URL to access the data and switch plotly to the Colab
# renderer.  Otherwise, load via the (relative) file path.
#
# Only ImportError is caught: that is what a missing `google.colab` (or
# `plotly`) module raises.  A bare `except:` would also hide unrelated
# failures such as KeyboardInterrupt or a typo inside the try body.
try:
  import google.colab
  data_file_prefix = "https://raw.githubusercontent.com/slankas/DataScienceNotebooks/master//FeatureCreation/"
  import plotly.io as pio
  pio.renderers.default = 'colab'
except ImportError:
  data_file_prefix = ""

In [4]:
# Build the CSV location from the environment-specific prefix (empty string
# locally, GitHub raw URL on Colab), load it, and preview the first rows.
spam_csv_location = data_file_prefix + "data/spam.csv"
df = pd.read_csv(spam_csv_location)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [5]:
# Summary statistics per column (count / unique / top / freq in the output below).
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
# Snapshot of the stop-word list as a set so that every per-token membership
# test is a hash lookup instead of a scan over the whole list.
_stop_word_set = frozenset(stop_words)

def normalize_document(doc):
    """Return a cleaned, stop-word-free version of ``doc``.

    Processing steps, in order:
      1. remove every character that is not an ASCII letter, a digit, or
         whitespace;
      2. lower-case the text and strip leading/trailing whitespace;
      3. tokenize with the module-level ``wpt`` tokenizer;
      4. drop tokens that appear in the English stop-word list;
      5. join the remaining tokens with single spaces.

    Args:
        doc: the raw text of one document (a ``str``).

    Returns:
        The normalized text as a single space-separated string.

    Note:
        Punctuation is removed *before* stop words are filtered, so a
        contraction such as "don't" becomes "dont" and is kept in the output.
    """
    # remove special characters (anything but letters, digits, whitespace)
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    # lower case and trim surrounding whitespace
    doc = doc.lower().strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in _stop_word_set]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [7]:
# Normalize every message; the function is passed directly since it already
# takes exactly one argument (no lambda wrapper needed).
df['cleanText'] = df['Message'].apply(normalize_document)
df.head()

Unnamed: 0,Category,Message,cleanText
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci...",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goes usf lives around though


In [8]:
import collections

# Tally how often each token occurs across all cleaned messages.
results = collections.Counter()
for message_tokens in df['cleanText'].str.split():
    results.update(message_tokens)

# Show the ten most frequent tokens.
for word, count in results.most_common(10):
    print(word, ": ", count)

u :  1130
call :  575
2 :  482
im :  473
ur :  390
get :  386
dont :  298
4 :  293
go :  281
ok :  278


# Bag of Words Model


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: one row per message, one column per vocabulary term.
cv = CountVectorizer(min_df=0., max_df=1.)
# fit_transform() yields a sparse matrix; it is converted to a dense array
# here so it can be displayed and wrapped in a DataFrame in the next cell.
cv_matrix = cv.fit_transform(df['cleanText']).toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
_get_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
vocab = list(_get_feature_names())
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Bag of N-Grams Model

In [11]:
# Bag of bi-grams: ngram_range=(2,2) keeps only pairs of adjacent tokens.
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(df['cleanText'])
bv_matrix = bv_matrix.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
_get_feature_names = getattr(bv, 'get_feature_names_out', None) or bv.get_feature_names
vocab = list(_get_feature_names())
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,zed pobox,zeros savings,zhong se,zindgi wo,zoe 18,zoe hit,zogtorius ive,zoom cine,zouk nichols,zyada kisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# TF - IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted document-term matrix for the cleaned messages.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(df['cleanText'])
tv_matrix = tv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
_get_feature_names = getattr(tv, 'get_feature_names_out', None) or tv.get_feature_names
vocab = list(_get_feature_names())
# Rounded to two decimals for display only; tv_matrix itself is unchanged.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# document similarity via cosine

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of TF-IDF document vectors;
# entry (i, j) compares message i with message j.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(data=similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5562,5563,5564,5565,5566,5567,5568,5569,5570,5571
0,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.036959,0.0,0.0,0.0,0.000000,0.0
1,0.0,1.0,0.000000,0.0,0.0,0.040733,0.000000,0.000000,0.000000,0.000000,...,0.047576,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.034093,...,0.000000,0.000000,0.000000,0.0,0.039992,0.0,0.0,0.0,0.020001,0.0
3,0.0,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.137548,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.060819,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.035963,0.117038,0.015250,...,0.000000,0.000000,0.000000,0.0,0.016760,1.0,0.0,0.0,0.000000,0.0
5568,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0
5569,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,1.0,0.000000,0.0
5570,0.0,0.0,0.020001,0.0,0.0,0.083140,0.073823,0.000000,0.000000,0.051458,...,0.000000,0.066952,0.000000,0.0,0.028277,0.0,0.0,0.0,1.000000,0.0


In [16]:
# cluster via document similarity

In [17]:
from sklearn.cluster import KMeans

# K-means starts from random centroids, so a fixed seed is required for the
# cluster assignments to be reproducible across "Restart & Run All".
km = KMeans(n_clusters=25, random_state=42)
# Only the fitted labels are needed; fit() avoids computing the
# sample-to-centroid distance matrix that fit_transform() would return.
km.fit(similarity_df)
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])
# Show each message next to its assigned cluster (df itself is not modified).
pd.concat([df, cluster_labels], axis=1)

Unnamed: 0,Category,Message,cleanText,ClusterLabel
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci...",go jurong point crazy available bugis n great world la e buffet cine got amore wat,15
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,4
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry...,11
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,15
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goes usf lives around though,21
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 cla...,2nd time tried 2 contact u u 750 pound prize 2 claim easy call 087187272008 now1 10p p...,9
5568,ham,Will ü b going to esplanade fr home?,b going esplanade fr home,17
5569,ham,"Pity, * was in mood for that. So...any other suggestions?",pity mood soany suggestions,15
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else ...,guy bitching acted like id interested buying something else next week gave us free,10


In [18]:
# Bi-grams

In [19]:
# Bi-gram vectorizer with a custom token pattern: \w+ accepts tokens of any
# length, including single-character ones.
bigram_converter = CountVectorizer(
    ngram_range=(2,2),
    token_pattern='(?u)\\b\\w+\\b',
)
x2 = bigram_converter.fit_transform(df['cleanText'])

In [20]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.  list() keeps `bigrams` a plain
# Python list regardless of which method is used.
_get_feature_names = (getattr(bigram_converter, 'get_feature_names_out', None)
                      or bigram_converter.get_feature_names)
bigrams = list(_get_feature_names())
print(len(bigrams))
bigrams[-10:]

33372


['zed pobox',
 'zeros savings',
 'zhong se',
 'zindgi wo',
 'zoe 18',
 'zoe hit',
 'zogtorius ive',
 'zoom cine',
 'zouk nichols',
 'zyada kisi']

## Bag of Words and Similarity Scoring Example
Sample corpus and code from https://stackoverflow.com/questions/8897593/how-to-compute-the-similarity-between-two-text-documents

$$ tfidf( t, d, D ) = tf( t, d ) \times idf( t, D ) $$

_t_ denotes a specific term\
_d_ denotes a specific document\
_D_ is the collection of documents

$$ idf( t, D ) = \log \frac{ | D | }{ 1 + | \{ d \in D : t \in d \} | } $$

Term frequency simply counts the number of times each word appears in a document.

The inverse document frequency applies a weight that reduces the impact (significance) of words that appear in many documents.\
So, if we have 100 documents (using a base-10 logarithm and, for simplicity, ignoring the "1+" in the denominator):
* if a word appears in just one of them, then idf = log(100/1) = 2
* if a word appears in 10 of them, idf = log(100/10) = 1
* if a given word appears in all of them, idf = log(100/100) = 0.

Intuitively, as a term appears in more documents, the ratio becomes closer to 1, and hence the IDF value moves toward zero, and thus the overall tf-idf value goes toward zero.  

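The "1+" in the denominator handles situations in which the word doesn't appear in any document, which would otherwise cause a division by zero. Note that scikit-learn's `TfidfVectorizer`, used below, applies a smoothed variant of this formula and L2-normalizes each document vector by default, so its values will not match a hand calculation with the formula above.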


In [21]:
# Six short example sentences used for the bag-of-words and similarity demo.
corpus = [
    "I'd like an apple",
    "An apple a day keeps the doctor away",
    "Never compare an apple to an orange",
    "I prefer scikit-learn to Orange",
    "The scikit-learn docs are Orange and Blue",
    "an orange apple is better than a blue apple",
]


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts for the sample corpus, with English stop words removed.
countVectorizer = CountVectorizer(min_df=1, stop_words="english")
tf = countVectorizer.fit_transform(corpus)
tfDF = pd.DataFrame.sparse.from_spmatrix(tf)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
_get_feature_names = (getattr(countVectorizer, 'get_feature_names_out', None)
                      or countVectorizer.get_feature_names)
tfDF.columns = list(_get_feature_names())
tfDF

Unnamed: 0,apple,away,better,blue,compare,day,docs,doctor,keeps,learn,like,orange,prefer,scikit
0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,1,0,1,1,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,0,1,1,1
4,0,0,0,1,0,0,1,0,0,1,0,1,0,1
5,2,0,1,1,0,0,0,0,0,0,0,1,0,0


In [23]:
# TF-IDF weights for the sample corpus, with English stop words removed.
tfidfVectorizer = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = tfidfVectorizer.fit_transform(corpus)
# NOTE(review): `df` held the spam DataFrame loaded earlier in this notebook;
# rebinding it here means the earlier spam cells cannot be re-run without
# reloading the data first.  Consider a distinct name such as `tfidfDF`.
df = pd.DataFrame.sparse.from_spmatrix(tfidf)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
_get_feature_names = (getattr(tfidfVectorizer, 'get_feature_names_out', None)
                      or tfidfVectorizer.get_feature_names)
df.columns = list(_get_feature_names())
df

Unnamed: 0,apple,away,better,blue,compare,day,docs,doctor,keeps,learn,like,orange,prefer,scikit
0,0.510227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.86004,0.0,0.0,0.0
1,0.284382,0.479356,0.0,0.0,0.0,0.479356,0.0,0.479356,0.479356,0.0,0.0,0.0,0.0,0.0
2,0.454486,0.0,0.0,0.0,0.766084,0.0,0.0,0.0,0.0,0.0,0.0,0.454486,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49934,0.0,0.36126,0.608941,0.49934
4,0.0,0.0,0.0,0.446742,0.0,0.0,0.544797,0.0,0.0,0.446742,0.0,0.323206,0.0,0.446742
5,0.640454,0.0,0.539776,0.442624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320227,0.0,0.0


In [24]:
# Document-by-document dot products of the raw count vectors.
# `@` is the explicit matrix product; `*` only means matrix multiplication for
# the legacy scipy.sparse *matrix* classes and is element-wise for sparse
# arrays, so `@` is the unambiguous spelling.
tf_pairwise_similarity = tf @ tf.T
tfSimilarity = tf_pairwise_similarity.toarray()
print(tfSimilarity)
# Zero the diagonal (each document's product with itself) after printing, so
# the printed matrix above still shows the original diagonal.
np.fill_diagonal(tfSimilarity, 0.0)

[[2 1 1 0 0 2]
 [1 5 1 0 0 2]
 [1 1 3 1 1 3]
 [0 0 1 4 3 1]
 [0 0 1 3 5 2]
 [2 2 3 1 2 7]]


In [25]:
# Document-by-document dot products of the TF-IDF vectors.
# `@` is the explicit matrix product; `*` only means matrix multiplication for
# the legacy scipy.sparse *matrix* classes and is element-wise for sparse
# arrays, so `@` is the unambiguous spelling.
tf_idf_pairwise_similarity = tfidf @ tfidf.T
tfIDFSimilarity = tf_idf_pairwise_similarity.toarray()
print(tfIDFSimilarity)
# Replace the diagonal (self-similarity) with NaN after printing so that a
# NaN-aware argmax on a row never selects the document itself.
np.fill_diagonal(tfIDFSimilarity, np.nan)

[[1.         0.14509928 0.23189098 0.         0.         0.32677656]
 [0.14509928 1.         0.12924773 0.         0.         0.18213356]
 [0.23189098 0.12924773 1.         0.16418768 0.14689265 0.43661616]
 [0.         0.         0.16418768 1.         0.56291361 0.11568515]
 [0.         0.         0.14689265 0.56291361 1.         0.30123766]
 [0.32677656 0.18213356 0.43661616 0.11568515 0.30123766 1.        ]]


In [26]:
# Row 4 holds the similarities between corpus[4] and every document; the
# NaN-aware argmax picks the highest-scoring entry while ignoring NaN values.
similarities_to_doc_4 = tfIDFSimilarity[4]
result_idx = np.nanargmax(similarities_to_doc_4)
corpus[result_idx]

'I prefer scikit-learn to Orange'