# TOC

__Lab 06 - Text analysis__
1. [Import](#Import)
1. [Representing text as numberal data](#Representing-text-as-numberal-data)
    1. [Example 1 - learn a small vocabulary](#learn-a-small-vocabulary-Example1)
1. [Case study - text message analysis](#Case-study-text-message-analysis)
    1. [Classify with multinomial naive bayes](#Classify-with-multinomial-naive-bayes)
    1. [Classify with logistic regression](#Classify-with-logistic-regression)
1. [Parameter tuning w/ CountVectorizer](#Parameter-tuning-w/-CountVectorizer)

# Import

<a id = "Import"></a>

In [1]:
import numpy as pd
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style="whitegrid", font_scale=1.3)

# Representing text as numberal data

Text data can be represented as numberal data by tokenization

<a id = "Representing-text-as-numberal-data"></a>

## Example - learn a small vocabulary

Text data can be represented as numberal data by 'tokenized'
- Tokenize the vocabulary learned from a small set of training data
- Transform a test string based on the training vocabulary

<a id = "learn-a-small-vocabulary-Example1"></a>

In [2]:
# load data
simple_train = ["call you tonight", "Call me a cab", "please call me... PLEASE!"]
vect = CountVectorizer()

# learn the 'vocabulary' of the training data
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [3]:
# Inspect
vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [4]:
# represent each sample in DataFrame
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm.toarray()

pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [5]:
# tokenize test data string
simpleTest = ["please don't call me"]

simple_test_dtm = vect.transform(simpleTest)
simple_test_dtm.toarray()

pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


> Notice that the word "don't" was ignored because this word was not in the learned vocabulary

# Case study - text message analysis - SPAM or not?

Build classifier to determine was an SMS text message is SPAM or not

<a id = "Case-study-text-message-analysis"></a>

In [6]:
# load data
sms = pd.read_csv(
    "s3://tdp-ml-datasets/misc/sms_spam_ham.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
    names=["label", "message"],
)
sms["labelNum"] = sms.label.map({"ham": 0, "spam": 1})

In [7]:
# inspect
sms.shape

(5574, 3)

In [8]:
# review messages and labels
X = sms["message"]
y = sms["labelNum"]
print(X.shape)
print(y.shape)

(5574,)
(5574,)


In [9]:
y

0       0
1       0
2       1
3       0
4       0
5       1
6       0
7       0
8       1
9       1
10      0
11      1
12      1
13      0
14      0
15      1
16      0
17      0
18      0
19      1
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
5544    0
5545    0
5546    0
5547    0
5548    0
5549    1
5550    0
5551    0
5552    0
5553    0
5554    0
5555    0
5556    0
5557    0
5558    0
5559    0
5560    0
5561    0
5562    0
5563    0
5564    0
5565    0
5566    0
5567    0
5568    1
5569    1
5570    0
5571    0
5572    0
5573    0
Name: labelNum, Length: 5574, dtype: int64

In [10]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4180,)
(1394,)
(4180,)
(1394,)


In [11]:
# learn the vocabulary - Vectorize the SMS dataset
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())[:7]

Unnamed: 0,00,000,008704050406,0089,0121,01223585236,01223585334,0125698789,02,0207,...,yup,zaher,zealand,zebra,zed,zhong,zoe,zoom,zouk,èn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# transform test set based on learned vocabulary
X_test_dtm = vect.transform(X_test)
pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names())[:7]

Unnamed: 0,00,000,008704050406,0089,0121,01223585236,01223585334,0125698789,02,0207,...,yup,zaher,zealand,zebra,zed,zhong,zoe,zoom,zouk,èn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Classify with multinomial naive bayes

<a id = "Classify-with-multinomial-naive-bayes"></a>

In [13]:
# create Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# test set predictions
y_pred_class = nb.predict(X_test_dtm)

In [15]:
# evaluate predictions
metrics.accuracy_score(y_test, y_pred_class)

0.9885222381635581

In [16]:
# create confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1206,    6],
       [  10,  172]])

In [17]:
# print message for false positives (ham incorrectly labeled spam)
X_test[y_test < y_pred_class]

3415                    No pic. Please re-send.
4600    Have you laid your airtel line to rest?
574                      Waiting for your call.
45             No calls..messages..missed calls
4702                     I liked the new mobile
4622         Received, understood n acted upon!
Name: message, dtype: object

In [18]:
# print message for false negatives (spam incorrectly labeled ham)
X_test[y_test > y_pred_class]

1875    Would you like to see my XXX pics they are so ...
4514    Money i have won wining number 946 wot do i do...
684     Hi I'm sue. I am 20 years old and work as a la...
5       FreeMsg Hey there darling it's been 3 week's n...
5037    You won't believe it but it's true. It's Incre...
3419    LIFE has never been this much fun and great un...
4069    TBS/PERSOLVO. been chasing us since Sept for£3...
2823    ROMCAPspam Everyone around should be respondin...
4256    Block Breaker now comes in deluxe format with ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
Name: message, dtype: object

In [19]:
# review specific example
X_test[2247]

KeyError: 2247

In [None]:
# calculated predicted probabilities for X_test_dtm
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate area under the curve score
metrics.roc_auc_score(y_test, y_pred_prob)

__Evaluate internal probabilities__

In [None]:
# gather feature names
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

In [None]:
# examine first fifty tokens
print(X_train_tokens[0:50])

In [None]:
# examine last fifty tokens
print(X_train_tokens[-50:])

In [None]:
# rows = classes, columns = tokens
nb.feature_count_

In [None]:
# number of times each token appears in each type of message
ham_token_count = nb.feature_count_[0, :]
spam_token_count = nb.feature_count_[1, :]

tokens = pd.DataFrame(
    {"token": X_train_tokens, "ham": ham_token_count, "spam": spam_token_count}
).set_index("token")
tokens[:7]

In [None]:
# sample from tokens
tokens.sample(10, random_state=9)

In [None]:
# add 1 to each token count to avoid div by 0
tokens["ham"] = tokens["ham"] + 1
tokens["spam"] = tokens["spam"] + 1
tokens.sample(10, random_state=9)

In [None]:
# covert ham and spam counts into frequencies
# divide the number of times a word appears by the total number of observations in that class
# these probabilities are used to calculate conditional probability for class designation
tokens["ham"] = tokens["ham"] / nb.class_count_[0]
tokens["spam"] = tokens["spam"] / nb.class_count_[1]
tokens.sample(10, random_state=9)

In [None]:
# add spam-to-ham ratio
tokens["spam_ratio"] = tokens["spam"] / tokens["ham"]
tokens.sample(10, random_state=9)

In [None]:
# sort by spam_ratio descending to see the 'spammiest' words
tokens.sort_values(["spam_ratio"], ascending=[False])[:10]

In [None]:
# sort by spam_ratio ascending to see the least 'spammiest' words
tokens.sort_values(["spam_ratio"], ascending=[True])[:10]

## Classify with logistic regression

<a id = "Classify-with-logistic-regression"></a>

In [None]:
# creaet and fit logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_dtm, y_train)

In [None]:
# test set predictions
y_pred_class = log_reg.predict(X_test_dtm)

In [None]:
# evaluate predictions
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# review predicted probabilities
y_pred_prob = log_reg.predict_proba(X_test_dtm)[:, 1]
metrics.roc_auc_score(y_test, y_pred_prob)

# Paramter tuning with CountVectorizer 


<a id = "Parameter-tuning-w/-CountVectorizer"></a>

In [None]:
# show default params
vect

In [None]:
# remove English stop words
vect = CountVectorizer(stop_words="English")

In [None]:
# expand scope of tokenization. a range of (1,1) makes tokens of single words
# a range of (1,2) expands the scope of tokeization so that each pair of words also becomes
# a token. this allows for context of word usage to enter the model, but makes the
# document-word matrix larger
vect = CountVectorizer(ngram_range=(1, 2))

In [None]:
# ignore terms that appear in X% or more of the documents
vect = CountVectorizer(max_df=0.5)

In [None]:
# only keep items that appear in X or more documents
vect = CountVectorizer(min_df=0.5)