<font size=5>SMS Text classification</font>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Import" data-toc-modified-id="Data-Import-1">Data Import</a></span><ul class="toc-item"><li><span><a href="#Define-some-functions" data-toc-modified-id="Define-some-functions-1.1">Define some functions</a></span></li></ul></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2">Preprocessing</a></span></li><li><span><a href="#Feature-Extraction" data-toc-modified-id="Feature-Extraction-3">Feature Extraction</a></span><ul class="toc-item"><li><span><a href="#Word-Count" data-toc-modified-id="Word-Count-3.1">Word Count</a></span></li><li><span><a href="#Tf-Idf" data-toc-modified-id="Tf-Idf-3.2">Tf-Idf</a></span></li><li><span><a href="#N-gram" data-toc-modified-id="N-gram-3.3">N-gram</a></span></li></ul></li><li><span><a href="#Text-Classification" data-toc-modified-id="Text-Classification-4">Text Classification</a></span><ul class="toc-item"><li><span><a href="#Naive-Bayes" data-toc-modified-id="Naive-Bayes-4.1"><a href="https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering" target="_blank">Naive Bayes</a></a></span></li><li><span><a href="#SVM" data-toc-modified-id="SVM-4.2">SVM</a></span></li><li><span><a href="#LogisticRegression" data-toc-modified-id="LogisticRegression-4.3">LogisticRegression</a></span></li><li><span><a href="#GBDT" data-toc-modified-id="GBDT-4.4">GBDT</a></span></li></ul></li><li><span><a href="#word2vec" data-toc-modified-id="word2vec-5">word2vec</a></span></li><li><span><a href="#Bert-TensorFlow" data-toc-modified-id="Bert-TensorFlow-6">Bert-TensorFlow</a></span></li></ul></div>

In [None]:
import warnings
warnings.filterwarnings('ignore')  # kept first so warnings raised while importing the libraries below are silenced too

import itertools
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to produce SVG figures.
%config InlineBackend.figure_format = 'svg'
# Use seaborn's 'white' style for the plots that follow.
sns.set_style('white') 

# Data Import

In [None]:
# CSV providing the "Category" and "Message" columns that the cells below read.
DATA_PATH = '../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv'
data = pd.read_csv(DATA_PATH)

# English stop words shipped with NLTK.
stopword_list = stopwords.words('english')

## Define some functions

In [None]:

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing/plotting.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of becoming NaN through 0/0.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cell annotations: two decimals for ratios, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Preprocessing

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# Guarded so that re-running the cell is a no-op: mapping an already-encoded
# column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data["Category"]):
    data["Category"] = data["Category"].map({'ham': 0,'spam':1})

In [None]:
# Preview the first rows again to check the Category column after the label mapping.
data.head()

#  Feature Extraction

There are several ways to extract features from text data, including the word-count method and tf-idf encoding. I will apply both and compare their effect on prediction accuracy.

## Word Count

In [None]:
# Clean every message: keep letters only, lower-case, tokenize, lemmatize each
# token, and join the tokens back into one space-separated string.
lemma = WordNetLemmatizer()  # built once instead of once per message
description_list = []
for article in data["Message"]:
    letters_only = re.sub("[^a-zA-Z]", " ", article).lower()
    lemmas = [lemma.lemmatize(word) for word in word_tokenize(letters_only)]
    description_list.append(" ".join(lemmas))
    
    
def text_replace(text):
    '''Return a cleaned, lower-cased copy of ``text``.

    The steps run in this order: lower-case the text, remove square-bracketed
    spans, remove URLs (``http(s)://...`` or ``www....``), remove ``<...>``
    tags, remove ASCII punctuation, remove newlines, and finally remove every
    word that contains a digit. Surrounding whitespace is left untouched, so
    removed pieces can leave double spaces behind.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

<center>
    <img style="border-radius: 0.3125em;
    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);" 
    src="https://i.loli.net/2019/11/18/kdH1gfSlezstUwL.png">
    <br>
    <div style="color:orange; border-bottom: 1px solid #d9d9d9;
    display: inline-block;
    color: #999;
    padding: 2px;">Word Count Vectorizer</div>
</center>

In [None]:
# Bag-of-words counts restricted to the 100 most frequent non-stop-word tokens.
count_vectorizer = CountVectorizer(max_features = 100, stop_words = "english")
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use the newer accessor
# when it exists and fall back to the old one on older installations.
_feature_names = getattr(count_vectorizer, "get_feature_names_out", None) or count_vectorizer.get_feature_names
tokens = list(_feature_names())

In [None]:
# Show the container type of the count features, then wrap them in a DataFrame
# whose columns are the vocabulary tokens.
print(type(sparce_matrix))
sparce_matrix = pd.DataFrame(sparce_matrix, columns=tokens)
sparce_matrix.head()

## Tf-Idf

 Term Frequency-Inverse Document Frequency

In [None]:
# TF-IDF weights restricted to the 100 most frequent tokens.
vectorizer = TfidfVectorizer(max_features = 100)
tfidfmatrix = vectorizer.fit_transform(description_list)
# get_feature_names() was removed in scikit-learn 1.2; use the newer accessor
# when it exists and fall back to the old one on older installations.
_tfidf_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
cname = list(_tfidf_feature_names())
tfidfmatrix = pd.DataFrame(tfidfmatrix.toarray(),columns=cname)
tfidfmatrix.head()

In [None]:
# List the tokens kept as TF-IDF feature columns.
tfidfmatrix.columns

## N-gram 

In [None]:
# Bigram counts get their own names: rebinding `count_vectorizer`,
# `sparce_matrix` or `tokens` here would silently replace the unigram
# word-count features that the classification cells read via `sparce_matrix`.
bigram_vectorizer = CountVectorizer(max_features = 100, stop_words = "english", ngram_range=(2, 2))
bigram_counts = bigram_vectorizer.fit_transform(description_list).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use the newer accessor
# when it exists and fall back to the old one on older installations.
_bigram_feature_names = getattr(bigram_vectorizer, "get_feature_names_out", None) or bigram_vectorizer.get_feature_names
bigram_tokens = list(_bigram_feature_names())
gram2 = pd.DataFrame(bigram_counts, columns=bigram_tokens)
gram2.head()

# Text Classification

## [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering)

Naive Bayes gives us a baseline prediction accuracy.

In [None]:

# Labels are taken from the first column of `data`; the feature tables are the
# word-count, TF-IDF and bi-gram matrices built earlier.
y = data.iloc[:, 0].values
x = sparce_matrix
tfidfx = tfidfmatrix

# All three feature sets are split with the same test size and random seed.
split_options = dict(test_size=0.3, random_state=2019)

x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)
tf_x_train, tf_x_test, tf_y_train, tf_y_test = train_test_split(tfidfmatrix, y, **split_options)
gm_x_train, gm_x_test, gm_y_train, gm_y_test = train_test_split(gram2, y, **split_options)

In [None]:
# Fit one Gaussian Naive Bayes estimator on each feature set in turn and print
# its accuracy on the matching test split.
nb = GaussianNB()
nb_feature_sets = [
    ('CountVectorizer Accuracy Score', x_train, y_train, x_test, y_test),
    ('TF-IDF Vectorizer Accuracy Score', tf_x_train, tf_y_train, tf_x_test, tf_y_test),
    ('bi-gram Vectorizer Accuracy Score', gm_x_train, gm_y_train, gm_x_test, gm_y_test),
]
for score_label, fit_x, fit_y, eval_x, eval_y in nb_feature_sets:
    nb.fit(fit_x, fit_y)
    print(score_label, nb.score(eval_x, eval_y))

In [None]:
# Multinomial Naive Bayes, refitted on each feature set before it is scored.
nb = MultinomialNB()
count_accuracy = nb.fit(x_train, y_train).score(x_test, y_test)
print('CountVectorizer Accuracy Score', count_accuracy)
tfidf_accuracy = nb.fit(tf_x_train, tf_y_train).score(tf_x_test, tf_y_test)
print('TF-IDF Vectorizer Accuracy Score', tfidf_accuracy)
bigram_accuracy = nb.fit(gm_x_train, gm_y_train).score(gm_x_test, gm_y_test)
print('bi-gram Vectorizer Accuracy Score', bigram_accuracy)

In [None]:
# Bernoulli Naive Bayes, refitted on each feature set before it is scored.
nb = BernoulliNB()
print('CountVectorizer Accuracy Score',
      nb.fit(x_train, y_train).score(x_test, y_test))
print('TF-IDF Vectorizer Accuracy Score',
      nb.fit(tf_x_train, tf_y_train).score(tf_x_test, tf_y_test))
print('bi-gram Vectorizer Accuracy Score',
      nb.fit(gm_x_train, gm_y_train).score(gm_x_test, gm_y_test))

## SVM

In [None]:
%%time
svmmodel = svm.SVC(kernel='linear', C = 1)
svmmodel.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',svmmodel.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',svmmodel.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',svmmodel.score(gm_x_test,gm_y_test))

In [None]:
# Fresh linear SVM trained on the TF-IDF features only.
svmmodel = svm.SVC(kernel='linear', C = 1).fit(tf_x_train, tf_y_train)
svm_tfidf_accuracy = svmmodel.score(tf_x_test, tf_y_test)
print('TF-IDF Vectorizer Accuracy Score', svm_tfidf_accuracy)

## LogisticRegression

In [None]:
%%time
logit = LogisticRegression(random_state=0, solver='lbfgs')
logit.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',logit.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',logit.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',logit.score(gm_x_test,gm_y_test))

## GBDT

In [None]:
%%time
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',clf.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',clf.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',clf.score(gm_x_test,gm_y_test))

# word2vec

In [None]:
# Tokenise every message for word2vec: keep letters only, lower-case, drop
# stop words, lemmatize. Each entry of `description_list` is a list of tokens.
lemma = WordNetLemmatizer()        # defined here so the cell does not rely on a leftover variable
stopword_set = set(stopword_list)  # set membership is O(1) per token
description_list = []
for article in data["Message"]:
    article = re.sub("[^a-zA-Z]", " ", article).lower()
    cutWords = [lemma.lemmatize(k) for k in word_tokenize(article) if k not in stopword_set]
    description_list.append(cutWords)
#description_list

In [None]:
def getVector_v2(cutWords, word2vec_model):
    """Average the vectors of the tokens in ``cutWords`` known to the model.

    Parameters
    ----------
    cutWords : iterable of str
        Tokens of one message.
    word2vec_model : gensim Word2Vec model, KeyedVectors, or mapping
        Anything supporting ``token in obj`` and ``obj[token]``, either
        directly or through a ``.wv`` attribute.

    Returns
    -------
    numpy.ndarray
        Component-wise mean of the vectors found; an empty array when none of
        the tokens is in the vocabulary.
    """
    # gensim >= 4 only supports lookups on the KeyedVectors stored at `.wv`;
    # objects without that attribute are used directly.
    vectors = getattr(word2vec_model, "wv", word2vec_model)
    vector_list = [vectors[k] for k in cutWords if k in vectors]
    vector_df = pd.DataFrame(vector_list)
    cutWord_vector = vector_df.mean(axis=0).values
    return cutWord_vector

# Train 100-dimensional embeddings for 10 epochs, ignoring tokens seen fewer
# than 20 times. gensim 4 renamed `size`/`iter` to `vector_size`/`epochs`, so
# try the current keyword names first and fall back to the gensim 3.x names.
try:
    word2vec_model = Word2Vec(description_list, vector_size=100, epochs=10, min_count=20)
except TypeError:
    word2vec_model = Word2Vec(description_list, size=100, iter=10, min_count=20)

In [None]:
# One averaged word vector per tokenised message.
vector_list = [getVector_v2(tokens_of_message, word2vec_model)
               for tokens_of_message in description_list]

In [None]:
# Stack the per-message vectors into a feature table (one row per entry of
# `vector_list`) and show its dimensions.
X = pd.DataFrame(vector_list)
X.shape

In [None]:
# Labels as a one-column DataFrame, followed by its dimensions.
Y = data[["Category"]]
Y.shape

In [None]:
# Replace missing vector components with the mean of their column.
X = X.fillna(X.mean())
# NOTE(review): this drops rows from Y only; if any label were missing, X and Y
# would end up with different numbers of rows — confirm the labels are complete.
Y = Y.dropna()

In [None]:
# Logistic regression on the averaged word2vec vectors.
# The split is seeded so the reported score is reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3, random_state=2019)
logistic_model = LogisticRegression()
# The labels are one-column DataFrames; flatten them to the 1-D shape sklearn expects.
logistic_model.fit(train_X, np.ravel(train_y))
y_predict = logistic_model.predict(test_X)

# Score against `test_y` from THIS split. `y_test` belongs to the earlier
# bag-of-words split, whose rows do not correspond to `test_X`.
print('word2vec LogisticRegression Accuracy Score',accuracy_score(np.ravel(test_y), y_predict))
pd.DataFrame(confusion_matrix(np.ravel(test_y),y_predict))

In [None]:
# Gradient-boosted trees on the averaged word2vec vectors.
clf = GradientBoostingClassifier(n_estimators=50)
# The labels are one-column DataFrames; flatten them to the 1-D shape sklearn expects.
gbdt = clf.fit(train_X, np.ravel(train_y))
y_predict = gbdt.predict(test_X)
# Score against `test_y`, the labels that belong to `test_X`. `y_test` comes
# from the earlier bag-of-words split and does not match these rows.
print('word2vec GBDT Accuracy Score',accuracy_score(np.ravel(test_y), y_predict))
pd.DataFrame(confusion_matrix(np.ravel(test_y),y_predict))

# Bert-TensorFlow

See this notebook: <https://www.kaggle.com/rikdifos/bert-test>