In [14]:
from numpy import loadtxt
import numpy as np
import matplotlib.pyplot as plt 
import re
import pandas as pd
import nltk
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import average_precision_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Preprocessing 

In [19]:
def preprocess(text):
    text = text.strip()
    text = text.replace(r'\d+','')
    text = re.sub(r'\d+','', text)
    return text

## Computing Precision, Recall, F1 score

In [20]:
def getPrecision(val_tup):
    tn, fp, fn, tp  = val_tup
    return tp/(tp+fp)

def getRecall(val_tup):
    tn, fp, fn, tp  = val_tup
    return tp/(tp+fn)

def getAccuracy(val_tup):
    tn, fp, fn, tp  = val_tup
    return (tp+tn)/(tp+tn+fp+fn)

def getF1Score(val_tup):
    pre = getPrecision(val_tup)
    rec = getRecall(val_tup)
    return 2*(pre*rec)/(pre+rec)

## Vectorization

In [17]:
def vectorization(data):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data)
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    retX = pd.DataFrame(denselist, columns=feature_names)
    return retX

## Split into Train and Test data

In [22]:
file = open(r'SMSSpamCollection', 'r')
df = pd.read_csv(r'SMSSpamCollection',sep="\t",header=None)
myDict = {'ham' : 0, 'spam' : 1}
for x in range(df[0].size):
    df[0][x] = myDict[df[0][x]]
    df[1][x] = preprocess(df[1][x])

In [23]:
text = vectorization(df[1])
cat = df[0]
cat = cat.astype('int')

In [24]:
train_text, test_text, train_cat, test_cat = train_test_split(text, cat, random_state = 2000)

## Logistic Regression 

In [25]:
LRmodel = LogisticRegression()
LRmodel.fit(train_text, train_cat)

LRoutput = LRmodel.predict(test_text)

LRmatrix = confusion_matrix(LRoutput, test_cat.values).ravel()

LRprecision = getPrecision(LRmatrix)
LRrecall = getRecall(LRmatrix)
LRaccuracy = getAccuracy(LRmatrix)
LRf1score = getF1Score(LRmatrix)

## SVM

In [26]:
SVM = svm.SVC()
SVM.fit(train_text, train_cat)

SVMoutput = SVM.predict(test_text)

SVMmatrix = confusion_matrix(SVMoutput, test_cat.values).ravel()

SVMprecision = getPrecision(SVMmatrix)
SVMrecall = getRecall(SVMmatrix)
SVMaccuracy = getAccuracy(SVMmatrix)
SVMf1score = getF1Score(SVMmatrix)

## KNN

In [27]:
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(train_text, train_cat)

KNNoutput = KNN.predict(test_text)

KNNmatrix = confusion_matrix(KNNoutput, test_cat.values).ravel()

KNNprecision = getPrecision(KNNmatrix)
KNNrecall = getRecall(KNNmatrix)
KNNaccuracy = getAccuracy(KNNmatrix)
KNNf1score = getF1Score(KNNmatrix)

## Neural Network

In [28]:
NeuralNetwork = Sequential()
NeuralNetwork.add(Dense(12, input_dim=7822, activation='relu'))
NeuralNetwork.add(Dense(8, activation='relu'))
NeuralNetwork.add(Dense(1, activation='sigmoid'))

NeuralNetwork.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

NeuralNetwork.fit(train_text, train_cat, epochs=20, batch_size=10)

NNoutput = NeuralNetwork.predict_classes(test_text)

NNmatrix = confusion_matrix(NNoutput, test_cat.values).ravel()

NNprecision = getPrecision(NNmatrix)
NNrecall = getRecall(NNmatrix)
NNaccuracy = getAccuracy(NNmatrix)
NNf1score = getF1Score(NNmatrix)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Output 

In [31]:
print('Type                  Precision           Recall                Accuracy             F1 Score')
print('-------------------------------------------------------------------------------------------------------')
print('Logistic Regression',LRprecision,LRrecall,LRaccuracy,LRf1score, sep='   ')
print('SVM                ',SVMprecision,SVMrecall,SVMaccuracy,SVMf1score, sep='   ')
print('KNN                ',KNNprecision,KNNrecall,'           ',KNNaccuracy,KNNf1score, sep='   ')
print('Neural Network     ',NNprecision,NNrecall,NNaccuracy,NNf1score, sep='   ')

Type                  Precision           Recall                Accuracy             F1 Score
-------------------------------------------------------------------------------------------------------
Logistic Regression   0.7631578947368421   0.9797297297297297   0.9655419956927495   0.8579881656804733
SVM                   0.8578947368421053   0.9878787878787879   0.9791816223977028   0.9183098591549295
KNN                   0.35789473684210527   1.0                 0.9124192390524049   0.5271317829457365
Neural Network        0.9263157894736842   0.9617486338797814   0.9849246231155779   0.9436997319034852
