In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
#Reading in spam sms dataset into memory
spamPath = '/home/samsuri/DataSets/SmsSpam/spam.csv'
df = pd.read_csv(spamPath, encoding="latin", usecols=[0,1], names=['classification', 'text'], skiprows=1)
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,classification,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [8]:
myDictionary = set()
for w in words.words():
    myDictionary.add(w.lower())

punctList = []
cleanedTextCorp = []

for index, row in df.iterrows():
    newTextList = []
    origTextList = word_tokenize(row['text'])
    
    tempPunctList = [(lambda s: 1 if '!' in s else 0)(origTextList),
                    (lambda s: 1 if '?' in s else 0)(origTextList)]
        
    for possibleWord in origTextList:
        if possibleWord in myDictionary:
            newTextList.append(possibleWord)
    
    punctList.append(tempPunctList)
    cleanedTextCorp.append(' '.join(newTextList)  )

In [9]:
vec = CountVectorizer()
freqVec = vec.fit_transform(cleanedTextCorp)
cleanedDf = pd.DataFrame(freqVec.toarray(), columns=vec.get_feature_names())

npPunctList = np.array(punctList)
cleanedDf['!Bool'] = npPunctList[:,0]
cleanedDf['?Bool'] = npPunctList[:,0]

In [10]:
classification = df['classification']
encodingDict = {'spam': 0, 'ham': 1}
classification = classification.replace(encodingDict)
xTrain, xTest, yTrain, yTest = train_test_split(cleanedDf, classification, test_size=.3, random_state=123)

In [11]:
mnb = MultinomialNB()
mnb.fit(xTrain, yTrain)
accuracy_score(yTest, mnb.predict(xTest))

0.9748803827751196

In [12]:
lr = LogisticRegression()
lr.fit(xTrain, yTrain)
accuracy_score(yTest, lr.predict(xTest))



0.9760765550239234

In [2]:
#Reading in spam sms dataset into memory
spamPath = '/home/samsuri/DataSets/SmsSpam/spam.csv'
df = pd.read_csv(spamPath, encoding="latin", usecols=[0,1], names=['classification', 'text'], skiprows=1)
df['text'] = df['text'].str.lower()
df.head(100)

Unnamed: 0,classification,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
5,spam,freemsg hey there darling it's been 3 week's n...
6,ham,even my brother is not like to speak with me. ...
7,ham,as per your request 'melle melle (oru minnamin...
8,spam,winner!! as a valued network customer you have...
9,spam,had your mobile 11 months or more? u r entitle...


In [3]:
vec = CountVectorizer()
freqVec = vec.fit_transform(df['text'])
dfCount = pd.DataFrame(freqVec.toarray(), columns=vec.get_feature_names())

In [4]:
classification = df['classification']
encodingDict = {'spam': 0, 'ham': 1}
classification.replace(encodingDict, inplace=True)
xTrain, xTest, yTrain, yTest = train_test_split(dfCount, classification, test_size=.3, random_state=123)

In [5]:
mnb = MultinomialNB()
mnb.fit(xTrain, yTrain)
accuracy_score(yTest, mnb.predict(xTest))

0.9814593301435407

In [6]:
lr = LogisticRegression()
lr.fit(xTrain, yTrain)
accuracy_score(yTest, lr.predict(xTest))



0.9826555023923444