In [1]:
import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Load Data

In [2]:
df = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Text Preprocessing (Using Stemming)

In [4]:
def stemming(df):
    corpus = []
    ps = PorterStemmer()
    for i in range(len(df)):
        # remove all except letters
        sentence = re.sub('^a-zA-Z', ' ', df['message'][i])
        # make all letters lowercase
        sentence = sentence.lower()
        # split sentence into list of words
        words = sentence.split()
        # apply lemmatiztion to all words except stopwords
        words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
        # join the words to reform a sentence
        sentence = ' '.join(words)
        # append to list of lemmatized sentences
        corpus.append(sentence)
    return corpus

### Text Preprocessing (Using Lemmatization)

In [5]:
def lemmatization(df):
    corpus = []
    wnl = WordNetLemmatizer()
    for i in range(len(df)):
        # remove all except letters
        sentence = re.sub('^a-zA-Z', ' ', df['message'][i])
        # make all letters lowercase
        sentence = sentence.lower()
        # split sentence into list of words
        words = sentence.split()
        # apply stemming to all words except stopwords
        words = [wnl.lemmatize(word) for word in words if word not in stopwords.words('english')]
        # join the words to reform a sentence
        sentence = ' '.join(words)
        # append to list of stemmmed sentences
        corpus.append(sentence)
    return corpus

### CountVectorizer

In [6]:
def count_vectorizer(df, corpus):
    cv = CountVectorizer(max_features = 5000)
    X = cv.fit_transform(corpus).toarray()
    y = pd.get_dummies(df['label'])
    y = y.iloc[:, 1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    spam_classifier = MultinomialNB()
    spam_classifier.fit(X_train, y_train)
    y_preds = spam_classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_preds)
    accuracy = accuracy_score(y_test, y_preds)
    return cm, accuracy

### TFIDF

In [7]:
def tfidf_vectorizer(df, corpus):
    tfidf = TfidfVectorizer(max_features = 5000)
    X = tfidf.fit_transform(corpus).toarray()
    y = pd.get_dummies(df['label'])
    y = y.iloc[:, 1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    spam_classifier = MultinomialNB()
    spam_classifier.fit(X_train, y_train)
    y_preds = spam_classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_preds)
    accuracy = accuracy_score(y_test, y_preds)
    return cm, accuracy

#### Using Stemming & CountVectorizer

In [8]:
corpus = stemming(df)
matrix, acc = count_vectorizer(df, corpus)
print(matrix)
print(acc)

[[949   6]
 [  8 152]]
0.9874439461883409


#### Using Lemmatization & CountVectorizer

In [9]:
corpus = lemmatization(df)
matrix, acc = count_vectorizer(df, corpus)
print(matrix)
print(acc)

[[950   5]
 [  7 153]]
0.989237668161435


#### Using Stemming & TFIDF

In [10]:
corpus = stemming(df)
matrix, acc = tfidf_vectorizer(df, corpus)
print(matrix)
print(acc)

[[955   0]
 [ 22 138]]
0.9802690582959641


#### Using Lemmatization & TFIDF

In [11]:
corpus = lemmatization(df)
matrix, acc = tfidf_vectorizer(df, corpus)
print(matrix)
print(acc)

[[955   0]
 [ 22 138]]
0.9802690582959641
