In [16]:
#importing libraries
import pandas as pd
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import spacy

In [2]:
data = pd.read_csv('spamdata.csv')
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [5]:
nlp = English()

In [7]:
# Preprocessing data
def clean_text(text):
    cleaned = text.lower()
    
    ## remove punctuations
    punctuations = string.punctuation
    cleaned = "".join(character for character in cleaned if character not in punctuations)
    
    my_doc = nlp(cleaned)
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    
    filtered_sentence =[] 
    ## removing stopwords 
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word)
    
    cleaned = filtered_sentence
    cleaned = " ".join(cleaned)
    
    return cleaned

In [8]:
data["cleaned"] = data["text"].apply(lambda x : clean_text(x))

In [9]:
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


# Feature Engineering

In [10]:
# Number of words in text
data["word_count"] = data["text"].apply(lambda x : len(x.split()))

In [11]:
# Number of words in cleaned text
data["word_count_cleand"] = data["cleaned"].apply(lambda x : len(x.split()))

In [12]:
# Number of characters including spaces in the cleaned text
data["char_count"] = data["cleaned"].apply(lambda x : len(x))

In [13]:
# number of characters excluding spaces in the cleaned text
data["char_count_without_spaces"] = data["cleaned"].apply(lambda x : len(x.replace(" ","")))

In [14]:
# Number of digits in the cleaned text
data["num_dig"] = data["cleaned"].apply(lambda x :  sum([1 if w.isdigit() else 0 for w in x.split()]))

In [15]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


In [17]:
nlp = spacy.load("en_core_web_sm")

In [18]:
pos_dic = {"noun" : ["NNP", "NN", "NNS", "NNPS"], "verb" : ["VBZ", "VB", "VBD","VBG", "VBN"]}

In [19]:
# Function for noun and verb counts
def pos_check(txt, family):

    # spacy document
    txt = nlp(txt)
    
    all_tags = []

    # Get pos tag
    for w in txt:
        all_tags.append(w.tag_)
    
    count = 0

    # Count number of nouns and verbs
    for tag in all_tags:
        if tag in pos_dic[family]:
            count += 1

    return count

In [20]:
data["noun_count"] = data["cleaned"].apply(lambda x : pos_check(x, "noun"))
data["verb_count"] = data["cleaned"].apply(lambda x : pos_check(x, "verb"))

In [21]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,7,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,4,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,10,1
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,5,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,1,1


# Model Building

In [22]:
# Label encoding target variable
from sklearn.preprocessing import LabelEncoder 

target = data["label"].values
target = LabelEncoder().fit_transform(target)

In [34]:
train=data.drop(['label','text','cleaned'],axis=1)

In [35]:
# Train-Validation split
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target,test_size=0.2)

In [36]:
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4457, 7), (4457,)), ((1115, 7), (1115,)))

In [37]:
from sklearn import naive_bayes

In [38]:
model = naive_bayes.MultinomialNB()

In [39]:
model.fit(x_train, y_train)

MultinomialNB()

In [40]:
pred_train = model.predict(x_train)

pred_valid = model.predict(x_valid)

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
accuracy_score(y_train, pred_train)

0.9409916984518735

In [43]:
accuracy_score(y_valid, pred_valid)

0.9363228699551569

# TF-IDF Features

In [44]:
# Import Tf-Idf Vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
word_tfidf = TfidfVectorizer(max_features=250)

In [98]:
word_tfidf.fit(data["cleaned"].values)

TfidfVectorizer(max_features=250)

In [99]:
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [100]:
word_vectors_tfidf

<5572x250 sparse matrix of type '<class 'numpy.float64'>'
	with 16825 stored elements in Compressed Sparse Row format>

In [101]:
# Combining meta features and Tf-Idf features
from scipy.sparse import hstack, csr_matrix

# features
features = data.drop(['label','text','cleaned'],axis=1)

# Combined features
train = hstack([word_vectors_tfidf, csr_matrix(features)], "csr")

In [102]:
x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target,test_size=0.2)

In [103]:
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4457, 257), (4457,)), ((1115, 257), (1115,)))

In [104]:
model = naive_bayes.MultinomialNB()

In [105]:
model.fit(x_train, y_train)

MultinomialNB()

In [106]:
pred_train = model.predict(x_train)

pred_valid = model.predict(x_valid)

In [107]:
from sklearn.metrics import f1_score

In [108]:
score=f1_score(pred_train,y_train)

In [109]:
print("Training F1 score is ",score)

Training F1 score is  0.8743961352657004


In [110]:
score=f1_score(pred_valid,y_valid)

In [111]:
print("Validation F1 score is ",score)

Validation F1 score is  0.8698412698412697
