In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [2]:
#latin - 1 is the universal encoding of the text
data = pd.read_csv("spamdata.csv",encoding="latin-1")
data.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
#Feature Engineering - Data Preprocesing or Text Cleaning
def clean_text(text):
    # convert to lower case
    cleaned_text = text.lower()
    # remove punctuations
    cleaned_text = "".join(c for c in cleaned_text if c not in string.punctuation)
    #remove stopwords
    words = [word for word in cleaned_text.split() if word not in stopwords.words("english")]
    #lemmatization - context in which the word used retained
    # need to specify POS tag for lemmatization not for stemming
    words = [lem.lemmatize(word,"v") for word in words]
    words = [lem.lemmatize(word,"n") for word in words]
    #join the cleaned words
    cleaned_text = " ".join(words)
    return cleaned_text

#### Test the function 

In [4]:
clean_text("I will be playing a game today !!!")

'play game today'

In [5]:
data['cleaned'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though


#### Preprocessed data - cleaned.

It can be further extended to correction of spellings, removal of numbers in the text etc.

#### Feature Engineering

In [6]:
## Meta Features - counts / attributes associated with text data
data["word_count"] = data["text"].apply(lambda x : len(x.split()))
data["word_count_cleaned"] = data["cleaned"].apply(lambda x : len(x.split()))

## character count
data["char_count"] = data["text"].apply(lambda x: len(x))
data["char_count_without_spaces"] = data["text"].apply(lambda x: len(x.replace(" ","")))

## sum of number of digits present in each text
data["num_digit"] = data["text"].apply(lambda x : sum([1 if w.isdigit() else 0 for w in x.split()]))

In [7]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_digit
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0


#meta faetures can be used for any ML inputs

### POS tags

In [8]:
#count number of nouns or verbs present in the data. It can also be extended for pronous,Adjective or adverb
pos_dic = {"noun":["NNP","NN","NNS","NNPS"],"verb":["VBZ","VB","VBD","VBN","VBG","VBP"]}
def pos_check(text,family):
    #annotate text by pos
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    count = 0
    #print("Tags ")
    #print("="*10)
    for tag in tags:
        tag = tag[1]
        #print(tag)
        if tag in pos_dic[family]:
            count +=1
    return count

In [9]:
pos_check("They are playing in the ground","noun")

1

In [10]:
pos_check("They are playing in the ground","verb")

2

In [11]:
data["pos_noun_count"] = data["text"].apply(lambda x : pos_check(x,"noun"))
data["pos_verb_count"] = data["text"].apply(lambda x : pos_check(x,"verb"))

In [12]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_digit,pos_noun_count,pos_verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0,4,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2,13,4
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0,3,3
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0,1,5


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 10 columns):
label                        5572 non-null object
text                         5572 non-null object
cleaned                      5572 non-null object
word_count                   5572 non-null int64
word_count_cleaned           5572 non-null int64
char_count                   5572 non-null int64
char_count_without_spaces    5572 non-null int64
num_digit                    5572 non-null int64
pos_noun_count               5572 non-null int64
pos_verb_count               5572 non-null int64
dtypes: int64(7), object(3)
memory usage: 435.4+ KB


#### Advanced Feature Engineering

In [14]:
#count as a feature -count of words in a document or corpus
#count features
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cvz = CountVectorizer()
cvz.fit(data["cleaned"].values)#finf corresponding counts
count_vectors = cvz.transform(data["cleaned"].values)

In [15]:
count_vectors

<5572x8206 sparse matrix of type '<class 'numpy.int64'>'
	with 46827 stored elements in Compressed Sparse Row format>

In [33]:
word_tfidf = TfidfVectorizer(max_features=500)
word_tfidf.fit(data["cleaned"].values)#finf corresponding counts
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [34]:
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 28313 stored elements in Compressed Sparse Row format>

In [38]:
#ngram level tfidf_features
#instaed of stacking word tfidf we can stack ngram tfidf
#represent max features=500 and bigram
ngram_tfidf = TfidfVectorizer(max_features=500, ngram_range=(1,2))
ngram_tfidf.fit(data["cleaned"].values)#finf corresponding counts
ngram_vectors_tfidf = ngram_tfidf.transform(data["cleaned"].values)

In [39]:
ngram_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 28613 stored elements in Compressed Sparse Row format>

#### another variation character level tfidf-analyzer = "char"

In [40]:
char_tfidf = TfidfVectorizer(max_features=500, analyzer="char")
char_tfidf.fit(data["cleaned"].values)#finf corresponding counts
char_vectors_tfidf = char_tfidf.transform(data["cleaned"].values)

In [41]:
char_vectors_tfidf

<5572x65 sparse matrix of type '<class 'numpy.float64'>'
	with 97087 stored elements in Compressed Sparse Row format>

#### 

In [21]:
#zip essentail keyword features / words with their corresponding tfidf counts
tfidf = dict(zip(word_tfidf.get_feature_names(),word_tfidf.idf_))
tf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf,orient="index")

In [22]:
tf_idf.columns=["word_tfidf"]
tf_idf

Unnamed: 0,word_tfidf
008704050406,8.527076
0089my,8.932542
0121,8.932542
01223585236,8.932542
01223585334,8.527076
0125698789,8.932542
02,8.932542
020603,8.016251
0207,8.527076
02070836089,8.932542


In [23]:
#features - frequency of words,nlp based features like pos/metafeatures - count of words, count of words etc.
#but to use them we have to create a matrix and then apply respective ML models for classifictaion

#### Combining Features

In [24]:
data.columns

Index(['label', 'text', 'cleaned', 'word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_digit',
       'pos_noun_count', 'pos_verb_count'],
      dtype='object')

In [47]:
#create a sparse matrix horizontal placement
from scipy.sparse import hstack,csr_matrix

meta_features = ['word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_digit',
       'pos_noun_count', 'pos_verb_count']

#data corresponding to meta features contained in feature_set 1
feature_set1 = data[meta_features]

In [48]:
#convert feature_set into a matrix
#also join idf values
train = hstack([word_vectors_tfidf,csr_matrix(feature_set1)],"csr")

In [49]:
train.shape

(5572, 507)

In [42]:
train

<5572x8213 sparse matrix of type '<class 'numpy.float64'>'
	with 80609 stored elements in Compressed Sparse Row format>

#### ML Classifier

In [44]:
#classify if a give text is a spam or not
#label encode target variable as they are spam and ham now
from sklearn.preprocessing import LabelEncoder
target = data["label"].values
target = LabelEncoder().fit_transform(target)


In [46]:
target

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [52]:
#split the data into training and validation set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train,target)

In [54]:
X_train.shape,y_train.shape

((4179, 507), (4179,))

In [55]:
X_test.shape,y_test.shape

((1393, 507), (1393,))

#### Classifiers

In [57]:
import numpy as np
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import accuracy_score

In [62]:
model = naive_bayes.MultinomialNB()
model.fit(X_train,y_train)
preds_nb = model.predict(X_test)
acc_nb = accuracy_score(y_test,preds_nb)
print("Naive Bayes Accuracy Score : ",acc_nb*100)

Naive Bayes Accuracy Score :  97.48743718592965


In [66]:
model = LogisticRegression()
model.fit(X_train,y_train)
preds_LR = model.predict(X_test)
acc_LR = accuracy_score(y_test,preds_LR)
print("Logistic Regression Accuracy Score : ",acc_LR*100)

Logistic Regression Accuracy Score :  97.5592246949031


In [67]:
model = svm.SVC()
model.fit(X_train,y_train)
preds_svm = model.predict(X_test)
acc_svm = accuracy_score(y_test,preds_svm)
print("SVM Accuracy Score : ",acc_svm*100)

SVM Accuracy Score :  93.3237616654702


#### SVM models require large amount of data to perform better

In [70]:
model = ensemble.ExtraTreesClassifier()#bagging
model.fit(X_train,y_train)
preds_bag = model.predict(X_test)
acc_bag = accuracy_score(y_test,preds_bag)
print("ExtraTreesClassifier Accuracy Score : ",acc_bag*100)

ExtraTreesClassifier Accuracy Score :  97.91816223977028


#### Here we do not need to go for complex models .Simple models shaoe good results.

In [72]:
model = ensemble.RandomForestClassifier()#bagging
model.fit(X_train,y_train)
preds_rf = model.predict(X_test)
acc_rf = accuracy_score(y_test,preds_rf)
print("Random Forest Classifier Accuracy Score : ",acc_bag*100)

Random Forest Classifier Accuracy Score :  97.91816223977028


#### SPAM CLASSIFICATION USING DEEP LEARNING

In [None]:
#represent text in word emebeddding format
#use pretrained emebddings
embeddings_index = ()
#word vectors for 2 million words
for i , line in enumerate(open("pretrained.vec",encoding="utf8")):
    if i ==0:
        continue
    value = line.split()
    print(line)
    embeddings_index[value[0]] = values[1:]
    

In [74]:
#convert text into word embeddings
from keras.preprocessing import text,sequence

In [75]:
token = text.Tokenizer()
token.fit_on_texts(data["text"])
word_index = token.word_index

In [76]:
word_index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'me': 10,
 'my': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'call': 16,
 'have': 17,
 'on': 18,
 '2': 19,
 'that': 20,
 'now': 21,
 'are': 22,
 'so': 23,
 'but': 24,
 'not': 25,
 'or': 26,
 'do': 27,
 'can': 28,
 'at': 29,
 "i'm": 30,
 'get': 31,
 'be': 32,
 'will': 33,
 'if': 34,
 'ur': 35,
 'with': 36,
 'just': 37,
 'no': 38,
 'we': 39,
 'this': 40,
 'gt': 41,
 '4': 42,
 'lt': 43,
 'up': 44,
 'when': 45,
 'ok': 46,
 'free': 47,
 'from': 48,
 'how': 49,
 'go': 50,
 'all': 51,
 'out': 52,
 'what': 53,
 'know': 54,
 'like': 55,
 'good': 56,
 'then': 57,
 'got': 58,
 'was': 59,
 'come': 60,
 'its': 61,
 'am': 62,
 'time': 63,
 'only': 64,
 'day': 65,
 'love': 66,
 'there': 67,
 'send': 68,
 'he': 69,
 'want': 70,
 'text': 71,
 'as': 72,
 'txt': 73,
 'one': 74,
 'going': 75,
 'by': 76,
 'home': 77,
 "i'll": 78,
 'need': 79,
 'about': 80,
 'r': 81,
 'lor': 82,
 'sorry': 83,
 'stop': 84,
 'st

In [77]:
X_train.shape

(4179, 507)

In [79]:
#convert the text into sequence of tokens
#also pad them in order to convert to equal vector length

X_train,X_test,y_train,y_test = train_test_split(data["text"],target)
trainx = sequence.pad_sequences(token.texts_to_sequences(X_train),maxlen=70)
valx = sequence.pad_sequences(token.texts_to_sequences(X_test),maxlen=70)

In [86]:
#generate sequence of tokens as iput and them to length 70
#create an embedding matrix - embeding vectors for keywords present in every text
#maxlengh is 300 in the pretrained model provided
#no of rows as documents, no of columns as no of words
#individivaul word emebedding where
#every row corresponds to the word and column represents vector notation
embedding_matrix = np.zeros((len(word_index)+1,300))

#to fill this emdedding matrix we need to iterate word by word
for word,i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if emdedding_vector is not None:
        embedding_matrix[i] = embedding_vector

NameError: name 'embedding_index' is not defined

In [None]:
embedding_matrix

#### Train model

In [87]:
#in this case we will write a function to train the model
def train_model(classifier,feature_vector_train,label,feature_vector_val,valid_y):
    classifier.fit(feature_vector_train,label)
    predictions = classifier.predict(feature_vector_val)
    predictions = predictions.argmax(axis=-1)
    return accurcay_score(predictions,valid_y)
    

In [88]:
from keras import layers,models,optimizers
def create_cnn():
    #input_size = 70
    input_layer = layers.input((70,))
    embedded_layer = layers.Embedding(len(word_index)+1,300,weights=[embedding_matrix],trainable=False)(input_layer)
    conv_layer = layers.Convolution1D(100,3,activation="relu")(embedding_layer)
    pooling_layer =  layers.GlobalMaxpool1D(conv_layer)
    output_layer = layers.Dense(50,activation="relu")(pooling_layer)
    output_layer = layers.Dropout(0.25)(pooling_layer)
    output_layer = layers.Dense(1,activation="sigmoid")(output_layer)
    model = models.Model(Inputs - Input_layer,outputs = output_layer)
    model.compile(optimizer=optimizers.Adam(),loss="binary_crossentropy")
    return model

In [None]:
classifier = create_cnn()
train_model(classifier,trainx,trainy,valx,valy)

In [None]:
#more convolutional layers can be added in order to improve performance.
#Dropouts can be experimented with different values