## Importing data 

In [1]:
import pandas as pd
import string
punctuation = string.punctuation
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [2]:
data = pd.read_csv("spamdata.csv")

In [3]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Here ham means, its not a spam

#### Now, we have to remove noise,  unnecessary redundent information present. Let's clean the data

In [4]:
def _clean(text):
    
    cleaned_text = text.lower()
    cleaned_text = "".join(c for c in cleaned_text if c not in punctuation)
    
    words = cleaned_text.split()
    words = [w for w in words if w not in stopwords]
    
    words = [lem.lemmatize(word, "v") for word in words]
    words = [lem.lemmatize(word, "n") for word in words]

    
    cleaned_text = " ".join(words)
    
    return cleaned_text

In [5]:
_clean("I will be playing a game today !!")

'play game today'

In [6]:
 data['cleaned'] = data['text'].apply(_clean)

In [7]:
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though


## Feature Engineering 

#### Meta features 

In [8]:
data['word_counts'] = data['text'].apply(lambda x : len(x.split()))
data['word_count_cleaned'] = data['cleaned'].apply(lambda x : len(x.split()))

data['char_counts'] = data['text'].apply(lambda x : len(x))
data['char_counts_without_spaces'] = data['text'].apply(lambda x : len(x.replace(" ", "")))

data['num_digits'] = data['text'].apply(lambda x : sum([1 if w.isdigit() else 0 for w in x.split(" ")]))

In [9]:
data.head()

Unnamed: 0,label,text,cleaned,word_counts,word_count_cleaned,char_counts,char_counts_without_spaces,num_digits
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0


#### POS features extraction

In [10]:
pos_dict = {"noun" : ["NNP", "NN", "NNS", "NNPS"], "verb": ["VB", "VBZ", "VBD", "VBN", "VBG"]}

In [11]:
import nltk 
def pos_count(text, family):
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    count = 0
    for tag in tags:
        tag = tag[1]
        if tag in pos_dict[family]:
            count +=1
    return count

In [12]:
pos_count("They are playing in the ground", "verb")

1

In [13]:
data["noun_count"] = data["text"].apply(lambda x : pos_count(x, "noun"))
data["verb_count"] = data["text"].apply(lambda x : pos_count(x, "verb"))

In [14]:
data.head()

Unnamed: 0,label,text,cleaned,word_counts,word_count_cleaned,char_counts,char_counts_without_spaces,num_digits,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0,4,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2,13,3
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0,3,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0,1,4


In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
cvz = CountVectorizer()
cvz.fit(data['cleaned'].values)
count_vectors = cvz.transform(data['cleaned'].values)

In [17]:
count_vectors

<5572x8207 sparse matrix of type '<class 'numpy.int64'>'
	with 46862 stored elements in Compressed Sparse Row format>

#### Calculating word count, ngram cound, character count and their respective TFIDF score

In [27]:
#word level
word_tfidf = TfidfVectorizer(max_features = 500)
word_tfidf.fit(data["cleaned"].values)
word_vectors_tfidf = word_tfidf.transform(data['cleaned'].values)

In [29]:
#ngram level
ngram_tfidf = TfidfVectorizer(max_features = 500, ngram_range = (1, 2))
ngram_tfidf.fit(data["cleaned"].values)
ngram_vectors_tfidf = ngram_tfidf.transform(data['cleaned'].values)

In [31]:
#char level
char_tfidf = TfidfVectorizer(max_features = 500, analyzer = "char")
char_tfidf.fit(data["cleaned"].values)
char_vectors_tfidf = char_tfidf.transform(data['cleaned'].values)

In [33]:
tfidf = dict(zip(word_tfidf.get_feature_names(), word_tfidf.idf_))
tfidf_idf = pd.DataFrame(columns = ["word_tfidf"]).from_dict(tfidf, orient = "index")
tfidf_idf.columns=["word_tfidf"]
tfidf_idf

Unnamed: 0,word_tfidf
10,6.629957
100,5.936809
1000,5.754488
150,5.888019
150p,6.406813
150ppm,6.070341
16,5.693863
18,5.841499
1st,6.014771
2000,5.988103


### Combining features 

In [34]:
from scipy.sparse import hstack, csr_matrix

data.columns

Index(['label', 'text', 'cleaned', 'word_counts', 'word_count_cleaned',
       'char_counts', 'char_counts_without_spaces', 'num_digits', 'noun_count',
       'verb_count'],
      dtype='object')

In [35]:
meta_features = ['word_counts', 'word_count_cleaned',
       'char_counts', 'char_counts_without_spaces', 'num_digits', 'noun_count',
       'verb_count']

feature_set1 = data[meta_features]

train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], "csr")

train

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 61715 stored elements in Compressed Sparse Row format>

### Converting categorical data into label encoded one. 

In [36]:
from sklearn.preprocessing import LabelEncoder

target = data["label"].values
target = LabelEncoder().fit_transform(target)

In [37]:
target


array([0, 0, 1, ..., 0, 0, 0])

#### dividing into training and validation set 

In [38]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(train, target)

In [39]:
train.shape

(5572, 507)

In [40]:
val_x.shape

(1393, 507)

In [41]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import accuracy_score

### Naive Bayes 

In [42]:
model = naive_bayes.MultinomialNB()
model.fit(train_x, train_y)
pred = model.predict(val_x)
accuracy_score(pred, val_y)

0.9727207465900933

### Logistic Regression 

In [43]:
model = LogisticRegression()
model.fit(train_x, train_y)
pred = model.predict(val_x)
accuracy_score(pred, val_y)



0.9763101220387652

### SVM 

In [44]:
model = svm.SVC()
model.fit(train_x, train_y)
pred = model.predict(val_x)
accuracy_score(pred, val_y)



0.9346733668341709

### Ensemble modeling 

In [45]:
model = ensemble.ExtraTreesClassifier()
model.fit(train_x, train_y)
pred = model.predict(val_x)
accuracy_score(pred, val_y)



0.9806173725771715