In [1]:
import pandas as pd
import numpy as np
import spacy as sp
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [2]:
df=pd.read_csv('spamdata.csv')

In [3]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
print(4825/5572, 747/5572)

0.8659368269921034 0.13406317300789664


In [6]:
#text cleaning
lem=WordNetLemmatizer()

    

In [7]:
k_new="Economic Times reports on the two MNCs' plans to layoff workforce. According to one ET report, Infosys will remove 10 per cent of the workforce, which comes to around 2,200 people, in Job Level 6 (JL6) i.e senior managers."

In [8]:
ls=[x for x in k_new if x not in string.punctuation]

In [9]:
ls="".join(ls)

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
ls

'Economic Times reports on the two MNCs plans to layoff workforce According to one ET report Infosys will remove 10 per cent of the workforce which comes to around 2200 people in Job Level 6 JL6 ie senior managers'

In [11]:
print(len(ls.split()))
len(word_tokenize(ls))

39


39

In [12]:
sent_tokenize(k_new)

["Economic Times reports on the two MNCs' plans to layoff workforce.",
 'According to one ET report, Infosys will remove 10 per cent of the workforce, which comes to around 2,200 people, in Job Level 6 (JL6) i.e senior managers.']

In [13]:
#text cleaning
lem=WordNetLemmatizer()
def _clean(spam_text):
    spam_text=spam_text.lower()
    stopwords_list=stopwords.words('english')
    cleaned=[cleaned for cleaned in spam_text if cleaned not in string.punctuation]
    cleaned="".join(cleaned)
    cleaned=cleaned.split()
    cleaned=[x for x in cleaned if x not in stopwords_list]
    cleaned = [lem.lemmatize(word, "v") for word in cleaned]
    cleaned = [lem.lemmatize(word, "n") for word in cleaned]
    cleaned=" ".join(cleaned)
    return cleaned

In [14]:
df['cleaned']=df.text.apply(lambda x:_clean(x))

FEATURE ENGINEERING 

In [15]:
#time to create new feature .... first of all meta features.

df['word_count']=df.text.apply(lambda x: len(x.split()))
df['word_count_clean']=df.cleaned.apply(lambda x: len(x.split()))
df['char_count']=df.text.apply(lambda x: len(x))
df['char_count_without_spaces']=df.text.apply(lambda x:len(x.replace(" ","")))


In [16]:
df['no_of_digis']=df.text.apply(lambda x: sum([1 if w.isdigit() else 0 for w in x.split()]))

In [17]:
pos_dic = {"noun" : ["NNP", "NN", "NNS", "NNPS"], "verb" : ["VBZ", "VB", "VBD","VBG", "VBN"]}
import nltk
def pos_check(txt, family):
    tags = nltk.pos_tag(nltk.word_tokenize(txt))
    count = 0
    for tag in tags:
        tag = tag[1]
        if tag in pos_dic[family]:
            count += 1 
    return count

# pos_check("They are playing in the ground", "verb")

df["noun_count"] = df["text"].apply(lambda x : pos_check(x, "noun"))
df["verb_count"] = df["text"].apply(lambda x : pos_check(x, "verb"))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [19]:
word_tfidf =TfidfVectorizer(max_features=500)
word_tfidf.fit(df["cleaned"].values)
word_vectors_tfidf = word_tfidf.transform(df["cleaned"].values)

In [20]:
tfidf = dict(zip(word_tfidf.get_feature_names(), word_tfidf.idf_))
tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf, orient="index")
tfidf_idf.columns=["word_tfidf"]
tfidf_idf

Unnamed: 0,word_tfidf
10,6.629957
100,5.936809
1000,5.754488
150,5.888019
150p,6.406813
...,...
yo,6.099328
youre,5.655397
yr,6.581166
yup,5.841499


In [21]:
from scipy.sparse import hstack, csr_matrix

In [22]:
meta_features = ['word_count', 'word_count_clean',
       'char_count', 'char_count_without_spaces', 'no_of_digis', 'noun_count',
       'verb_count']

feature_set1 = df[meta_features]

train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], "csr")
train

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 61715 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.preprocessing import LabelEncoder 

target = df["label"].values
target = LabelEncoder().fit_transform(target)

In [24]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train, target)

print(train_x.shape, test_x.shape)

(4179, 507) (1393, 507)


In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
model_dtc=DecisionTreeClassifier()
model_dtc.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [27]:
model_dtc.score(test_x, test_y)

0.9583632447954056

In [28]:
from sklearn.svm import SVC

In [29]:
svc=SVC()
svc.fit(train_x, train_y)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [30]:
svc.score(test_x, test_y)

0.9432878679109835

In [31]:
from xgboost import XGBClassifier

In [32]:
xgb=XGBClassifier()
xgb.fit(train_x, train_y)
print(xgb.score(train_x, train_y), xgb.score(test_x, test_y))

0.9818138310600623 0.9720028715003589
