In [1]:
import pandas as pd

##### Dataset Import 

In [2]:
data = 'spam.tsv'
dataset=pd.read_csv(data,sep='\t',header=None)
dataset


Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


# Exploratory Data Analysis (EDA)

#### The better your domain knowledge on the data, the better your ability to engineer more features from it. Feature engineering is a very large part of spam detection in general.

In [3]:
dataset.columns=["label","message"]
dataset.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


#### Data Cleaning and text preprocessing

In [4]:
import re
import nltk

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sparsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')

In [7]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()


##### Tokenization-( process of converting the normal text strings in to a list of tokens(also known as lemmas)).


In [9]:
corpus=[]
for i in range(0,len(dataset)):
    review=re.sub('[^a-zA-Z0-9]',' ',dataset['message'][i])
    review=review.lower()
    review=review.split() #tokenise
    review=[ps.stem(word) for word in review if not word in stopword_list]
    review=' '.join(review)
    corpus.append(review)


In [10]:
corpus[:3]


['search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'nah think goe usf live around though']

#### Independent features and dependent


In [11]:
y=pd.get_dummies(dataset['label'],drop_first=True)

In [12]:
y

Unnamed: 0,spam
0,False
1,True
2,False
3,False
4,False
...,...
5562,True
5563,False
5564,False
5565,False


#### Train test split

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.20)

In [14]:
X_train[:3]

['bugi oso near wat', 'lol boo hope laugh', 'gd got free ice cream oso wan']

In [15]:
dataset['label'].value_counts()

label
ham     4821
spam     746
Name: count, dtype: int64

#### Now we need to convert each of those messages into a vector the SciKit Learn's algorithm models can work with and machine learning model which we will going to use can understand

## Creating the Bag of Words (BOW)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))
X_train=cv.fit_transform(X_train).toarray()


In [17]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
X_test=cv.transform(X_test).toarray()
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
X_train.shape

(4453, 2500)

In [20]:
y_train.shape

(4453, 1)

In [21]:
cv.vocabulary_ #a dictionary with the mapping of the word index

{'bugi': 396,
 'oso': 1581,
 'near': 1474,
 'wat': 2365,
 'lol': 1279,
 'boo': 361,
 'hope': 1072,
 'laugh': 1218,
 'gd': 894,
 'got': 960,
 'free': 846,
 'ice': 1106,
 'cream': 592,
 'wan': 2346,
 'like': 1253,
 'tell': 2095,
 'fantasi': 787,
 'call': 409,
 '60p': 136,
 'min': 1391,
 'stop': 2024,
 'text': 2106,
 '08712460324': 23,
 'rate': 1734,
 'fantasi call': 788,
 '60p min': 137,
 'min stop': 1394,
 'stop text': 2030,
 'text call': 2108,
 'call 08712460324': 412,
 'get': 898,
 'unless': 2249,
 'guy': 1000,
 'want': 2353,
 'come': 535,
 'time': 2148,
 'want come': 2354,
 'ugh': 2240,
 'class': 510,
 'danc': 625,
 'sure': 2064,
 'result': 1793,
 'offer': 1539,
 'let': 1241,
 'know': 1190,
 'contact': 558,
 'settl': 1895,
 'room': 1812,
 'ok': 1550,
 'let know': 1242,
 'gud': 992,
 'ni8': 1496,
 'dear': 641,
 'well': 2395,
 'take': 2076,
 'care': 453,
 'swt': 2073,
 'dream': 698,
 'gud ni8': 995,
 'well take': 2397,
 'take care': 2077,
 'way': 2375,
 'sell': 1871,
 'sinc': 1925,
 'b

#### With messages represented as vectors, we can finally train our spam/ham classifier. Now we can actually use almost any sort of classification algorithms. For a variety of reasons, the Naive Bayes classifier algorithm is a good choice.

## Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier().fit(X_train,y_train)


  return fit_method(estimator, *args, **kwargs)


In [23]:
y_pred=classifier.predict(X_test)

In [24]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.981149012567325

In [25]:
confusion_matrix(y_test,y_pred)

array([[950,   0],
       [ 21, 143]])

In [26]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       950
        True       1.00      0.87      0.93       164

    accuracy                           0.98      1114
   macro avg       0.99      0.94      0.96      1114
weighted avg       0.98      0.98      0.98      1114



### Spam Classification Application

In [30]:
msg = input("Enter Message: ")
msgInput = cv.transform([msg])
predict = classifier.predict(msgInput)
if(predict[0]==0):
    print("------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------")
else:
    print("---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------")
##  Enter Message: Thanks for your subscription to Ringtone UK your mobile will be

------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------


### Naives Bayes Classifier

In [31]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [32]:
y_pred=clf.predict(X_test)

In [33]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.8563734290843806

In [34]:
confusion_matrix(y_test,y_pred)

array([[800, 150],
       [ 10, 154]])

In [35]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

       False       0.99      0.84      0.91       950
        True       0.51      0.94      0.66       164

    accuracy                           0.86      1114
   macro avg       0.75      0.89      0.78      1114
weighted avg       0.92      0.86      0.87      1114



In [36]:
msg = input("Enter Message: ")
msgInput = cv.transform([msg])
msgInput = msgInput.toarray()
predict = clf.predict(msgInput)
if(predict[0]==0):
    print("------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------------")
else:
    print("---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------")

---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------
