## Spam detection

1. Data Preparation
2. Building word dictionary
3. Feature extraction
4. Training classifiers
5. Testing 
6. Performance evaluation using multiple metrics


Data Preparation

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
sms = pd.read_csv('messages.csv',encoding='latin-1')
sms.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [4]:
sms.drop('subject',axis=1,inplace=True)

In [5]:
sms.shape

(2893, 2)

In [6]:
sms['label'].value_counts()

0    2412
1     481
Name: label, dtype: int64

In [7]:
#Ratio
print ('spam ratio = ', round(len(sms[sms['label']== 1]) / len(sms.label),2)*100,'%')
print ('ham ratio  = ', round(len(sms[sms['label']== 0]) / len(sms.label),2)*100,'%')

spam ratio =  17.0 %
ham ratio  =  83.0 %


In [8]:
#New column for Length of message
sms['length'] = sms.message.str.len()
sms.head(2)

Unnamed: 0,message,label,length
0,content - length : 3386 apple-iss research cen...,0,2856
1,"lang classification grimes , joseph e . and ba...",0,1800


In [9]:
#Label coding 0 and 1
sms['label'].replace({0 :'ham', 1:'spam'},inplace=True)
sms.head(5)

Unnamed: 0,message,label,length
0,content - length : 3386 apple-iss research cen...,ham,2856
1,"lang classification grimes , joseph e . and ba...",ham,1800
2,i am posting this inquiry for sergei atamas ( ...,ham,1435
3,a colleague and i are researching the differin...,ham,324
4,earlier this morning i was on the phone with a...,ham,1046


In [10]:
# Convert all messages to lower case
sms['message'] = sms['message'].str.lower()

In [11]:
# Replace email addresses with 'email'
sms['message'] = sms['message'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
sms['message'] = sms['message'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
sms['message'] = sms['message'].str.replace(r'£|\$', 'dollers')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
sms['message'] = sms['message'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')

    
# Replace numbers with 'numbr'
sms['message'] = sms['message'].str.replace(r'\d+(\.\d+)?', 'numbr')

In [12]:
# Remove punctuation
sms['message'] = sms['message'].str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
sms['message'] = sms['message'].str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
sms['message'] = sms['message'].str.replace(r'^\s+|\s+?$', '')

In [13]:
# Remove stopwords
import string
import nltk
from nltk.corpus import  stopwords

stop_words = set(stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])

sms['message'] = sms['message'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [14]:
sms.head()

Unnamed: 0,message,label,length
0,content length numbr apple iss research center...,ham,2856
1,lang classification grimes joseph e barbara f ...,ham,1800
2,posting inquiry sergei atamas satamas umabnet ...,ham,1435
3,colleague researching differing degrees risk p...,ham,324
4,earlier morning phone friend mine living south...,ham,1046


In [15]:
# New column (clean_length) after puncuations,stopwords removal
sms['clean_length'] = sms.message.str.len()
sms.head()

Unnamed: 0,message,label,length,clean_length
0,content length numbr apple iss research center...,ham,2856,2179
1,lang classification grimes joseph e barbara f ...,ham,1800,1454
2,posting inquiry sergei atamas satamas umabnet ...,ham,1435,1064
3,colleague researching differing degrees risk p...,ham,324,210
4,earlier morning phone friend mine living south...,ham,1046,629


In [16]:
from sklearn.model_selection import train_test_split

x = sms.message
y = sms.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 225)

print('x_train :', len(x_train))
print('x_test  :', len(x_test))
print('y_train :', len(y_train))
print('y_test  :', len(y_test))


x_train : 2603
x_test  : 290
y_train : 2603
y_test  : 290


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")





In [18]:
from sklearn.pipeline import Pipeline

model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

model.fit(x_train, y_train)


from sklearn.metrics import confusion_matrix

predictions = model.predict(x_test)

confusion_matrix(predictions, y_test)

array([[243,   6],
       [  0,  41]], dtype=int64)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

print("Accuracy : ", accuracy_score(predictions, y_test))
print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))
print("\n\nclassification Report : ", classification_report(predictions, y_test))

Accuracy :  0.9793103448275862
Precision :  0.9819515774027879
Recall :  0.9793103448275862


classification Report :                precision    recall  f1-score   support

         ham       1.00      0.98      0.99       249
        spam       0.87      1.00      0.93        41

    accuracy                           0.98       290
   macro avg       0.94      0.99      0.96       290
weighted avg       0.98      0.98      0.98       290



In [20]:
example = ["i have reersity towards a ma in linguistics ( but my bachelors is not in linguistics ) . i am looking for information regarding possible ( paid ) internships with companies working on projects which include a linguistic aspect . if anyone knows of any internship possibilities , or even knows which companies in israel are working on linguistics oriented projects , i would apretiate any information you can give me . thanks in advance , leah klearman klrmn @ zoot . tau . ac . il"]
result = model.predict(example)

print(result)

['ham']


In [24]:
example = ["welcome to free gifts! get your offers  are waiting for you grab it for free"]
result = model.predict(example)

print(result)

['spam']
