# Spam Email Detection using Linear Regression  

In [8]:
import numpy as np
import pandas as pd
import string 

# sklearnm --> classification, regression and clustering algorithms including support vector machines
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# NLTK --> for symbolic and statistical natural language processing for English
import nltk
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer

# execute once
#nltk.download('stopwords')

In [17]:
data = pd.read_csv('spam.csv', encoding='latin')
data = data.drop(columns={'Unnamed: 2','Unnamed: 3','Unnamed: 4'})
data = data.rename(columns={'v1':'Spam/Not_Spam', 'v2':'message'})
data.head()

Unnamed: 0,Spam/Not_Spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
data.groupby(by = 'Spam/Not_Spam').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
Spam/Not_Spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [27]:
# message data cleaning by removing punctuation
def text_preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    return " ".join(text)

In [28]:
# get text
message_data = data['message'].copy()
data_copy = message_data.apply(text_preprocess)

vectorizer = TfidfVectorizer("english")
message_mat = vectorizer.fit_transform(message_data)
message_mat

<5572x8672 sparse matrix of type '<class 'numpy.float64'>'
	with 73916 stored elements in Compressed Sparse Row format>

In [30]:
message_train, message_test, spam_nospam_train, spam_nospam_test = train_test_split(message_mat, 
                                                        data['Spam/Not_Spam'], test_size=0.3, random_state=20)

In [31]:
#model prediction
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(message_train, spam_nospam_train)
pred = Spam_model.predict(message_test)
accuracy_score(spam_nospam_test,pred)

0.9575358851674641

Let's try using stemming and normalizing length of the messages

In [32]:
def stemmer (text):
    text = text.split()
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [35]:
message_data = message_data.apply(stemmer)
vectorizer = TfidfVectorizer("english")
message_mat = vectorizer.fit_transform(message_data)

In [37]:

message_train, message_test, spam_nospam_train, spam_nospam_test = train_test_split(message_mat, 
                                                        data['Spam/Not_Spam'], test_size=0.3, random_state=20)

In [38]:
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(message_train, spam_nospam_train)
pred = Spam_model.predict(message_test)
accuracy_score(spam_nospam_test,pred)

0.9551435406698564

Learning Source:
    https://github.com/SharmaNatasha/Machine-Learning-using-Python/blob/master/Classification%20project/Spam_Detection.ipynb