##**Spam Detection in Email**

In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import SnowballStemmer

In [35]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
message_data = pd.read_csv("spam.csv", encoding="latin")

In [14]:
message_data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [15]:
message_data = message_data.drop(
    ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [29]:
message_data = message_data.rename(columns={'v1': 'Label', 'v2': 'email'})

In [39]:
message_data

Unnamed: 0,Label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [32]:
message_data.groupby('Label').describe()
message_data_copy = message_data['email'].copy()

In [33]:
def text_preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower()
            not in stopwords.words('english')]
    return " ".join(text)

In [36]:
message_data_copy = message_data_copy.apply(text_preprocess)


vectorizer = TfidfVectorizer()
message_mat = vectorizer.fit_transform(message_data_copy)

# Split data into training and testing
email_train, email_test, Label_train, Label_test = train_test_split(message_mat, message_data['Label'], test_size=0.3, random_state=20)


Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(email_train, Label_train)
pred = Spam_model.predict(email_test)

# Making predictions
print("accuracy_score:", accuracy_score(Label_test, pred))

accuracy_score: 0.9383971291866029


###Stemming
Stemming is the process of reducing a word to its word stem, which affixes to suffixes and prefixes or to the roots of words known as the lemma.

In [38]:
def stemmer(text):
    text = text.split()
    words = ""
    for i in text:
        stemmer = SnowballStemmer("english")
        words += (stemmer.stem(i))+" "
    return words


message_data_copy = message_data_copy.apply(stemmer)
vectorizer = TfidfVectorizer()
message_mat = vectorizer.fit_transform(message_data_copy)

# Splitting the data into training and testing sets
email_train, email_test, Label_train, Label_test = train_test_split(message_mat,message_data['Label'], test_size=0.3, random_state=20)

# Creating and training the logistic regression model
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(email_train, Label_train)

# Making predictions
pred = Spam_model.predict(email_test)
print("accuracy_score after stemming:", accuracy_score(Label_test, pred))


accuracy_score after stemming: 0.9461722488038278


###Normalizing length

In [52]:
# Adding the length feature
message_data['length'] = message_data['email'].apply(len)

# Converting to numpy array
length = message_data['length'].to_numpy()

# Combining message_mat with the new length feature
new_mat = np.hstack((np.asarray(message_mat.todense()), length[:, None]))

# Normalizing the combined feature matrix
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
new_mat_normalized = scaler.fit_transform(new_mat)

# Splitting the data into training and testing sets
email_train, email_test, Label_train, Label_test = train_test_split(new_mat_normalized, message_data['Label'], test_size=0.3, random_state=20)

# Creating and training the logistic regression model
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(email_train, Label_train)

# Making predictions
pred = Spam_model.predict(email_test)
print("accuracy_score after normalization:",
      accuracy_score(Label_test, pred))

accuracy_score after normalization: 0.9647129186602871
