In [None]:
import kagglehub

# Fetch the latest version of the spam/ham e-mail dataset through kagglehub
# and keep the location it reports in `path`.
dataset_handle = "meruvulikith/190k-spam-ham-email-dataset-for-classification"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/meruvulikith/190k-spam-ham-email-dataset-for-classification/versions/1


# Natural Language Processing
Natural Language Processing (NLP) is the study of making computers understand how humans naturally speak, write and communicate.
NLTK (the Natural Language Toolkit) is a collection of Python libraries designed for processing natural-language text such as English, for example identifying and tagging the parts of speech found in it.

In [None]:
import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Download only the NLTK resources this notebook uses (the stop-word list and
# the WordNet data behind WordNetLemmatizer) instead of the whole 'all'
# collection, which is far larger and floods the cell output with log lines.
# 'omw-1.4' is included because some NLTK releases need it for WordNet lookups.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [None]:
# Build the CSV location from `path` (returned by kagglehub.dataset_download
# in the first cell) rather than hardcoding the /root/.cache/... directory,
# so the notebook also runs on machines with a different cache location or
# dataset version.
df = pd.read_csv(f"{path}/spam_Emails_data.csv")

In [None]:
df.head()

Unnamed: 0,label,text
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...


In [None]:
# Add a numeric target column named `spam`:
# 1 when the e-mail's label is 'Spam', 0 when it is 'Ham'.
label_to_flag = {'Spam': 1, 'Ham': 0}
df['spam'] = df['label'].map(label_to_flag).astype(int)


In [None]:
df.head()

Unnamed: 0,label,text,spam
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...,1
1,Ham,got ice thought look az original message ice o...,0
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...,1
3,Spam,start increasing your odds of success & live s...,1
4,Ham,author jra date escapenumber escapenumber esca...,0


In [None]:
df.shape

(193852, 3)

In [None]:
df['spam'].value_counts()

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,102160
1,91692


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193852 entries, 0 to 193851
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   193852 non-null  object
 1   text    193850 non-null  object
 2   spam    193852 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.4+ MB


In [None]:
df.isnull().sum()

Unnamed: 0,0
label,0
text,2
spam,0


# Tokenization
Tokenization is the process of splitting text into smaller units called tokens. Here each e-mail is split on whitespace, so every token is a single word.

In [None]:
df['text'][199]

'look fers vi codinn escapenumber escapenumber escapenumber pi lls hydro codonee escapenumber escapenumber escapenumber pi lls valliuum escapenumber escapenumber escapenumber pi lls viagraa escapenumber escapenumber escapenumber pi lls cia llis escapenumber escapenumber escapenumber pi lls codeinne escapenumber escapenumber escapenumber pi lls xa naax escapenumber escapenumber escapenumber pi lls orderrs delivered fedex full tracking escapenumber escapenumber satisfactiionnss guaaranteeed http cdptescapenumberdsk com escapelong escapenumber time mailing nescapenumber mescapenumberval requi red escapelong'

In [None]:
def tokenizer(text):
    """Split a string into whitespace-separated tokens.

    Args:
        text: The value to tokenize.

    Returns:
        A list of the whitespace-separated pieces of ``text`` when it is a
        ``str``; an empty list for any non-string value (e.g. NaN, float).
    """
    return text.split() if isinstance(text, str) else []


In [None]:
# Replace missing e-mail bodies (NaN) with empty strings, then turn every
# body into a list of tokens. Rows that were missing end up as empty lists.
text_filled = df['text'].fillna('')
df['text'] = text_filled.apply(tokenizer)

In [None]:
df['text'][1]

['got',
 'ice',
 'thought',
 'look',
 'az',
 'original',
 'message',
 'ice',
 'operations',
 'mailto',
 'iceoperations',
 'intcx',
 'com',
 'sent',
 'friday',
 'october',
 'escapenumber',
 'escapenumber',
 'escapenumber',
 'escapenumber',
 'pm',
 'subject',
 'escapelong',
 'amended',
 'participant',
 'agreement',
 'dear',
 'participant',
 'receiving',
 'email',
 'identified',
 'company',
 'user',
 'administrator',
 'legal',
 'counsel',
 'signatory',
 'escapelong',
 'participant',
 'agreement',
 'serves',
 'notice',
 'intercontinentals',
 'intent',
 'amend',
 'agreement',
 'effective',
 'december',
 'escapenumber',
 'escapenumber',
 'relevant',
 'documentation',
 'attached',
 'includes',
 'contact',
 'information',
 'questions',
 'thank',
 'escapelong',
 'inc']

# Stemming
Stemming is the process of removing suffixes to reduce a word to its core form (its stem). For example, waits, waiting and waited are all converted to the stem wait.

There are different stemmers in the package such as snowball, porter, lancaster, etc. Snowball will be utilized.

In [None]:
# English Snowball stemmer. The variable keeps the name `porter` because
# stem_it() looks it up under that name.
porter = SnowballStemmer(language="english", ignore_stopwords=False)

In [None]:
def stem_it(text):
    """Stem every token in ``text``.

    Args:
        text: An iterable of word strings (one tokenized e-mail).

    Returns:
        A new list holding ``porter.stem(word)`` for each word, in the
        original order.
    """
    stems = []
    for word in text:
        stems.append(porter.stem(word))
    return stems

In [None]:
# Stem each e-mail's token list element-wise.
df['text'] = df['text'].map(stem_it)

In [None]:
df['text'][1]

['got',
 'ice',
 'thought',
 'look',
 'az',
 'origin',
 'messag',
 'ice',
 'oper',
 'mailto',
 'iceoper',
 'intcx',
 'com',
 'sent',
 'friday',
 'octob',
 'escapenumb',
 'escapenumb',
 'escapenumb',
 'escapenumb',
 'pm',
 'subject',
 'escapelong',
 'amend',
 'particip',
 'agreement',
 'dear',
 'particip',
 'receiv',
 'email',
 'identifi',
 'compani',
 'user',
 'administr',
 'legal',
 'counsel',
 'signatori',
 'escapelong',
 'particip',
 'agreement',
 'serv',
 'notic',
 'intercontinent',
 'intent',
 'amend',
 'agreement',
 'effect',
 'decemb',
 'escapenumb',
 'escapenumb',
 'relev',
 'document',
 'attach',
 'includ',
 'contact',
 'inform',
 'question',
 'thank',
 'escapelong',
 'inc']

# Lemmatization
Lemmatization is the process of finding the lemma of a word depending on its meaning. It aims to remove inflectional endings and return the base or dictionary form of the word, which is known as the lemma. For example, is, am, was and are are all converted to the lemma be.

The difference between stemming and lemmatization is that stemming can often create non-existent words, whereas lemmas are actual words.

In [None]:
df['text'][153]

['everi',
 'cook',
 'prais',
 'broth',
 'alway',
 'offer',
 'super',
 'thing',
 'gentlemen',
 'high',
 'price',
 'deliv',
 'need',
 'quick',
 'http',
 'damsel',
 'bz',
 'oziril',
 'com',
 'gb',
 'botani',
 'name',
 'expect',
 'order',
 'delet',
 'http',
 'armonk',
 'org',
 'oziril',
 'com',
 'gb',
 'rm',
 'php',
 'arboretum',
 'best',
 'regard',
 'patrick']

In [None]:
# WordNet-based lemmatizer instance; lemmit_it() looks it up under this name.
lemmitizer = WordNetLemmatizer()

In [None]:
def lemmit_it(text):
    """Lemmatize every token in ``text`` with the part-of-speech tag ``'a'``.

    Args:
        text: An iterable of word strings (one tokenized e-mail).

    Returns:
        A new list with ``lemmitizer.lemmatize(word, pos='a')`` for each
        word, in the original order.
    """
    # NOTE(review): the sample shown before and after this step (row 153) is
    # identical, presumably because the tokens were already stemmed and only
    # the 'a' (adjective, per WordNet) tag is tried. Confirm this is intended.
    lemmas = []
    for word in text:
        lemmas.append(lemmitizer.lemmatize(word, pos='a'))
    return lemmas

In [None]:
# Lemmatize each e-mail's token list element-wise.
df['text'] = df['text'].map(lemmit_it)

In [None]:
df['text'][153]

['everi',
 'cook',
 'prais',
 'broth',
 'alway',
 'offer',
 'super',
 'thing',
 'gentlemen',
 'high',
 'price',
 'deliv',
 'need',
 'quick',
 'http',
 'damsel',
 'bz',
 'oziril',
 'com',
 'gb',
 'botani',
 'name',
 'expect',
 'order',
 'delet',
 'http',
 'armonk',
 'org',
 'oziril',
 'com',
 'gb',
 'rm',
 'php',
 'arboretum',
 'best',
 'regard',
 'patrick']

# Stop Word Removal
Stop word removal drops very common words such as is, an, the, etc., which carry little meaning on their own. Search engines, for example, are typically programmed to ignore such words.

In [None]:
# English stop words, held in a set so the `word in stop_words` test that
# stop_it() runs for every token of every e-mail is a constant-time hash
# lookup rather than a linear scan of a list. Membership results are the same.
stop_words = set(stopwords.words('english'))

In [None]:
def stop_it(text):
    """Drop stop words from a token sequence.

    Args:
        text: An iterable of word strings (one tokenized e-mail).

    Returns:
        A new list with the words of ``text`` that are not contained in the
        module-level ``stop_words`` collection, in their original order.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [None]:
# Remove stop words from each e-mail's token list element-wise.
df['text'] = df['text'].map(stop_it)

In [None]:
df.head()

Unnamed: 0,label,text,spam
0,Spam,"[viiiiiiagraaaa, onli, one, want, make, scream...",1
1,Ham,"[got, ice, thought, look, az, origin, messag, ...",0
2,Spam,"[yo, ur, wom, ne, ed, escapenumb, ch, n, b, e,...",1
3,Spam,"[start, increas, odd, success, &, live, sexual...",1
4,Ham,"[author, jra, date, escapenumb, escapenumb, es...",0


In [None]:
# Glue each token list back into one space-separated string, which is the
# input form handed to the vectorizer in the next section.
df['text'] = df['text'].map(' '.join)

# Vectorization
Vectorization is the method of converting textual data into a numeric format. Since computers are unable to work with raw text directly, we need to convert the text into numbers.
TfidfVectorizer will be used for vectorization; TF-IDF stands for Term Frequency-Inverse Document Frequency.

In [None]:
# Learn the TF-IDF vocabulary/weights from the cleaned text and build the
# feature matrix `x`; `y` holds the 0/1 spam flags as an array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so the test rows influence the learned vocabulary and IDF
# weights. Confirm this is acceptable for the reported accuracy.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df['text'])
y = df['spam'].values

In [None]:
import pickle

# Persist the fitted vectorizer to disk so it can be reused later.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    vectorizer_file.write(pickle.dumps(tfidf))


In [None]:
# Hold out 20% of the rows for testing, without shuffling.
# NOTE(review): per the scikit-learn docs `random_state` only controls the
# shuffling, so it presumably has no effect while shuffle=False. Confirm
# whether a shuffled (and possibly stratified) split was intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=0,
)

In [None]:
df.head()

Unnamed: 0,label,text,spam
0,Spam,viiiiiiagraaaa onli one want make scream . pro...,1
1,Ham,got ice thought look az origin messag ice oper...,0
2,Spam,yo ur wom ne ed escapenumb ch n b e th n f h e...,1
3,Spam,start increas odd success & live sexual health...,1
4,Ham,author jra date escapenumb escapenumb escapenu...,0


# Logistic Regression
Logistic Regression is a statistical model used for binary classification tasks. It predicts the probability that an input belongs to a certain class (usually 0 or 1).

In [None]:
# Train a logistic-regression classifier on the training split, then
# predict the 0/1 spam flag for every row of the test split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# Test-set accuracy as a percentage. accuracy_score is documented as
# accuracy_score(y_true, y_pred); the arguments are now passed in that order.
# The value is unchanged because accuracy only counts matching positions,
# but the call no longer misleads anyone who reuses it with another metric.
acc_log = accuracy_score(y_test, y_pred) * 100
print("Accuracy", acc_log)

Accuracy 97.8772794098682


In [None]:
#input_mail = input("Enter the mail text: ")
input_mail = 'Subject: increase mdq for contract 5910 sharon can you please increase hplcs mdq for contract 5910 from 50 , 000 mmbtu'
input_mail = [input_mail]

# Put the raw message through the same tokenize -> stem -> lemmatize ->
# stop-word -> join steps that were applied to the training text before the
# TF-IDF vectorizer was fitted, so its features match what the model saw.
cleaned_mail = [
    ' '.join(stop_it(lemmit_it(stem_it(tokenizer(mail)))))
    for mail in input_mail
]
transformed_data = tfidf.transform(cleaned_mail)

# Predict with `lr`, the logistic-regression model trained above. The earlier
# code called `svc.predict`, but no `svc` is defined in the cells shown, so it
# raised NameError on a fresh kernel.
prediction = lr.predict(transformed_data)

# `prediction` is an array with one entry per input mail; inspect the first.
if prediction[0] == 1:
    print("\nSpam mail")
else:
    print("\nHam mail")


Ham mail


In [None]:
import pickle

# Persist the trained logistic-regression model to disk.
with open('logistic_regression_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(lr))
