In [None]:
pip install demoji



In [None]:
import demoji
import nltk
import math
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources this notebook asks for: the stop-word corpus and
# the Punkt tokenizer models.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# NOTE(review): the recorded cell output echoes this call, which looks like a
# warning being raised for it - confirm whether the installed demoji version
# still needs the explicit download.
demoji.download_codes()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  demoji.download_codes()


In [None]:
# Libraries
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
import spacy

In [None]:
# Step - 1: read the e-mail dataset from 'emails.csv' (path relative to the
# notebook's working directory) and preview its first rows.
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Summary statistics of the loaded frame (the recorded output covers `spam`).
df.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [None]:
# Q - 2
# Remove repeated e-mails (keeping the first occurrence) and rows whose text
# is missing. The result is rebound to `df` instead of being edited with
# inplace=True, and the index is renumbered so that labels and positions agree
# again once rows have been dropped (later cells look rows up with
# df[column][i] and build positionally indexed frames from this data).
df = (
    df
    .drop_duplicates(subset=['text'], keep='first')
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

def clean_text(text):
    """Keep only ASCII letters in `text` and normalise its whitespace.

    Every run of characters other than A-Z, a-z or a space is replaced by a
    single space; the result is then split on whitespace and re-joined with
    single spaces, which also strips leading and trailing blanks.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if no letters remain).
    """
    letters_and_spaces = re.sub(r'[^A-Za-z ]+', ' ', text)
    words = letters_and_spaces.split()
    return ' '.join(words)

# Store the letters-only version of every e-mail next to the original text.
df['cleaned_text'] = df['text'].apply(clean_text)

# Preview both columns; the empty string produces the blank separator line.
print(
    "Text",
    df['text'].head(),
    "",
    "Cleaned Text",
    df['cleaned_text'].head(),
    sep="\n",
)

Text
0    Subject naturally irresistible your corporate ...
1    Subject the stock trading gunslinger fanny is ...
2    Subject unbelievable new homes made easy im wa...
3    Subject 4 color printing special request addit...
4    Subject do not have money get software cds fro...
Name: text, dtype: object

Cleaned Text
0    Subject naturally irresistible your corporate ...
1    Subject the stock trading gunslinger fanny is ...
2    Subject unbelievable new homes made easy im wa...
3    Subject color printing special request additio...
4    Subject do not have money get software cds fro...
Name: cleaned_text, dtype: object


In [None]:
# Q - 3
def remove_emojis(text):
    """Return `text` with every emoji found by demoji replaced by ''."""
    without_emojis = demoji.replace(text, '')
    return without_emojis

# Strip emojis from every e-mail, overwriting the working 'text' column.
df['text'] = df['text'].apply(remove_emojis)

# Preview the first five rows by position. df['text'][i] is a label lookup,
# and after duplicates/missing rows were dropped the labels 0..4 are not
# guaranteed to exist, so it could raise KeyError.
for email in df['text'].head(5):
    print("Email with Emojis Removed:", email)

Email with Emojis Removed: Subject naturally irresistible your corporate identity lt is really hard to recollect a company the market is full of suqgestions and the information isoverwhelminq but a good catchy logo stylish statlonery and outstanding website will make the task much easier we do not promise that havinq ordered a iogo your company will automaticaily become a world ieader it isguite ciear that without good products effective business organization and practicable aim it will be hotat nowadays market but we do promise that your marketing efforts will become much more effective here is the list of clear benefits creativeness hand made original logos specially done to reflect your distinctive company image convenience logo and stationery are provided in all formats easy to use content management system letsyou change your website content and even its structure promptness you will see logo drafts within three business days affordability your marketing break through shouldn t ma

In [None]:
# Q - 4
# One shared Porter stemmer instance, reused for every e-mail.
stemmer = PorterStemmer()


def perform_stemming(text):
    """Tokenise `text` with NLTK and return its stemmed tokens joined by spaces."""
    return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))


# Stemmed copy of the working text, kept in its own column.
df['stemmed_text'] = df['text'].apply(perform_stemming)

# Small English spaCy pipeline used for lemmatization.
nlp = spacy.load("en_core_web_sm")


def _join_lemmas(doc):
    """Return the lemmas of all tokens in a spaCy `doc`, joined by spaces."""
    return ' '.join(token.lemma_ for token in doc)


def perform_lemmatization_spacy(text):
    """Run `text` through the spaCy pipeline and return its space-joined lemmas."""
    return _join_lemmas(nlp(text))


# Lemmatize the whole column through nlp.pipe, which processes the texts in
# batches, instead of invoking the full pipeline once per row via .apply.
# The resulting list is assigned positionally, one entry per row of df.
df['lemmatized_text_spacy'] = [_join_lemmas(doc) for doc in nlp.pipe(df['text'])]

# WordNet data is required by NLTK's WordNet lemmatizer.
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()


def perform_lemmatization_nltk(text):
    """Tokenise `text` with NLTK and return its WordNet lemmas joined by spaces.

    No part-of-speech tag is passed to the lemmatizer, so its default is used
    for every token.
    """
    return ' '.join(map(wnl.lemmatize, nltk.word_tokenize(text)))


# WordNet-lemmatized copy of the working text, kept in its own column.
df['lemmatized_text_nltk'] = df['text'].apply(perform_lemmatization_nltk)

# Show the three normalizations side by side for the first five rows.
# Rows are taken by position (head) rather than with df[column][i]: that form
# is a label lookup and raises KeyError when labels 0..4 are not all present,
# which can happen after rows have been dropped.
preview = df[
    ['text', 'stemmed_text', 'lemmatized_text_spacy', 'lemmatized_text_nltk']
].head(5)

for original, stemmed, lemma_spacy, lemma_nltk in preview.itertuples(index=False, name=None):
    print("Original Text:", original)
    print("Stemmed Text (NLTK):", stemmed)
    print("Lemmatized Text (spaCy):", lemma_spacy)
    print("Lemmatized Text (NLTK):", lemma_nltk)
    print()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text: Subject naturally irresistible your corporate identity lt is really hard to recollect a company the market is full of suqgestions and the information isoverwhelminq but a good catchy logo stylish statlonery and outstanding website will make the task much easier we do not promise that havinq ordered a iogo your company will automaticaily become a world ieader it isguite ciear that without good products effective business organization and practicable aim it will be hotat nowadays market but we do promise that your marketing efforts will become much more effective here is the list of clear benefits creativeness hand made original logos specially done to reflect your distinctive company image convenience logo and stationery are provided in all formats easy to use content management system letsyou change your website content and even its structure promptness you will see logo drafts within three business days affordability your marketing break through shouldn t make gaps in y

In [None]:
# Q - 5 (Manually)
# Hand-maintained English stop-word list (all lower-case). Built once at
# definition time; the function is applied to every e-mail, so rebuilding the
# set on each call was pure repeated work.
_MANUAL_STOP_WORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up",
    "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
    "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
    "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven",
    "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
])


def remove_stop_words_manual(text):
    """Remove stop words from `text` using the hand-written list above.

    The text is split on whitespace; a word is dropped when its lower-cased
    form is in the stop-word list. Surviving words keep their original casing
    and order and are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty if every word was a stop word).
    """
    kept_words = [
        word for word in text.split()
        if word.lower() not in _MANUAL_STOP_WORDS
    ]
    return ' '.join(kept_words)

# Remove the hand-listed stop words from every e-mail, overwriting 'text'.
df['text'] = df['text'].apply(remove_stop_words_manual)

# Preview the first five rows by position; df['text'][i] is a label lookup and
# can raise KeyError when labels 0..4 are not all present after row drops.
for email in df['text'].head(5):
    print("Email without Stop Words :", email)

Email without Stop Words : Subject naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easier promise havinq ordered iogo company automaticaily become world ieader isguite ciear without good products effective business organization practicable aim hotat nowadays market promise marketing efforts become much effective list clear benefits creativeness hand made original logos specially done reflect distinctive company image convenience logo stationery provided formats easy use content management system letsyou change website content even structure promptness see logo drafts within three business days affordability marketing break make gaps budget 100 satisfaction guaranteed provide unlimited amount changes extra fees surethat love result collaboration look portfolio interested
Email without Stop Words : Subject stock trading gunslinger fanny merr

In [None]:
# Q - 5 (Using NLTK)
# NLTK's English stop-word list, loaded once at definition time. The function
# is applied to every e-mail, and the original rebuilt this set from the corpus
# on each call.
_NLTK_STOP_WORDS = frozenset(stopwords.words("english"))


def remove_stop_words_nltk(text):
    """Remove NLTK's English stop words from `text`.

    The text is split on whitespace; a word is dropped when its lower-cased
    form is in the NLTK list. Surviving words keep their original casing and
    order and are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty if every word was a stop word).
    """
    kept_words = [
        word for word in text.split()
        if word.lower() not in _NLTK_STOP_WORDS
    ]
    return ' '.join(kept_words)

# Remove NLTK's stop words from every e-mail, overwriting 'text'.
# NOTE(review): 'text' was already filtered by the manual stop-word cell when
# the cells run in order, so this pass works on pre-filtered input - confirm
# that is intended.
df['text'] = df['text'].apply(remove_stop_words_nltk)

# Preview the first five rows by position; df['text'][i] is a label lookup and
# can raise KeyError when labels 0..4 are not all present after row drops.
for email in df['text'].head(5):
    print("Email without Stop Words :", email)

Email without Stop Words : Subject naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easier promise havinq ordered iogo company automaticaily become world ieader isguite ciear without good products effective business organization practicable aim hotat nowadays market promise marketing efforts become much effective list clear benefits creativeness hand made original logos specially done reflect distinctive company image convenience logo stationery provided formats easy use content management system letsyou change website content even structure promptness see logo drafts within three business days affordability marketing break make gaps budget 100 satisfaction guaranteed provide unlimited amount changes extra fees surethat love result collaboration look portfolio interested
Email without Stop Words : Subject stock trading gunslinger fanny merr

In [None]:
# Q - 6
def tokenize_text(text):
    """Split `text` into a list of tokens with NLTK's word_tokenize."""
    return word_tokenize(text)


# Keep the token list of every e-mail in its own column, then preview the frame.
df['Tokenized_Text'] = df['text'].apply(tokenize_text)
df.head()

Unnamed: 0,text,spam,cleaned_text,stemmed_text,lemmatized_text_spacy,lemmatized_text_nltk,Tokenized_Text
0,Subject naturally irresistible corporate ident...,1,Subject naturally irresistible your corporate ...,subject natur irresist your corpor ident lt is...,subject naturally irresistible your corporate ...,Subject naturally irresistible your corporate ...,"[Subject, naturally, irresistible, corporate, ..."
1,Subject stock trading gunslinger fanny merrill...,1,Subject the stock trading gunslinger fanny is ...,subject the stock trade gunsling fanni is merr...,subject the stock trading gunslinger fanny be ...,Subject the stock trading gunslinger fanny is ...,"[Subject, stock, trading, gunslinger, fanny, m..."
2,Subject unbelievable new homes made easy im wa...,1,Subject unbelievable new homes made easy im wa...,subject unbeliev new home made easi im want to...,subject unbelievable new home make easy I m wa...,Subject unbelievable new home made easy im wan...,"[Subject, unbelievable, new, homes, made, easy..."
3,Subject 4 color printing special request addit...,1,Subject color printing special request additio...,subject 4 color print special request addit in...,subject 4 color printing special request addit...,Subject 4 color printing special request addit...,"[Subject, 4, color, printing, special, request..."
4,Subject money get software cds software compat...,1,Subject do not have money get software cds fro...,subject do not have money get softwar cd from ...,subject do not have money get software cd from...,Subject do not have money get software cd from...,"[Subject, money, get, software, cds, software,..."


In [None]:
# Q - 7 (Built-In Function)

# Input documents: the working text column, one e-mail per row.
email_texts = df['text']

# scikit-learn TF-IDF vectorizer with its default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the e-mails and transform them in one step.
tfidf_matrix = vectorizer.fit_transform(email_texts)

# NOTE(review): toarray() materialises the full documents x vocabulary matrix
# as a dense array; with a large vocabulary this can be memory-heavy - consider
# keeping the result sparse if memory becomes a problem.
tfidf_array = tfidf_matrix.toarray()

# One row per e-mail, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_array, columns=vectorizer.get_feature_names_out())

print(tfidf_df)

            00  000  0000  000000  00000000  0000000000  000000000003619  \
0     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
1     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
2     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
3     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
...        ...  ...   ...     ...       ...         ...              ...   
5500  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
5501  0.032160  0.0   0.0     0.0       0.0         0.0              0.0   
5502  0.118471  0.0   0.0     0.0       0.0         0.0              0.0   
5503  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
5504  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   

      000000000003991  000000000003997  000000000005168  ...  zwzm  zxghlajf  \
0      

In [None]:
# Q - 7 (Manually)
# Re-select the working text column as input for the hand-written TF-IDF.
email_texts = df['text']

def calculate_tfidf(email_texts):
    """Compute raw-count TF-IDF weights for a collection of documents.

    Each document is split on whitespace. The term frequency of a word is the
    number of times it occurs in the document, and its inverse document
    frequency is ``log(N / (1 + df))``, where ``N`` is the number of documents
    and ``df`` the number of documents containing the word. With this
    smoothing, a word that occurs in every document gets a slightly negative
    weight.

    Args:
        email_texts: Iterable of strings, one per document.

    Returns:
        A list with one dict per document, in input order. Each dict maps the
        words of that document (in order of first occurrence) to
        ``count * idf``; words absent from a document are not in its dict.
    """
    tokenized_texts = [text.split() for text in email_texts]
    num_documents = len(tokenized_texts)

    # Single pass over the corpus: per-document raw counts, plus the number of
    # documents each word appears in. Counting document frequency here avoids
    # re-scanning every document once per vocabulary word.
    term_counts = []
    document_frequency = {}
    for tokens in tokenized_texts:
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        term_counts.append(counts)
        # The keys of `counts` are the distinct words of this document.
        for word in counts:
            document_frequency[word] = document_frequency.get(word, 0) + 1

    idf = {
        word: math.log(num_documents / (1 + doc_count))
        for word, doc_count in document_frequency.items()
    }

    return [
        {word: count * idf[word] for word, count in counts.items()}
        for counts in term_counts
    ]

# One dict of word -> weight per e-mail.
tfidf_vectors = calculate_tfidf(email_texts)

# Each dict only holds the words of its own e-mail, so building the frame
# leaves NaN wherever a word does not occur in a document. An absent term has
# weight 0, so fill those gaps to obtain a usable numeric matrix.
tfidf_df = pd.DataFrame(tfidf_vectors).fillna(0.0)

print(tfidf_df)

       Subject  naturally  irresistible  corporate  identity        lt  \
0    -0.000182   5.840823      7.003974   3.166675  4.182595  5.281208   
1    -0.000182        NaN           NaN        NaN       NaN       NaN   
2    -0.000182        NaN           NaN        NaN       NaN       NaN   
3    -0.000182        NaN           NaN        NaN       NaN       NaN   
4    -0.000182        NaN           NaN        NaN       NaN       NaN   
...        ...        ...           ...        ...       ...       ...   
5500 -0.000182        NaN           NaN        NaN       NaN       NaN   
5501 -0.000182        NaN           NaN        NaN       NaN       NaN   
5502 -0.000182        NaN           NaN        NaN       NaN       NaN   
5503 -0.000182        NaN           NaN        NaN       NaN       NaN   
5504 -0.000182        NaN           NaN        NaN       NaN       NaN   

        really      hard  recollect   company  ...  etringer  jonalan  laird  \
0     2.826515  3.393056   5.35

In [None]:
# Q - 8
# Bag-of-words: fit a CountVectorizer (default settings) on the working text
# column and look at the first e-mail's term counts.
email_texts = df['text']
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(email_texts)

# Learned vocabulary, and the dense count vector of the first row only.
vocabulary = vectorizer.get_feature_names_out()
specific_email_bow = bow_matrix[0].toarray()

print("BoW representation for the specific email:", specific_email_bow, sep="\n")
print("Vocabulary (unique terms):", vocabulary, sep="\n")

BoW representation for the specific email:
[[0 0 0 ... 0 0 0]]
Vocabulary (unique terms):
['00' '000' '0000' ... 'zzn' 'zzncacst' 'zzzz']
