In [1]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was run with (one per line).
print(np.__version__, pd.__version__, sep="\n")


1.26.4
2.2.3


In [2]:
!pip install openpyxl



In [3]:
# Load the SMS dataset from the Excel workbook in the working directory.
# Per the preview below, `v1` holds the ham/spam label and `v2` the raw message text.
data = pd.read_excel('spam_data.xlsx')
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Regular Expression

In [4]:
import re
def keep_special_characters(v2):
    """Strip every character that is not on the allow-list.

    Letters (a-z, A-Z), digits, whitespace and the punctuation marks
    ``. , ! ?`` are kept; everything else (apostrophes, asterisks,
    underscores, non-ASCII characters, ...) is deleted without replacement.

    Args:
        v2: The message text to clean.

    Returns:
        The text with all disallowed characters removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?]')
    return disallowed.sub('', v2)


In [5]:
# Cast every message to str before cleaning, because the regex substitution
# in keep_special_characters operates on strings; store the result in a new column.
data['clean_v2'] = data['v2'].astype(str).apply(keep_special_characters)
data

Unnamed: 0,v1,v2,clean_v2
0,ham,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I dont think he goes to usf, he lives arou..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?,Will b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...","Pity, was in mood for that. So...any other su..."
5570,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...


# Normalization

In [6]:
# Lower-case the cleaned text so differently-cased spellings of a word match.
# This overwrites `clean_v2` in place; the original-case text remains in `v2`.
data['clean_v2'] = data['clean_v2'].str.lower()
data

Unnamed: 0,v1,v2,clean_v2
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i dont think he goes to usf, he lives arou..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...","pity, was in mood for that. so...any other su..."
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...


# Stemming

In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Notebook-level Porter stemmer instance.
stemmer = PorterStemmer()

In [8]:
# Preview the cleaned, lower-cased column that the stemming step reads from.
data['clean_v2']


0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i dont think he goes to usf, he lives arou...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will  b going to esplanade fr home?
5569    pity,  was in mood for that. so...any other su...
5570    the guy did some bitching but i acted like id ...
5571                           rofl. its true to its name
Name: clean_v2, Length: 5572, dtype: object

In [9]:
def stemming(v2):
    """Apply Porter stemming to each whitespace-separated token of ``v2``.

    Args:
        v2: Cleaned message text.

    Returns:
        The stemmed tokens joined back together with single spaces.

    Note:
        Tokens come from ``str.split()``, so punctuation stays attached to
        the neighbouring word (e.g. ``"pity,"`` is passed to the stemmer as
        one token, as visible in the output below).
    """
    # Reuse the notebook-level `stemmer` created in the cell above instead of
    # constructing a fresh PorterStemmer for every row of the DataFrame.
    return ' '.join(stemmer.stem(word) for word in v2.split())


In [10]:

# Stem the cleaned text row by row and keep the result in its own column,
# leaving `clean_v2` available for the lemmatization and tokenization steps.
data['stemmed_v2'] = data['clean_v2'].apply(stemming)
data

Unnamed: 0,v1,v2,clean_v2,stemmed_v2
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ...","go until jurong point, crazy.. avail onli in b..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...,ok lar... joke wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri in 2 a wkli comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...,u dun say so earli hor... u c alreadi then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i dont think he goes to usf, he lives arou...","nah i dont think he goe to usf, he live around..."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,thi is the 2nd time we have tri 2 contact u. u...
5568,ham,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home?,will b go to esplanad fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...","pity, was in mood for that. so...any other su...","pity, wa in mood for that. so...ani other sugg..."
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,the guy did some bitch but i act like id be in...


In [11]:
import spacy
# Report the installed spaCy release.
print(f"spaCy version: {spacy.__version__}")


spaCy version: 3.7.2


In [12]:
pip install en_core_web_sm

Note: you may need to restart the kernel to use updated packages.


In [13]:
import spacy

# Load the English pipeline into `nlp` and report whether it is available.
# If loading raises OSError, `nlp` is left unassigned and only a message is printed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Model 'en_core_web_sm' is not installed.")
else:
    print("Model 'en_core_web_sm' is installed and loaded successfully.")


Model 'en_core_web_sm' is installed and loaded successfully.


# Lemmatization

In [14]:
# Lemmatization using spaCy
def lemmatization_spacy(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The text is run through the notebook-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is joined with single spaces, so
    punctuation tokens end up space-separated from the words around them.

    Args:
        text: Cleaned message text.

    Returns:
        A space-joined string of token lemmas.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

# Lemmatize the cleaned text; the spaCy pipeline is invoked once per row via apply.
# NOTE(review): if this cell is slow, batching the texts with nlp.pipe is worth trying.
data['lemmatized_text_Spicy'] = data['clean_v2'].apply(lemmatization_spacy)
data


Unnamed: 0,v1,v2,clean_v2,stemmed_v2,lemmatized_text_Spicy
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ...","go until jurong point, crazy.. avail onli in b...","go until jurong point , crazy .. available onl..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...,ok lar... joke wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri in 2 a wkli comp to win fa cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...,u dun say so earli hor... u c alreadi then say...,u dun say so early hor ... u c already then sa...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i dont think he goes to usf, he lives arou...","nah i dont think he goe to usf, he live around...","nah I do not think he go to usf , he live arou..."
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,thi is the 2nd time we have tri 2 contact u. u...,this be the 2nd time we have try 2 contact u. ...
5568,ham,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home?,will b go to esplanad fr home?,will b go to esplanade fr home ?
5569,ham,"Pity, * was in mood for that. So...any other s...","pity, was in mood for that. so...any other su...","pity, wa in mood for that. so...ani other sugg...","pity , be in mood for that . so ... any othe..."
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,the guy did some bitch but i act like id be in...,the guy do some bitching but I act like i d be...


# Tokenization

In [15]:
 pip install --upgrade nltk


Note: you may need to restart the kernel to use updated packages.


In [17]:
import nltk
# Fetch the 'punkt_tab' tokenizer data used by NLTK's tokenizers in the cells below.
# The call returns True on success (see output).
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [18]:
import nltk
from nltk.tokenize import word_tokenize


# Smoke test: confirm word_tokenize works on a short sentence before
# applying it to the dataset. Punctuation comes back as its own token.
text = "This is a test sentence."
tokens = word_tokenize(text)
print(tokens)


['This', 'is', 'a', 'test', 'sentence', '.']


In [19]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Example text (a single sentence, so sentence tokenization yields one item)
text = "Natural Language Processing with Python is amazing!"

# Split the same text into word-level and sentence-level tokens
tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Show both results side by side for comparison
print("Word Tokens:", tokens)
print("Sentence Tokens:", sentences)


Word Tokens: ['Natural', 'Language', 'Processing', 'with', 'Python', 'is', 'amazing', '!']
Sentence Tokens: ['Natural Language Processing with Python is amazing!']


In [20]:
from nltk.tokenize import word_tokenize

def tokenize_text(clean_v2):
    """Split a cleaned message into word-level tokens with NLTK.

    Args:
        clean_v2: Cleaned message text.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    word_tokens = word_tokenize(clean_v2)
    return word_tokens

def segment_sentences(clean_v2):
    """Split a cleaned message into sentences with NLTK.

    Args:
        clean_v2: Cleaned message text.

    Returns:
        The list of sentence strings produced by ``sent_tokenize``.

    Note:
        ``sent_tokenize`` is imported by an earlier cell of this notebook,
        not by this cell's own import line.
    """
    # Pass the function's own argument; the previous body referenced the
    # undefined name `clean_text`, so every call raised NameError.
    return sent_tokenize(clean_v2)


# Tokenize the cleaned text row by row; each cell of `tokens` holds a list of strings.
data['tokens'] = data['clean_v2'].apply(tokenize_text)
data
    

Unnamed: 0,v1,v2,clean_v2,stemmed_v2,lemmatized_text_Spicy,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ...","go until jurong point, crazy.. avail onli in b...","go until jurong point , crazy .. available onl...","[go, until, jurong, point, ,, crazy, .., avail..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...,ok lar... joke wif u oni...,ok lar ... joke wif u oni ...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri in 2 a wkli comp to win fa cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...,u dun say so earli hor... u c alreadi then say...,u dun say so early hor ... u c already then sa...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i dont think he goes to usf, he lives arou...","nah i dont think he goe to usf, he live around...","nah I do not think he go to usf , he live arou...","[nah, i, dont, think, he, goes, to, usf, ,, he..."
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,thi is the 2nd time we have tri 2 contact u. u...,this be the 2nd time we have try 2 contact u. ...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home?,will b go to esplanad fr home?,will b go to esplanade fr home ?,"[will, b, going, to, esplanade, fr, home, ?]"
5569,ham,"Pity, * was in mood for that. So...any other s...","pity, was in mood for that. so...any other su...","pity, wa in mood for that. so...ani other sugg...","pity , be in mood for that . so ... any othe...","[pity, ,, was, in, mood, for, that, ., so, ......"
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,the guy did some bitch but i act like id be in...,the guy do some bitching but I act like i d be...,"[the, guy, did, some, bitching, but, i, acted,..."
