In [33]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
import numpy as np

In [34]:
# Step 1: Read csv file
# Load the raw dataset from 'spam.csv' (path is relative to the working
# directory). The encoding is passed explicitly as ISO-8859-1 — presumably
# because the file is not valid UTF-8; TODO confirm.
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [35]:
# Display the freshly loaded frame to inspect its columns before preprocessing.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [36]:
# Step 2: Remove punctuations
def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed outright — nothing is inserted in their place — so
    words that were separated only by punctuation end up joined together
    (e.g. ``'So...any'`` becomes ``'Soany'``).

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    # Translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

# Strip punctuation from every raw message in column 'v2' and store the result
# in a new column, leaving the original text untouched.
data['msg_nopunct'] = data['v2'].apply(remove_punctuations)
# Spot-check the cleaned message stored under index label 1.
data['msg_nopunct'][1]

'Ok lar Joking wif u oni'

In [37]:
# Step 3: Remove stop words
import nltk
# Fetch the NLTK resources used in this step: the stop-word lists and the
# 'punkt' models (presumably required by nltk.word_tokenize — TODO confirm).
nltk.download('stopwords')
nltk.download('punkt')
# Keep the English stop words in a set so the membership test performed for
# every token in remove_stopwords is a constant-time lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop stop words from ``text`` and return the remaining tokens as one string.

    The text is tokenized with ``nltk.word_tokenize``; a token is discarded
    when its case-folded form is in the module-level ``stop_words`` set. The
    surviving tokens keep their original casing and are joined with single
    spaces.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by single spaces ('' if nothing is kept).
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        # The comparison is case-insensitive, but the token itself is kept as written.
        if token.casefold() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Filter stop words out of the punctuation-free text. Casing is still the
# original at this point; lower-casing is done in the next step.
data['msg_nostopwords'] = data['msg_nopunct'].apply(remove_stopwords)
# Spot-check the message stored under index label 1.
data['msg_nostopwords'][1] 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'Ok lar Joking wif u oni'

In [38]:
# Step 4: Lower case
# Normalise casing so that later steps treat e.g. 'Ok' and 'ok' as the same word.
data['msg_lower'] = data['msg_nostopwords'].apply(lambda x: x.lower())
# Spot-check the message stored under index label 1.
data['msg_lower'][1]

'ok lar joking wif u oni'

In [39]:
# Step 5: Tokenization
def tokenize(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    ``re.split`` yields an empty string when the text is empty or starts/ends
    with a non-word character; those empty strings are not words and would
    otherwise be counted as tokens downstream (document length, document
    frequency), so they are dropped here.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens, in order of appearance ([] for text that
        contains no word characters).
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    return [token for token in re.split(r'\W+', text) if token]

# Turn each lower-cased message into a list of tokens; the columns built in the
# stemming and lemmatization steps are both derived from this one.
data['msg_tokenized'] = data['msg_lower'].apply(tokenize)
# Spot-check the token list stored under index label 1.
data['msg_tokenized'][1] 

['ok', 'lar', 'joking', 'wif', 'u', 'oni']

In [40]:
# Step 6: Stemming
# One shared Porter stemmer instance, reused by stem_words for every message.
stemmer = PorterStemmer()
def stem_words(words):
    """Stem every word in ``words`` with the module-level ``stemmer``.

    Args:
        words: An iterable of word strings.

    Returns:
        A new list holding the stem of each word, in the same order.
    """
    return list(map(stemmer.stem, words))

# Stem the token list of every message and keep the result in its own column.
data['msg_stemmed'] = data['msg_tokenized'].apply(stem_words)
# Spot-check the stems stored under index label 1.
data['msg_stemmed'][1]

['ok', 'lar', 'joke', 'wif', 'u', 'oni']

In [41]:
# Step 7: Lemmatization
import nltk
# Download the 'wordnet' corpus before the lemmatizer is used (presumably the
# data WordNetLemmatizer looks words up in — TODO confirm).
nltk.download('wordnet')
# One shared lemmatizer instance, reused by lemmatize_words for every message.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(words):
    """Lemmatize every word in ``words`` with the module-level ``lemmatizer``.

    No part-of-speech argument is passed, so each word is lemmatized with the
    lemmatizer's default setting.

    Args:
        words: An iterable of word strings.

    Returns:
        A new list holding the lemma of each word, in the same order.
    """
    lemmas = []
    for word in words:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Lemmatize the plain tokens (msg_tokenized), not the stemmed ones: stemming and
# lemmatization are two independent branches of the pipeline.
data['msg_lemmatized'] = data['msg_tokenized'].apply(lemmatize_words)
# Spot-check the lemmas stored under index label 1.
data['msg_lemmatized'][1] 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['ok', 'lar', 'joking', 'wif', 'u', 'oni']

In [42]:
# Step 8: POS tagging
import nltk
# Download the tagger model (presumably what TextBlob relies on to compute
# `.tags` — TODO confirm).
nltk.download('averaged_perceptron_tagger')
def pos_tagging(text):
    """Return the part-of-speech tags TextBlob assigns to ``text``.

    Args:
        text: The string to tag.

    Returns:
        The ``tags`` attribute of ``TextBlob(text)``.
    """
    return TextBlob(text).tags

# Tag the lower-cased message string (not the token list): pos_tagging hands the
# whole text to TextBlob.
data['msg_pos'] = data['msg_lower'].apply(pos_tagging)
# Spot-check the (word, tag) pairs stored under index label 1.
data['msg_pos'][1] 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('ok', 'JJ'),
 ('lar', 'JJ'),
 ('joking', 'NN'),
 ('wif', 'NN'),
 ('u', 'JJ'),
 ('oni', 'NN')]

In [47]:
# Step 9: Calculating TF-IDF
# N is the corpus size (number of rows in `data`). It is captured once here, so
# it has to be recomputed if rows are added or dropped afterwards.
N = len(data)
def doc_freq(word):
    """Count the documents whose lemmatized token list contains ``word``.

    Every entry of ``data['msg_lemmatized']`` is checked on each call, so the
    cost grows with the size of the corpus.

    Args:
        word: The token to look for.

    Returns:
        The number of documents containing ``word`` at least once.
    """
    matching_docs = 0
    for tokens in data['msg_lemmatized']:
        if word in tokens:
            matching_docs += 1
    return matching_docs

def calc_tf_idf(doc, token):
    """Compute the TF-IDF weight of ``token`` in one document of the corpus.

    tf  = occurrences of ``token`` in the document / number of tokens in it
    idf = log(N / (df + 1)), where df is the number of documents containing
          ``token`` and the +1 keeps the denominator non-zero.

    Args:
        doc: Index label of the document in ``data['msg_lemmatized']``.
        token: The token to score.

    Returns:
        ``tf * idf`` as a float. 0.0 is returned when the document is empty or
        does not contain ``token`` (term frequency is zero), instead of raising
        ``ZeroDivisionError`` / ``KeyError``.
    """
    doc_tokens = data['msg_lemmatized'][doc]
    words_count = len(doc_tokens)
    # list.count gives 0 for an absent token, where indexing a frequency table
    # would raise KeyError; it also avoids building a full table for one lookup.
    occurrences = doc_tokens.count(token)
    if words_count == 0 or occurrences == 0:
        # tf is zero, so the product is zero; skip the corpus-wide doc_freq scan.
        return 0.0
    tf = occurrences / words_count
    df = doc_freq(token)
    idf = np.log(N / (df + 1))
    return tf * idf

In [48]:
# Display the frame again to review the columns added by the preprocessing steps.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,msg_nopunct,msg_nostopwords,msg_lower,msg_tokenized,msg_stemmed,msg_lemmatized,msg_pos
0,ham,"Go until jurong point, crazy.. Available only ...",,,,Go until jurong point crazy Available only in ...,Go jurong point crazy Available bugis n great ...,go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n...","[(go, VB), (jurong, JJ), (point, NN), (crazy, ..."
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar Joking wif u oni,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[(ok, JJ), (lar, JJ), (joking, NN), (wif, NN),..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[(free, JJ), (entry, NN), (2, CD), (wkly, JJ),..."
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor U c already then say,U dun say early hor U c already say,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]","[(u, JJ), (dun, NNS), (say, VBP), (early, JJ),..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,Nah I dont think he goes to usf he lives aroun...,Nah dont think goes usf lives around though,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]","[(nah, JJ), (dont, NN), (think, NN), (goes, VB..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u U å£750 Pound prize...,2nd time tried 2 contact u u å£750 pound prize...,"[2nd, time, tried, 2, contact, u, u, å, 750, p...","[2nd, time, tri, 2, contact, u, u, å, 750, pou...","[2nd, time, tried, 2, contact, u, u, å, 750, p...","[(2nd, CD), (time, NN), (tried, VBD), (2, CD),..."
5568,ham,Will Ì_ b going to esplanade fr home?,,,,Will Ì b going to esplanade fr home,Ì b going esplanade fr home,ì b going esplanade fr home,"[ì, b, going, esplanade, fr, home]","[ì, b, go, esplanad, fr, home]","[ì, b, going, esplanade, fr, home]","[(ì, NN), (b, NN), (going, VBG), (esplanade, J..."
5569,ham,"Pity, * was in mood for that. So...any other s...",,,,Pity was in mood for that Soany other suggest...,Pity mood Soany suggestions,pity mood soany suggestions,"[pity, mood, soany, suggestions]","[piti, mood, soani, suggest]","[pity, mood, soany, suggestion]","[(pity, NN), (mood, NN), (soany, JJ), (suggest..."
5570,ham,The guy did some bitching but I acted like i'd...,,,,The guy did some bitching but I acted like id ...,guy bitching acted like id interested buying s...,guy bitching acted like id interested buying s...,"[guy, bitching, acted, like, id, interested, b...","[guy, bitch, act, like, id, interest, buy, som...","[guy, bitching, acted, like, id, interested, b...","[(guy, NN), (bitching, VBG), (acted, VBN), (li..."


In [45]:
# Show the lemmatized tokens of the message stored under index label 0.
data['msg_lemmatized'][0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [49]:
# Example usage
# Print the TF-IDF weight of every token of the message under index label 0.
# Each calc_tf_idf call runs doc_freq, which scans the whole corpus, so this
# loop performs one full pass over the data per token.
for i in data['msg_lemmatized'][0]:
  tf_idf = calc_tf_idf(doc=0, token=i)
  print(i," ", tf_idf)

go   0.18559484525499023
jurong   0.49577263464623444
point   0.31869680064272093
crazy   0.3698411958623429
available   0.3620184994277175
bugis   0.40912923707624127
n   0.23987609950735314
great   0.2482218115463858
world   0.31688507958814266
la   0.40912923707624127
e   0.26842349966333534
buffet   0.47043106538947416
cine   0.40912923707624127
got   0.1968153614355116
amore   0.49577263464623444
wat   0.2558068651091525
