In [1]:
# TF (Term Frequency) is a measure of how frequently a term appears in a document. It is calculated by dividing the number of occurrences of a term in a document by the total number of terms in that document.

# TF = (Number of occurrences of a term in a document) / (Total number of terms in the document)

# IDF (Inverse Document Frequency) is a measure of how important a term is in a collection of documents. It is calculated by dividing the total number of documents in the collection by the number of documents that contain the term, and then taking the logarithm of that ratio.

# IDF = log((Total number of documents) / (Number of documents containing the term))

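# TF-IDF combines the two by multiplication, so a term scores highly when it is frequent within a document but rare across the collection.

# TF-IDF = TF * IDF

# These formulas are commonly used in Natural Language Processing (NLP) to quantify the importance of terms in a document or a collection of documents. Note that scikit-learn's TfidfVectorizer, used below, applies a smoothed and normalised variant of these textbook formulas by default, so its values will not match a hand calculation exactly.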

In [5]:
import pandas as pd
import re
# Load only the label column (v1: ham/spam) and the message column (v2).
# latin1 is requested explicitly -- presumably the file is not valid UTF-8; confirm.
data = pd.read_csv("spam.csv", encoding="latin1", usecols=['v1', 'v2'])

# Normalise the message text with pandas' vectorised string methods rather than
# a Python-level lambda per row: lower-case first, then drop every character
# that is not a letter, a digit or whitespace. Unlike the lambda form, missing
# values pass through as NaN instead of raising AttributeError.
data['v2'] = (
    data['v2']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)
data.head()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
import string

# Lazily-populated cache so the lemmatizer and the stop-word set are built once
# per kernel session instead of once per processed message.
_TEXT_PROCESSING_RESOURCES = {}


def _text_processing_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _TEXT_PROCESSING_RESOURCES:
        _TEXT_PROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_PROCESSING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return (
        _TEXT_PROCESSING_RESOURCES['lemmatizer'],
        _TEXT_PROCESSING_RESOURCES['stop_words'],
    )


def _is_punctuation(word):
    """Return True when every character of ``word`` is in ``string.punctuation``."""
    return all(char in string.punctuation for char in word)


def text_processing(text):
    """Clean one message and return it as a single space-joined string.

    Steps, in order:
      1. tokenise with ``word_tokenize``;
      2. lemmatise each token (``lemmatize`` is called without a POS argument);
      3. drop tokens whose lower-cased form is an English stop word;
      4. POS-tag the remainder, run ``ne_chunk(binary=True)`` and drop every
         subtree it produces (the chunks it marks as named entities);
      5. drop tokens made up entirely of punctuation characters.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.

    Notes
    -----
    NOTE(review): named-entity chunking here runs after stop-word removal, and
    this notebook lower-cases the text before calling the function. Both likely
    reduce how many entities the chunker can recognise -- confirm whether the
    entity-removal step is doing anything useful on this data.
    """
    lemmatizer, stop_words = _text_processing_resources()

    # Tokenization
    tokens = word_tokenize(text)

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stop word removal
    filtered_tokens = [
        token for token in lemmatized_tokens if token.lower() not in stop_words
    ]

    # Removing named entities: after chunking, plain tokens stay as (word, tag)
    # tuples while recognised entities are wrapped in Tree nodes, which we skip.
    tagged_tokens = pos_tag(filtered_tokens)
    chunked = ne_chunk(tagged_tokens, binary=True)
    words = [node[0] for node in chunked if not isinstance(node, nltk.tree.Tree)]

    # Removing punctuation. The test is per character: the previous
    # ``word not in string.punctuation`` was a substring check, so
    # multi-character punctuation tokens such as "..." or "--" slipped through.
    return ' '.join(word for word in words if not _is_punctuation(word))

In [6]:
# Run every cleaned message through the NLP pipeline and store the result
# alongside the original text in a new column.
data['Processed_text'] = data['v2'].map(text_processing)
data.head()

Unnamed: 0,v1,v2,Processed_text
0,ham,go until jurong point crazy available only in ...,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say so early hor u c already then say,u dun say early hor u c already say
4,ham,nah i dont think he goes to usf he lives aroun...,nah dont think go usf life around though


In [7]:
# Plain Python list of processed messages, one string per document, which is
# the input the vectorizers are fitted on.
corpus = data['Processed_text'].tolist()
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s',
 'u dun say early hor u c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling 3 week word back id like fun still tb ok xxx std chgs send 150 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam ha set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'im gon na home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6days 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Single-word TF-IDF model with the vocabulary capped at 100 terms.
vec_tfid = TfidfVectorizer(max_features=100)

# fit_transform returns a sparse matrix; convert it to a dense array
# (one row per document, one column per vocabulary term).
X = vec_tfid.fit_transform(corpus)
X = X.toarray()

# Mapping of each kept term to its column index in X.
vec_tfid.vocabulary_



{'go': 20,
 'great': 24,
 'got': 23,
 'wat': 90,
 'ok': 58,
 'free': 16,
 'win': 94,
 'text': 77,
 'txt': 85,
 'say': 68,
 'already': 0,
 'dont': 14,
 'think': 81,
 'life': 38,
 'hey': 27,
 'week': 92,
 'back': 4,
 'like': 39,
 'still': 73,
 'send': 70,
 'ha': 25,
 'friend': 17,
 'prize': 63,
 'claim': 7,
 'call': 5,
 'mobile': 48,
 'co': 8,
 'im': 32,
 'na': 52,
 'home': 29,
 'want': 89,
 'ive': 33,
 'today': 83,
 'reply': 65,
 'right': 66,
 'take': 75,
 'time': 82,
 'message': 45,
 'oh': 57,
 'yes': 99,
 'make': 43,
 'thats': 79,
 'way': 91,
 'miss': 47,
 'ur': 86,
 'going': 21,
 'da': 10,
 'lor': 40,
 'meet': 44,
 'really': 64,
 'know': 34,
 'love': 41,
 'amp': 1,
 'ill': 31,
 'let': 37,
 'work': 95,
 'yeah': 97,
 'wa': 87,
 'tell': 76,
 'thanks': 78,
 'please': 61,
 'msg': 50,
 'see': 69,
 'pls': 62,
 'need': 53,
 'tomorrow': 84,
 'hope': 30,
 'well': 93,
 'ltgt': 42,
 'didnt': 13,
 'get': 18,
 'cant': 6,
 'ask': 2,
 'morning': 49,
 'happy': 26,
 'sorry': 72,
 'give': 19,
 'new': 5

In [16]:
# Using n-grams: ngram_range=(2, 2) restricts the features to two-word
# sequences, again capped at 100 terms.
# Note: this rebinds vec_tfid and X, replacing the single-word model above.
vec_tfid = TfidfVectorizer(max_features=100, ngram_range=(2, 2))

# Dense document-by-bigram TF-IDF matrix.
X = vec_tfid.fit_transform(corpus)
X = X.toarray()

# Mapping of each kept bigram to its column index in X.
vec_tfid.vocabulary_



{'free entry': 32,
 'dont think': 25,
 'claim call': 18,
 'claim code': 19,
 'free call': 31,
 'im gon': 51,
 'gon na': 36,
 'dont want': 26,
 'chance win': 17,
 'txt word': 89,
 'im going': 50,
 'let know': 57,
 'please call': 69,
 'call 08000930705': 9,
 'dont know': 24,
 'want go': 97,
 'like ltgt': 58,
 'im sorry': 53,
 'sorry ill': 82,
 'ill call': 49,
 'call later': 11,
 'ur awarded': 90,
 'im home': 52,
 'wan na': 95,
 'hi hi': 46,
 'got ta': 40,
 'call customer': 10,
 'customer service': 22,
 '1000 cash': 0,
 'trying contact': 86,
 'draw show': 28,
 'prize guaranteed': 75,
 'guaranteed call': 42,
 'valid 12hrs': 94,
 'selected receive': 78,
 'account statement': 6,
 'identifier code': 48,
 'urgent mobile': 93,
 'caller prize': 13,
 '350 award': 4,
 'wat time': 98,
 'ur mob': 92,
 'new year': 63,
 'send stop': 80,
 'nice day': 64,
 'txt nokia': 87,
 'good morning': 38,
 'ur friend': 91,
 'good night': 39,
 'get back': 34,
 'tried contact': 85,
 'network min': 62,
 'reply call': 