In [1]:
import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.util import ngrams
from nltk import bigrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df=pd.read_csv('spam.csv',encoding='latin1')

In [4]:
df=df.iloc[:,:2]
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Preprocessing

In [5]:
duplicates_v2=df['v2'].duplicated()
print(duplicates_v2.sum())

403


In [6]:
#removing duplicates
df_new=df.drop_duplicates(subset="v2")
df_new

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
#converting to lowercase
df_new['v2']=df['v2'].str.lower()
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2']=df['v2'].str.lower()


Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ì_ b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [8]:
#removing punctuation marks
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

In [9]:
df_new['v2']=df_new['v2'].apply(remove_punctuation)
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2']=df_new['v2'].apply(remove_punctuation)


Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ì b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [10]:
#remove numbers, html, urls, emails
def clean_text(text):
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

In [11]:
df_new['v2'] = df_new['v2'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2'] = df_new['v2'].apply(clean_text)


In [12]:
#tokenizing the text
def tokenize_text(text):
    return word_tokenize(text)

In [13]:
df_new['v2']=df_new['v2'].apply(tokenize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2']=df_new['v2'].apply(tokenize_text)


In [14]:
df_new

Unnamed: 0,v1,v2
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, dont, think, he, goes, to, usf, he, l..."
...,...,...
5567,spam,"[this, is, the, nd, time, we, have, tried, con..."
5568,ham,"[will, ì, b, going, to, esplanade, fr, home]"
5569,ham,"[pity, was, in, mood, for, that, soany, other,..."
5570,ham,"[the, guy, did, some, bitching, but, i, acted,..."


In [15]:
#Removing stop words
stop_words=set(stopwords.words('english'))

In [16]:
def remove_stopwords(word_list):
    return [word for word in word_list if word not in stop_words]

In [17]:
df_new['v2']=df_new['v2'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2']=df_new['v2'].apply(remove_stopwords)


In [18]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [19]:
df_new['v2'] = df_new['v2'].apply(lemmatize_tokens)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2'] = df_new['v2'].apply(lemmatize_tokens)


In [20]:
df_new

Unnamed: 0,v1,v2
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, go, usf, life, around, though]"
...,...,...
5567,spam,"[nd, time, tried, contact, u, u, å£, pound, pr..."
5568,ham,"[ì, b, going, esplanade, fr, home]"
5569,ham,"[pity, mood, soany, suggestion]"
5570,ham,"[guy, bitching, acted, like, id, interested, b..."


# Ngrams-1,2,3

In [21]:
filtered_ham = df_new[df_new['v1'] == 'ham']['v2'].explode().tolist()
filtered_spam = df_new[df_new['v1'] == 'spam']['v2'].explode().tolist()

In [22]:
def generate_ngrams(words,n):
    return list(ngrams(words,n))

In [23]:
ngram_ham=generate_ngrams(filtered_ham,1)
ngram_spam=generate_ngrams(filtered_spam,1)
ngram_ham2=generate_ngrams(filtered_ham,2)
ngram_spam2=generate_ngrams(filtered_spam,2)
ngram_ham3=generate_ngrams(filtered_ham,3)
ngram_spam3=generate_ngrams(filtered_spam,3)

In [24]:
ngram_counts_ham=Counter(ngram_ham)
ngram_counts_spam=Counter(ngram_spam)
ngram_counts_ham2=Counter(ngram_ham2)
ngram_counts_spam2=Counter(ngram_spam2)
ngram_counts_ham3=Counter(ngram_ham3)
ngram_counts_spam3=Counter(ngram_spam3)

In [25]:
top_10_ham=ngram_counts_ham.most_common(10)
print(top_10_ham)

[(('u',), 941), (('im',), 436), (('get',), 308), (('go',), 267), (('ltgt',), 254), (('ok',), 246), (('dont',), 242), (('got',), 238), (('know',), 229), (('come',), 223)]


In [26]:
top_10_spam=ngram_counts_spam.most_common(10)
print(top_10_spam)

[(('call',), 316), (('å£',), 221), (('free',), 190), (('u',), 137), (('txt',), 126), (('text',), 121), (('ur',), 119), (('mobile',), 115), (('stop',), 106), (('claim',), 98)]


In [27]:
top_10_ham2=ngram_counts_ham2.most_common(10)
print("top 10 bigrams in ham:",top_10_ham2)

top 10 bigrams in ham: [(('gon', 'na'), 58), (('let', 'know'), 41), (('r', 'u'), 35), (('u', 'r'), 31), (('dont', 'know'), 30), (('wan', 'na'), 28), (('new', 'year'), 28), (('u', 'wan'), 28), (('take', 'care'), 26), (('im', 'going'), 24)]


In [28]:
top_10_ham3=ngram_counts_ham3.most_common(10)
print("top 10 trigrams in ham: ",top_10_ham3)

top 10 trigrams in ham:  [(('im', 'gon', 'na'), 20), (('happy', 'new', 'year'), 17), (('ill', 'call', 'later'), 11), (('hi', 'hi', 'hi'), 11), (('sorry', 'ill', 'call'), 8), (('wat', 'time', 'u'), 8), (('x', 'x', 'x'), 8), (('kiss', 'across', 'sea'), 6), (('u', 'wan', 'go'), 6), (('u', 'dun', 'wan'), 6)]


In [29]:
top_10_spam2=ngram_counts_spam2.most_common(10)
print("top 10 bigrams in spam:",top_10_spam2)

top 10 bigrams in spam: [(('å£', 'prize'), 41), (('please', 'call'), 41), (('å£', 'cash'), 36), (('po', 'box'), 23), (('customer', 'service'), 22), (('contact', 'u'), 22), (('call', 'landline'), 22), (('p', 'per'), 22), (('guaranteed', 'call'), 21), (('prize', 'guaranteed'), 20)]


In [30]:
top_10_spam3=ngram_counts_spam3.most_common(10)
print("top 10 trigrams in spam: ",top_10_spam3)

top 10 trigrams in spam:  [(('å£', 'prize', 'guaranteed'), 19), (('prize', 'guaranteed', 'call'), 19), (('call', 'land', 'line'), 16), (('private', 'account', 'statement'), 15), (('call', 'identifier', 'code'), 14), (('draw', 'show', 'å£'), 13), (('identifier', 'code', 'expires'), 13), (('guaranteed', 'call', 'land'), 13), (('show', 'å£', 'prize'), 12), (('account', 'statement', 'show'), 12)]


# Log Reg with Bag of words vectoriser

In [31]:
vectorizer=CountVectorizer()

In [32]:
df_new['v2_text'] = df_new['v2'].apply(lambda tokens: ' '.join(tokens))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['v2_text'] = df_new['v2'].apply(lambda tokens: ' '.join(tokens))


In [33]:
X=vectorizer.fit_transform(df_new['v2_text'])

In [34]:
bow_df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,ìï,ìïll,ûthanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
y = df_new['v1'].map({'ham': 0, 'spam': 1})

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
model=LogisticRegression()

In [38]:
model.fit(X_train,y_train)

LogisticRegression()

In [39]:
y_pred=model.predict(X_test)

In [40]:
accuracy=accuracy_score(y_test,y_pred)

In [41]:
print(accuracy)

0.9796905222437138


In [42]:
f1_=f1_score(y_test,y_pred)
print(f1_)

0.9225092250922509


# Log Reg with Tf-IDF vectoriser

In [43]:
vectorizer_tf = TfidfVectorizer()

In [44]:
X_tf = vectorizer_tf.fit_transform(df_new['v2_text'])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_tf, y, test_size=0.2, random_state=42)

In [46]:
model = LogisticRegression()

In [47]:
model.fit(X_train, y_train)

LogisticRegression()

In [48]:
y_pred = model.predict(X_test)

In [49]:
f1 = f1_score(y_test, y_pred)
print(f1)

0.7642276422764228
