In [1]:
import pickle

import numpy as np
import pandas as pd
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


## Data Loading

In [2]:
# Load the labelled ham/spam messages; the path is relative to the notebook's working directory.
data = pd.read_csv("DataSet/spam.csv")

In [3]:
# Preview the first rows to confirm the two columns: Category (label) and Message (text).
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## EDA 

In [4]:
# Column dtypes and non-null counts (checks for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [23]:
# (number of rows, number of columns)
data.shape

(5572, 2)

In [5]:
# Class balance: number of messages per Category label.
data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
# Load the spaCy `en_core_web_sm` pipeline; later cells use it to tokenize messages and read each token's lemma.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Stop-word set used to filter lemmas when building the corpus.
# NOTE: this is the same set object as `nlp.Defaults.stop_words`, so the
# edit below mutates the pipeline's shared defaults, not a private copy.
stop_words = nlp.Defaults.stop_words

# Keep "do" as a regular token instead of a stop word.
# discard() rather than remove(): re-running this cell must not raise
# KeyError once 'do' has already been taken out of the set.
stop_words.discard('do')
nlp.vocab['do'].is_stop = False

In [8]:
# Build the cleaned corpus: one string per message, made of the lemmas of
# every token whose lemma is not in `stop_words`, joined by single spaces.
# `i`, `doc`, `words` and `new_text` intentionally stay module-level: later
# cells inspect/reuse the values left over from the final iteration.
corpus = []
for i in range(len(data)):
    doc = nlp(data['Message'][i])
    words = [token.lemma_ for token in doc if token.lemma_ not in stop_words]
    new_text = " ".join(words)
    corpus.append(new_text)


In [9]:
# Cleaned text of the last message processed by the corpus loop (value left over from its final iteration).
new_text

'rofl . true'

In [10]:
# Inspect the cleaned corpus.
# NOTE(review): this displays the whole list (one entry per message); consider a slice such as corpus[:5].
corpus

['jurong point , crazy .. available bugis n great world la e buffet ... Cine amore wat ...',
 'ok lar ... joke wif u oni ...',
 'free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . text fa 87121 receive entry question(std txt rate)T&C apply 08452810075over18',
 'u dun early hor ... u c ...',
 'Nah I do think usf , live',
 'FreeMsg hey darle 3 week word ! I like fun ? tb ok ! XxX std chgs send , £ 1.50 rcv',
 'brother like speak I . treat I like aids patent .',
 "request ' Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . press * 9 copy friend Callertune",
 'WINNER ! ! value network customer select receivea £ 900 prize reward ! claim 09061701461 . claim code KL341 . valid 12 hour .',
 'mobile 11 month ? U R entitle update late colour mobile camera free ! Mobile Update Co free 08002986030',
 'I home soon I do want talk stuff anymore tonight , k ? I cry today .',
 'chance win cash ! 100 20,000 pound txt > CSH11 send 87575 . Cost 150p / day , 6days , 16

In [11]:
# Raw text of the first message, for comparison with its cleaned form in corpus[0].
data["Message"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [12]:
# spaCy Doc of the last message processed by the corpus loop (value left over from its final iteration).
doc

Rofl. Its true to its name

In [13]:
# Show the full stop-word set as it stands after the 'do' adjustment above.
print(stop_words)

{'from', 'anyway', 'must', 'amongst', 'sixty', 'always', 'done', 'because', 'ten', 'keep', 'neither', 'third', '‘s', 'that', 'when', 'across', 'ca', 'under', 'be', 'became', 'not', 'where', 'been', 'cannot', 'amount', 'n’t', 'serious', 'among', 'something', 'for', 'of', 'one', 'hereupon', 'my', 'until', 'quite', 'wherein', 'around', 'never', 'through', 'using', 'but', 'nowhere', 'show', 'as', 'than', 'at', "'ll", 'yours', 'throughout', 're', 'had', 'give', 'anywhere', 'moreover', 'being', 'his', 'within', '‘ll', 'sometime', 'just', 'mine', 'enough', 'upon', 'nobody', 'afterwards', 'two', 'former', 'again', 'am', 'would', 'become', 'by', 'or', 'name', 'meanwhile', 'both', 'though', 'another', 'an', 'here', 'she', 'six', "'s", 'up', 'thence', 'either', 'move', 'whenever', 'each', 'now', '’ve', "'ve", 'otherwise', '’re', 'eleven', 'hers', 'same', 'already', 'used', 'have', 'top', 'elsewhere', 'hence', 'you', 'whose', 'doing', 'twenty', 'nine', 'thru', 'therefore', 'those', 'made', 'alone'

## Feature Engineering

In [14]:
# Learn the vocabulary and IDF weights from the cleaned corpus and encode
# every message as a TF-IDF row; the result is a sparse matrix with one row
# per message and one column per vocabulary term.
tf = TfidfVectorizer()
output = tf.fit_transform(corpus)
output

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 41964 stored elements and shape (5572, 7610)>

In [15]:
# Convert the sparse TF-IDF matrix to a dense NumPy array (the DataFrame
# inspection below indexes it by column number).
# Guarded so the cell is idempotent: once `output` is already an ndarray it
# has no `.toarray()`, and an unguarded re-run would raise AttributeError.
if hasattr(output, "toarray"):
    output = output.toarray()

In [16]:
# Peek at 50 vocabulary terms (feature indices 2000-2049) learned by the vectorizer.
print(tf.get_feature_names_out()[2000:2050])

['completely' 'complexity' 'compliment' 'complimentary' 'comprehensive'
 'compromise' 'compulsory' 'computational' 'computer' 'computerless'
 'comuk' 'con' 'conacte' 'concentrate' 'concentration' 'concern'
 'concerned' 'concert' 'conclusion' 'condition' 'conduct' 'conected'
 'conference' 'confidence' 'configure' 'confirm' 'confirmd' 'conform'
 'confuse' 'confused' 'congrat' 'congrats' 'congratulation' 'connect'
 'connection' 'consensus' 'consent' 'conserve' 'consider' 'consistently'
 'console' 'constant' 'constantly' 'contact' 'contain' 'content'
 'contention' 'continent' 'continue' 'continued']


In [17]:
# Wrap the dense TF-IDF array in a DataFrame (columns are feature indices)
# and show only the messages with a non-zero weight in column 2003.
# In the recorded run, index 2003 is the term 'complimentary' (fourth entry
# of the feature-name slice that starts at index 2000).
temp = pd.DataFrame(output)
temp.loc[temp[2003].ne(0)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7600,7601,7602,7603,7604,7605,7606,7607,7608,7609
67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
844,0.0,0.207236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.215372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3382,0.0,0.251535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4183,0.0,0.210187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Print the original text of message 844, one of the rows returned by the column-2003 filter above.
print(data['Message'][844])

Urgent! call 09066350750 from your landline. Your complimentary 4* Ibiza Holiday or 10,000 cash await collection SAE T&Cs PO BOX 434 SK3 8WP 150 ppm 18+


## Data Splitting and Model Training

In [19]:
# Hold out 30% of the messages for testing; the fixed random_state keeps the
# split reproducible. `train_test_split` is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    output,
    data['Category'],
    test_size=0.3,
    random_state=45,
)

In [20]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# report accuracy on both the training and the held-out split.
model = MultinomialNB()
model.fit(X_train, y_train)

# accuracy_score's signature is (y_true, y_pred): pass the labels first.
# Accuracy itself is symmetric, but keeping the documented order avoids
# wrong results if an asymmetric metric is swapped in here.
print('training acc', accuracy_score(y_train, model.predict(X_train)))
print('testing acc', accuracy_score(y_test, model.predict(X_test)))

training acc 0.978974358974359
testing acc 0.9623205741626795


In [21]:
# Classify one unseen message using the same preprocessing as the training
# corpus: spaCy lemmas, stop words removed, joined with single spaces.
mag = 'Dear Customer,When thoughtful leadership sparks meaningful conversations — we listen.Your overwhelming response to our earlier Leadership Talks told us one thing loud and clear:you want to hear more about the ideas, intent, and innovation shaping the future of banking.'
doc = nlp(mag)

# Build the token list from scratch so that nothing left in `words` by an
# earlier cell (or by a previous run of this cell) leaks into this message.
words = [token.lemma_ for token in doc if token.lemma_ not in stop_words]

new_text = ' '.join(words)
# transform (not fit_transform): reuse the vocabulary/IDF learned on the corpus.
input_data = tf.transform([new_text])
model.predict(input_data)

array(['ham'], dtype='<U4')

## Save Model

In [24]:
# Persist the trained classifier and the fitted vectorizer; both are needed
# at inference time (the vectorizer holds the vocabulary and IDF weights).
# Context managers make sure each file is flushed and closed right after the
# dump instead of leaving the handle open until garbage collection.
with open("spam_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tf, vectorizer_file)


## Create Streamlit App