In [1]:
!pip install pandas scikit-learn nltk

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/17/1c/ccdd103cfcc9435a18819856fbbe0c20b8fa60bfc3343580de4be13f0668/scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting nltk
  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl.metadata
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.23.2 (from pandas)
  Obtaining dependency information for numpy>=1.23.2 from https://files.pythonho


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

In [3]:
# Load the raw spam dataset from the sibling `dataset` directory, decoding the
# file as ISO-8859-1.
spam_csv_path = './../dataset/spam.csv'
df = pd.read_csv(spam_csv_path, encoding='ISO-8859-1')

In [4]:
# Preview the first rows of the raw frame: `v1` holds the label and `v2` the
# message text; the `Unnamed: *` columns are empty in these rows.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [6]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, reused for every token when the corpus is built.
ps = PorterStemmer()

In [7]:
# Turn each raw message in `v2` into a list of lowercase alphanumeric tokens.
# Every character outside [a-zA-Z0-9] is treated as a separator.
_NON_ALNUM = re.compile('[^a-zA-Z0-9]')


def _tokenize(message):
    """Return the lowercase alphanumeric tokens of ``message``.

    Parameters
    ----------
    message : str or list
        A raw message, or a token list produced by an earlier run of this cell.

    Returns
    -------
    list
        The message split into lowercase tokens. A list input is returned
        unchanged.
    """
    # This cell overwrites `df['v2']` in place. Passing already-tokenised
    # values through keeps the cell safe to re-run; otherwise a second run
    # would hand a list to the regex and raise TypeError.
    if isinstance(message, list):
        return message
    return _NON_ALNUM.sub(' ', message).lower().split()


df['v2'] = df['v2'].map(_tokenize)

In [8]:
# Preview the frame again: `v2` now holds lists of lowercase tokens instead of raw text.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"[go, until, jurong, point, crazy, available, o...",,,
1,ham,"[ok, lar, joking, wif, u, oni]",,,
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",,,
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",,,
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",,,


In [10]:
# Build the cleaned corpus: drop English stop words, stem the remaining tokens
# and re-join them into one space-separated string per message.

# Load the stop-word list once. The original expression rebuilt this set for
# every single token, which re-reads the corpus thousands of times.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stop-word list is an NLTK data package that is downloaded separately
    # from the `nltk` wheel; fetch it when it is not installed yet so the
    # notebook also runs on a fresh environment.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


def _stem_without_stopwords(tokens):
    """Return the stemmed, stop-word-free tokens of one message as a string.

    Parameters
    ----------
    tokens : list of str
        Lowercase tokens of a single message.

    Returns
    -------
    str
        Porter stems of the tokens that are not English stop words, joined by
        single spaces. Empty when every token is a stop word.
    """
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


corpus = df['v2'].apply(_stem_without_stopwords)

In [11]:
# Display the cleaned corpus (one stemmed, stop-word-free string per message).
corpus

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u 750 pound prize 2 c...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: v2, Length: 5572, dtype: object

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser created with no arguments, i.e. with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [13]:
# Learn the vocabulary and IDF weights from the cleaned corpus, then convert
# the resulting document-term matrix into a dense NumPy array.
# NOTE(review): the vectoriser is fit on every message before the train/test
# split further down, so the held-out messages also contribute to the
# vocabulary and IDF weights.
doc_term_matrix = tfidf.fit_transform(corpus.values)
X_data = doc_term_matrix.toarray()

In [14]:
# Display the dense TF-IDF feature matrix.
X_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# (number of messages, number of vocabulary terms) of the feature matrix.
X_data.shape

(5572, 7163)

In [16]:
# Target labels: the `v1` column of the frame ('ham' / 'spam' in the previewed rows).
y_data = df['v1']

In [17]:
# Number of labels; its length should equal the number of rows in `X_data`.
y_data.shape

(5572,)

In [18]:
# Preview the first few labels.
y_data.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation.
# `random_state` fixes the shuffle so the split, and therefore the reported
# accuracy, is the same on every run. `stratify` keeps the ham/spam proportion
# identical in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Multinomial naive Bayes classifier. `alpha` is the additive smoothing
# parameter and `fit_prior=True` makes the model learn class priors from the
# training labels; both values are spelled out explicitly here.
model = MultinomialNB(alpha = 1.0, fit_prior = True)

In [24]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

In [25]:
# Predict a label for every message in the held-out test split.
y_pred = model.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score
# Fraction of test messages whose predicted label matches the true label.
# NOTE(review): accuracy alone can hide poor spam detection if the classes are
# imbalanced; consider also reporting per-class precision and recall.
accuracy_score(y_test, y_pred)

0.9695067264573991

In [27]:
from joblib import dump

In [29]:
# Persist the fitted TF-IDF vectoriser so new text can be vectorised with the
# same vocabulary and weights.
# NOTE(review): only the vectoriser is saved. The tokenising, stop-word removal
# and stemming applied to the messages earlier in this notebook are not part
# of the artifact, so code that loads it must repeat those steps on new text.
dump(tfidf, './../savedModels/tfidf.joblib')

['./../savedModels/tfidf.joblib']

In [30]:
# Persist the trained classifier in the same directory as the vectoriser; the
# two artifacts are meant to be loaded and used together.
dump(model, './../savedModels/model.joblib')

['./../savedModels/model.joblib']