---

> **[Dataset Link](https://archive.ics.uci.edu/dataset/228/sms+spam+collection)**

---





In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['label', 'message'])

In [3]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
!pip install gensim



In [4]:

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Preprocesing

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [7]:
def convert_to_corpus(text:str):
  corpus = []
  text = text.lower()
  text = re.sub(r'(?:https?|ftp|ssh)://\S+', 'url', text)
  text = re.sub(r'<.*?>', ' ', text)
  text = re.sub('[^a-zA-Z]', ' ', text)
  words = text.split()
  words = [lemmatizer.lemmatize(word=word, pos='v') for word in words if word not in stop_words]
  return ' '.join(words)

In [8]:
data['message'] = data['message'].apply(convert_to_corpus)

In [9]:
data.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joke wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf live around though


In [10]:
# Word to Vector

In [11]:
# Let's Test On BOG

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [14]:
data['label'] = data['label'].map({'ham':0, 'spam':1})

In [16]:
data['label'].value_counts() #So Data Set Is Imbalance

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4825
1,747


In [17]:
X, y = data['message'], data['label']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(binary=True, ngram_range=(1,2))),
     ('rf', RandomForestClassifier())
     ])
grid_bow = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30]
    }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, grid_bow, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)
model_1 = grid_search.best_estimator_
y_pred = model_1.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

{'rf__max_depth': None, 'rf__n_estimators': 300}
0.9674668961664479
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[[966   0]
 [ 33 116]]


In [None]:
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(binary=True)),
     ('rf', BernoulliNB())
     ])
grid_bow = {
    'rf__alpha': [0.1, 0.5, 1.0],
    'cv__ngram_range': [(1,1), (1,2), (2,3)],
    'cv__max_features': [1000, 5000, 10000]
    }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, grid_bow, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)
model_2 = grid_search.best_estimator_
y_pred = model_2.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

{'cv__max_features': 5000, 'cv__ngram_range': (1, 1), 'rf__alpha': 0.1}
0.9867619896020493
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.91      0.94       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[964   2]
 [ 14 135]]


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

In [None]:
pipeline = imbpipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
     ('rf', RandomForestClassifier(n_estimators=300))
     ])
grid_bow = {
    'tfidf__ngram_range': [(1,1), (1,2), (2,3)],
    'tfidf__max_features': [1000, 5000, 10000]
    }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, grid_bow, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)
model_3 = grid_search.best_estimator_
y_pred = model_3.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

{'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
0.9786836979662091
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[966   0]
 [ 20 129]]


In [None]:
import joblib

joblib.dump(model_3, 'spam_classifier.pkl')

['spam_classifier.pkl']

In [29]:
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(binary=True, max_features=5000, ngram_range=(1,1))),
     ('rf', BernoulliNB(alpha=0.1))
     ])
pipeline.fit(X_train, y_train)

In [30]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

[[964   2]
 [ 10 139]]


In [26]:
from sklearn.model_selection import cross_validate

metrics = cross_validate(pipeline, X_train, y_train, cv=5,
                         scoring=['accuracy', 'precision', 'recall', 'f1'],
                         return_train_score=False)

print("Mean recall Score:", metrics['test_recall'].mean())

print("Mean recall Score:", metrics['test_precision'].mean())

Mean recall Score: 0.902983193277311
Mean recall Score: 0.9856614030298241


In [31]:
import joblib

joblib.dump(pipeline, 'spam_classifier.pkl')

['spam_classifier.pkl']