In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [31]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.pipeline import Pipeline
import pickle
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
df = pd.read_csv('/content/drive/MyDrive/ML/NLP/SMSSpamCollection.txt',sep='\t',names=["label","message"])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [14]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [15]:
lm = WordNetLemmatizer()
corpus = []
for i in range(0,len(df)):
  review = re.sub('[^a-zA-z]',' ',df["message"][i])
  review = review.lower()
  review = review.split()
  review = [lm.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [25]:
y = pd.get_dummies(df['label'])
y = y.iloc[:,0].values
y = list(map(int, y))

In [26]:
X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2,stratify=y)

In [27]:
# BOW
sms_pipeline_bow = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

cv = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(sms_pipeline_bow, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

sms_pipeline_bow.fit(X_train,y_train)
with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_bow.pkl','wb') as file:
  pickle.dump(sms_pipeline_bow,file)

Cross-validation Accuracy: 0.98 ± 0.00


In [28]:
with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_bow.pkl','rb') as file:
  pipeline_bow = pickle.load(file)

y_pred_bow = pipeline_bow.predict(X_test)

print(classification_report(y_test,y_pred_bow))
print(confusion_matrix(y_test,y_pred_bow))
print(accuracy_score(y_test,y_pred_bow))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       149
           1       0.99      0.99      0.99       966

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[140   9]
 [  6 960]]
0.9865470852017937


In [29]:
# TF-IDF

sms_pipeline_tfidf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

cv = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(sms_pipeline_tfidf, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

sms_pipeline_tfidf.fit(X_train,y_train)

with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_tfidf.pkl','wb') as file:
  pickle.dump(sms_pipeline_tfidf,file)

Cross-validation Accuracy: 0.96 ± 0.01


In [30]:
with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_tfidf.pkl','rb') as file:
  pipeline_tfidf = pickle.load(file)

y_pred_tfidf = pipeline_tfidf.predict(X_test)

print(classification_report(y_test,y_pred_tfidf))
print(confusion_matrix(y_test,y_pred_tfidf))
print(accuracy_score(y_test,y_pred_tfidf))

              precision    recall  f1-score   support

           0       1.00      0.81      0.89       149
           1       0.97      1.00      0.99       966

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

[[120  29]
 [  0 966]]
0.9739910313901345


In [32]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import pickle

# Download NLTK data (if not already installed)
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/ML/NLP/SMSSpamCollection.txt', sep='\t', names=["label", "message"])

# Cleaning and preprocessing
lm = WordNetLemmatizer()
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df["message"][i])
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Encoding labels
y = pd.get_dummies(df['label'])
y = y.iloc[:, 0].values
y = list(map(int, y))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, stratify=y)

# Train Word2Vec model
w2v_model = Word2Vec(sentences=[doc.split() for doc in X_train], vector_size=100, window=5, min_count=1, workers=4)

# Custom transformer for Word2Vec
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.compute_avg_w2v(X)

    def compute_avg_w2v(self, tokens):
        avg_vector = []
        for review in tokens:
            vectors = [self.model.wv[word] for word in review.split() if word in self.model.wv]
            if vectors:
                avg_vector.append(sum(vectors) / len(vectors))
            else:
                avg_vector.append([0] * self.vector_size)
        return avg_vector

# Create pipeline
pipeline = Pipeline([
    ('w2v_transform', Word2VecTransformer(w2v_model)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Save the model as a pickle file
with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_w2v.pkl','wb') as file:
  pickle.dump(pipeline,file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
with open('/content/drive/MyDrive/ML/NLP/sms_pipeline_w2v.pkl','rb') as file:
  pipeline_w2v = pickle.load(file)

y_pred_w2v = pipeline_w2v.predict(X_test)

print(classification_report(y_test,y_pred_w2v))
print(confusion_matrix(y_test,y_pred_w2v))
print(accuracy_score(y_test,y_pred_w2v))

              precision    recall  f1-score   support

           0       0.95      0.60      0.73       149
           1       0.94      0.99      0.97       966

    accuracy                           0.94      1115
   macro avg       0.94      0.80      0.85      1115
weighted avg       0.94      0.94      0.94      1115

[[ 89  60]
 [  5 961]]
0.9417040358744395


# - **BOW : 98%**
# - **TF-IDF: 97%**
# - **Word2Vec : 94%**