In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spam-collection/SMSSpamCollection


In [2]:
# Importing necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK stopwords
nltk.download('stopwords')

# Load the dataset
messages = pd.read_csv('/kaggle/input/spam-collection/SMSSpamCollection', sep='\t', names=["label", "message"])

# Data preprocessing
ps = PorterStemmer()
corpus = []

for i in range(0, len(messages)):
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    # Stemming and removing stopwords
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

# Encoding the labels
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values  # Use [0] for 'ham' and [1] for 'spam'

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Model training using Multinomial Naive Bayes
spam_detect_model = MultinomialNB()

# Hyperparameter tuning using GridSearchCV
parameters = {'alpha': [0.5, 1.0, 1.5, 2.0]}  # Example hyperparameters
grid_search = GridSearchCV(estimator=spam_detect_model, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy: {:.2f}%".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

# Predicting test results with the best model
y_pred = grid_search.predict(X_test)

# Evaluating the model
score = accuracy_score(y_test, y_pred)
print("Accuracy Score:", score)

print(classification_report(y_test, y_pred))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Best Accuracy: 98.23%
Best Parameters: {'alpha': 1.0}
Accuracy Score: 0.9856502242152466
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       955
        True       0.94      0.96      0.95       160

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



#### Average word 2 vec

In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import spacy
from gensim.utils import simple_preprocess
from tqdm import tqdm
import gensim

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Load the dataset
messages = pd.read_csv('/kaggle/input/spam-collection/SMSSpamCollection', sep='\t', names=["label", "message"])

# Preprocess and create a corpus using spaCy for lemmatization
corpus = []
for i in tqdm(range(0, len(messages))):
    # Remove non-alphabetic characters and lowercase the text
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i]).lower()
    
    # Lemmatize words using spaCy, excluding stopwords
    doc = nlp(review)
    review = [token.lemma_ for token in doc if not token.is_stop]
    review = ' '.join(review)
    corpus.append(review)

# Tokenize sentences and preprocess using Gensim's simple_preprocess
words = [simple_preprocess(sent) for sent in corpus]

# Train a Word2Vec model
model = gensim.models.Word2Vec(sentences=words, vector_size=100, window=5, min_count=2, workers=4)

# Explore the model
print("Vocabulary size:", len(model.wv.index_to_key))
print("Most similar words to 'kid':", model.wv.similar_by_word('kid'))
print("Vector representation shape for 'kid':", model.wv['kid'].shape)

# Define a function to compute average Word2Vec for each document
def avg_word2vec(doc):
    # Remove out-of-vocabulary words and compute the mean vector
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

# Apply the avg_word2vec function to all sentences
X = [avg_word2vec(doc) for doc in tqdm(words)]

# Convert the list of vectors into a NumPy array for further processing
X_new = np.array(X, dtype=object)

# Check the shape of the resulting array
print("Shape of processed data:", X_new.shape)

100%|██████████| 5572/5572 [01:03<00:00, 88.03it/s]


Vocabulary size: 3160
Most similar words to 'kid': [('get', 0.996954619884491), ('buy', 0.9969309568405151), ('want', 0.9969201683998108), ('stay', 0.9969180226325989), ('say', 0.9969155788421631), ('special', 0.9968787431716919), ('yes', 0.9968639612197876), ('way', 0.9968328475952148), ('come', 0.9968308210372925), ('home', 0.9968228340148926)]
Vector representation shape for 'kid': (100,)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5572/5572 [00:01<00:00, 5493.90it/s]

Shape of processed data: (5572,)



