In [27]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [10]:
import numpy as np

In [5]:
import pandas as pd

In [6]:
df=pd.read_csv(r'C:\Users\ADMIN\Downloads\spam.csv',encoding='latin-1')
df=df[['v1','v2']]
df.columns=['category','text']

In [7]:
df['category'] = df['category'].map({'ham': 0, 'spam': 1})


In [6]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()             # Convert to lowercase
    text = text.split()             # Tokenize
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    return ' '.join(text)

df['processed_text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings(r"D:\glove.6B.50d.txt")  # Path to GloVe file

# Step 3: Compute Text Embeddings
def get_average_embedding(text, embeddings, dimension=50):
    tokens = text.split()
    valid_vectors = [embeddings[word] for word in tokens if word in embeddings]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)  # Average of word vectors
    else:
        return np.zeros(dimension)

embedding_features = np.array([get_average_embedding(t, glove_embeddings) for t in df['processed_text']])

def extract_additional_features(text):
    # Feature: Length of the message
    length = len(text)
    # Feature: Count of uppercase words
    uppercase_count = sum(1 for word in text.split() if word.isupper())
    # Feature: Presence of monetary terms
    has_money = int(bool(re.search(r'\$\d+', text)))
    has_urgent = int('urgent' in text.lower())
    return [length, uppercase_count, has_money, has_urgent]

additional_features = np.array([extract_additional_features(t) for t in df['text']])

# Combine embeddings and Additional Features
X_combined = np.hstack([embedding_features, additional_features])

#Feature shapes
print("Additional Features Shape:", additional_features.shape)
print("Combined Features Shape:", X_combined.shape)

Additional Features Shape: (5571, 4)
Combined Features Shape: (5571, 54)


In [9]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [15]:
from sklearn.model_selection import train_test_split
y=df['category']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9372197309417041
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96       959
           1       0.79      0.75      0.77       156

    accuracy                           0.94      1115
   macro avg       0.88      0.86      0.87      1115
weighted avg       0.94      0.94      0.94      1115



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
def predict_message_spam_status(message, model, embeddings, stop_words, stemmer, dimension=50):
    """
    Predict if a given message is spam or not.
    
    Parameters:
        message (str): The input message to classify.
        model (object): The trained Logistic Regression model.
        embeddings (dict): The GloVe word embeddings.
        stop_words (set): Set of stopwords to exclude during preprocessing.
        stemmer (PorterStemmer): Stemmer for preprocessing.
        dimension (int): Dimensionality of the GloVe vectors.
    
    Returns:
        str: "Spam" or "Not Spam"
    """
    # Preprocess the message
    processed_message = preprocess_text(message)
    
    # Compute the average embedding
    embedding_vector = get_average_embedding(processed_message, embeddings, dimension)
    
    # Extract additional features
    additional_features_vector = extract_additional_features(message)
    
    # Combine embeddings and additional features
    combined_features = np.hstack([embedding_vector, additional_features_vector])
    
    # Predict using the model
    prediction = model.predict([combined_features])[0]
    return "Spam" if prediction == 1 else "Not Spam"

# Example usage
input_message = input("Enter a message to classify as Spam or Not Spam: ")
result = predict_message_spam_status(input_message, model, glove_embeddings, stop_words, stemmer)
print(f"The message is classified as: {result}")


Enter a message to classify as Spam or Not Spam:   Thank you for paying last month’s bill. We’re rewarding our very best customers with a gift for their loyalty. Click here! [Link]


The message is classified as: Spam
