In [5]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK stopwords
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("Spam_SMS.csv", names=["label", "message"], sep=",", header=None)

# Convert labels to binary (ham: 0, spam: 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Drop any null values
df.dropna(inplace=True)

# Text Preprocessing
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

df['message'] = df['message'].apply(preprocess_text)

# Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to reduce overfitting
X = vectorizer.fit_transform(df['message'])
y = df['label']

# Train-Test Split (Stratified to handle class imbalance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train Random Forest Model
model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shantanubasumatary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.98
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       966
         1.0       1.00      0.83      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [12]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK stopwords
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("Spam_SMS.csv", names=["label", "message"], sep=",", header=None)

# Convert labels to binary (ham: 0, spam: 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Drop any null values
df.dropna(inplace=True)

# Text Preprocessing
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

df['message'] = df['message'].apply(preprocess_text)

# Feature Extraction (TF-IDF with improved n-grams)
vectorizer = TfidfVectorizer(max_features=7000, ngram_range=(1, 4))  # Capture more phrase structures
X = vectorizer.fit_transform(df['message'])
y = df['label']

# Train-Test Split (Stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Hyperparameter tuning for Random Forest
rf_params = {'n_estimators': [200, 300, 400], 'max_depth': [20, 25, 30]}
rf_grid = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42), rf_params, cv=3)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# Hyperparameter tuning for Naïve Bayes
nb_params = {'alpha': [0.01, 0.05, 0.1, 0.5, 1]}
nb_grid = GridSearchCV(MultinomialNB(), nb_params, cv=3)
nb_grid.fit(X_train, y_train)
best_nb = nb_grid.best_estimator_

# Stacking Model
base_models = [('rf', best_rf), ('nb', best_nb)]
meta_model = LogisticRegression(class_weight={0: 1, 1: 2})  # Give more weight to spam

stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, passthrough=True)
stacking_model.fit(X_train, y_train)

# Predictions
y_pred = stacking_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shantanubasumatary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.98
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       966
         1.0       0.98      0.90      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

