In [75]:
import os
import shutil
import urllib.request
import zipfile
from pathlib import Path

# Download URL of the zipped SMS Spam Collection dataset
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local filename the downloaded archive is written to
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into
extract_to = "sms_spam_collection"
# Final location of the data file once it has been renamed with a .tsv extension
new_file_path = Path(extract_to) / "SMSSpamCollection.tsv"

def download_and_unzip(url, zip_path, extract_to, new_file_path):
    """Download a zip archive, extract it, and rename the data file to ``.tsv``.

    The function is a no-op when ``new_file_path`` already exists, so the cell
    can be re-run without hitting the network again.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extract_to: Directory the archive contents are extracted into.
        new_file_path: Final path (``str`` or ``Path``) of the data file. The
            extracted member named ``SMSSpamCollection`` inside ``extract_to``
            is renamed to this path.

    Returns:
        None. Progress is reported via ``print``.

    Errors raised by the download, the extraction, or the rename are not
    caught here and propagate to the caller.
    """
    # Accept plain strings as well as Path objects, like the other path arguments
    new_file_path = Path(new_file_path)

    # Check if the target file already exists
    if new_file_path.exists():
        print(f"{new_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file: stream it to disk in chunks instead of holding
    # the whole response body in memory
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)

    # Renaming the file to indicate its format
    original_file = Path(extract_to) / "SMSSpamCollection"
    os.rename(original_file, new_file_path)
    print(f"File downloaded and saved as {new_file_path}")

# Execute the function: fetch and extract the dataset unless the .tsv file is already present
download_and_unzip(url, zip_path, extract_to, new_file_path)

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [76]:
import pandas as pd

# Read the header-less, tab-separated file into two named columns, then
# shuffle the rows with a fixed seed and renumber the index from 0.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if any
# message contains an unbalanced quote, adjacent lines may be merged into one row —
# confirm the row count against the raw file.
df = (
    pd.read_csv(new_file_path, sep="\t", header=None, names=["Label", "Text"])
    .sample(frac=1, random_state=123)  # Shuffle the DataFrame
    .reset_index(drop=True)
)
df

Unnamed: 0,Label,Text
0,ham,Aight text me when you're back at mu and I'll ...
1,ham,Our Prashanthettan's mother passed away last n...
2,ham,No it will reach by 9 only. She telling she wi...
3,ham,Do you know when the result.
4,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
...,...,...
5567,ham,I accidentally brought em home in the box
5568,spam,Moby Pub Quiz.Win a £100 High Street prize if ...
5569,ham,Que pases un buen tiempo or something like that
5570,ham,Nowadays people are notixiquating the laxinorf...


In [77]:
# Class distribution of the full dataset (number of messages per label)
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [78]:
# Select the 'spam' rows once and count them
spam_df = df[df["Label"] == "spam"]
n_spam = spam_df.shape[0]

# Randomly sample 'ham' instances to match the number of 'spam' instances.
# A fixed seed keeps the chosen subset (and every metric computed from it)
# identical across kernel restarts.
ham_sampled = df[df["Label"] == "ham"].sample(n_spam, random_state=123)

# Combine the sampled 'ham' with all 'spam'
balanced_df = pd.concat([ham_sampled, spam_df])

# Shuffle the DataFrame (seeded for reproducibility) and renumber the index
balanced_df = balanced_df.sample(frac=1, random_state=123).reset_index(drop=True)

# Now balanced_df is the balanced DataFrame
print(balanced_df["Label"].value_counts())

Label
spam    747
ham     747
Name: count, dtype: int64


In [79]:
# Encode the labels of `df` as integers (ham -> 0, spam -> 1).
# The identity entries for 0 and 1 make the cell safe to re-run: without them a
# second execution would map the already-encoded labels to NaN.
# Note: balanced_df was built before this cell and keeps its "ham"/"spam" strings.
df["Label"] = df["Label"].map({"ham": 0, "spam": 1, 0: 0, 1: 1})

In [80]:
# Define split ratios: fraction of each class assigned to training and to validation
train_size, validation_size = 0.7, 0.1
# Test size is implied to be 0.2 as the remainder

# Split the data
def stratified_split(df, stratify_col, train_frac, validation_frac):
    """Split ``df`` into train/validation/test sets, one class at a time.

    For every distinct value of ``stratify_col`` the rows of that class are cut,
    in their existing order, into a leading ``train_frac`` share, a following
    ``validation_frac`` share, and the remainder (test). The per-class pieces
    are concatenated and each resulting set is shuffled with a fixed seed.

    Rows are taken positionally, so ``df`` should already be in random order;
    otherwise the splits mirror whatever ordering the input has.

    Args:
        df: DataFrame to split.
        stratify_col: Name of the column whose class proportions are preserved.
        train_frac: Fraction of each class placed in the training set.
        validation_frac: Fraction of each class placed in the validation set.
            Whatever is left over goes to the test set.

    Returns:
        Tuple ``(train, validation, test)`` of DataFrames, each with a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: If a fraction lies outside ``[0, 1]`` or the two fractions
            sum to more than 1.
    """
    if not (0 <= train_frac <= 1 and 0 <= validation_frac <= 1):
        raise ValueError("train_frac and validation_frac must each be between 0 and 1")
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float rounding
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1")

    # Collect the per-class slices and concatenate once per split, rather than
    # growing a DataFrame inside the loop from an empty pd.DataFrame()
    train_parts, validation_parts, test_parts = [], [], []

    # Stratify split by the unique values in the column
    for value in df[stratify_col].unique():
        # Filter the DataFrame for the class
        df_class = df[df[stratify_col] == value]

        # Calculate class split sizes (truncated, so rounding leftovers go to test)
        train_end = int(len(df_class) * train_frac)
        validation_end = train_end + int(len(df_class) * validation_frac)

        # Slice the DataFrame to get the sets
        train_parts.append(df_class[:train_end])
        validation_parts.append(df_class[train_end:validation_end])
        test_parts.append(df_class[validation_end:])

    def _combine(parts):
        """Concatenate the class slices of one split and shuffle them reproducibly."""
        if not parts:
            # No classes at all (empty input): return an empty frame with df's columns
            return df.iloc[0:0].reset_index(drop=True)
        combined = pd.concat(parts, axis=0)
        return combined.sample(frac=1, random_state=123).reset_index(drop=True)

    return _combine(train_parts), _combine(validation_parts), _combine(test_parts)

# Apply the stratified split function to the full (imbalanced) DataFrame
train_df, validation_df, test_df = stratified_split(df, "Label", train_size, validation_size)

# Check the results: the class proportions should be close to identical in all three splits
print(f"Training set:\n{train_df['Label'].value_counts(normalize=True)}")
print(f"\nValidation set:\n{validation_df['Label'].value_counts(normalize=True)}")
print(f"\nTest set:\n{test_df['Label'].value_counts(normalize=True)}")

Training set:
Label
0    0.86612
1    0.13388
Name: proportion, dtype: float64

Validation set:
Label
0    0.866906
1    0.133094
Name: proportion, dtype: float64

Test set:
Label
0    0.864816
1    0.135184
Name: proportion, dtype: float64


In [81]:
# Define split ratios (same values as used for the split of the full DataFrame)
train_size, validation_size = 0.7, 0.1
# Test size is implied to be 0.2 as the remainder

# Apply the stratified split function to the balanced DataFrame.
# This overwrites train_df, validation_df and test_df from the earlier split;
# the cells below work on these balanced splits.
# balanced_df still carries the string labels "ham"/"spam".
train_df, validation_df, test_df = stratified_split(balanced_df, "Label", train_size, validation_size)

# Check the results: each split should contain both classes in equal proportion
print(f"Training set:\n{train_df['Label'].value_counts(normalize=True)}")
print(f"\nValidation set:\n{validation_df['Label'].value_counts(normalize=True)}")
print(f"\nTest set:\n{test_df['Label'].value_counts(normalize=True)}")

Training set:
Label
ham     0.5
spam    0.5
Name: proportion, dtype: float64

Validation set:
Label
ham     0.5
spam    0.5
Name: proportion, dtype: float64

Test set:
Label
spam    0.5
ham     0.5
Name: proportion, dtype: float64


## Scikit-learn baseline

In [82]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [83]:
# Bag-of-words features: the vocabulary is fitted on the training texts only,
# then reused unchanged to transform the validation and test texts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["Text"])
X_val, X_test = (
    vectorizer.transform(split["Text"]) for split in (validation_df, test_df)
)

# Targets for each split
y_train = train_df["Label"]
y_val = validation_df["Label"]
y_test = test_df["Label"]

In [84]:
def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print accuracy and balanced accuracy of ``model`` on all three splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, X_val, X_test: Feature inputs passed to ``model.predict``.
        y_train, y_val, y_test: True labels for the corresponding split.

    Returns:
        None. Six lines are printed: plain accuracy for the training,
        validation and test splits, then (after a blank line) the balanced
        accuracy for the same splits, each as a percentage with two decimals.
    """
    labelled_splits = {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }

    # Run every prediction before any scoring happens
    predicted = {
        key: model.predict(features)
        for key, (features, _) in labelled_splits.items()
    }

    # Score each split with both metrics
    acc, bal_acc = {}, {}
    for key, (_, targets) in labelled_splits.items():
        acc[key] = accuracy_score(targets, predicted[key])
        bal_acc[key] = balanced_accuracy_score(targets, predicted[key])

    # Report plain accuracy first ...
    print(f"Training Accuracy: {acc['train']*100:.2f}%")
    print(f"Validation Accuracy: {acc['val']*100:.2f}%")
    print(f"Test Accuracy: {acc['test']*100:.2f}%")

    # ... then balanced accuracy, separated by a blank line
    print(f"\nTraining Balanced Accuracy: {bal_acc['train']*100:.2f}%")
    print(f"Validation Balanced Accuracy: {bal_acc['val']*100:.2f}%")
    print(f"Test Balanced Accuracy: {bal_acc['test']*100:.2f}%")

In [85]:
from sklearn.dummy import DummyClassifier

# Reference baseline: a classifier that always predicts the most frequent training class
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)

Training Accuracy: 50.00%
Validation Accuracy: 50.00%
Test Accuracy: 50.00%

Training Balanced Accuracy: 50.00%
Validation Balanced Accuracy: 50.00%
Test Balanced Accuracy: 50.00%


In [86]:
# Logistic regression on the bag-of-words features; max_iter is raised to 1000
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
eval(model, X_train, y_train, X_val, y_val, X_test, y_test)

Training Accuracy: 99.81%
Validation Accuracy: 95.27%
Test Accuracy: 96.03%

Training Balanced Accuracy: 99.81%
Validation Balanced Accuracy: 95.27%
Test Balanced Accuracy: 96.03%
