In [None]:
from datasets import load_dataset

# Load the "sms_spam" dataset through Hugging Face `datasets` and print the
# returned object for a first look; later cells read its 'train' split.
dataset = load_dataset("sms_spam")
print(dataset)

In [None]:
# Spot check: print the record stored at index 8 of the 'train' split.
print(dataset['train'][8])

In [None]:
# Print the feature definition of the 'label' column
# (the cells below treat label 0 as ham and label 1 as spam).
print(dataset['train'].features['label'])

# Investigating TfidfVectorizer


In [None]:
# Toy example: run TfidfVectorizer on three hand-written sentences to see what
# it produces before applying it to the SMS data.
from sklearn.feature_extraction.text import TfidfVectorizer

text = [
    "I won the lottery",
    "you won a lottery",
    "Congratulations! we are happy to offer you SDE-1 role at Amazon"
]

# English stop words are dropped and the vocabulary is capped at 500 terms.
# NOTE: `vectorizer` is re-created and re-fit on the SMS training texts in the
# split/vectorize cell further down; this instance only serves the demo.
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = vectorizer.fit_transform(text)

# One row per sentence, one column per vocabulary term.
print("shape of TF-IDF matrix:", X.shape)
print("Example of feature names:", vectorizer.get_feature_names_out()[:20])


Vectorizing the dataset and getting it ready for prediction.

In [None]:
# Split with the Hugging Face `datasets` API itself (80/20, fixed seed,
# stratified on the label column) to avoid indexing mismatches.
splits = dataset['train'].train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column='label',
)

# Materialize the message texts as plain lists and the labels as plain ints.
train_texts = list(splits['train']['sms'])
test_texts = list(splits['test']['sms'])
train_labels = [int(label) for label in splits['train']['label']]
test_labels = [int(label) for label in splits['test']['label']]

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms
# to keep things fast.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vectorizer is fit on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Sanity check: rows are documents, columns are vocabulary terms.
print("X_train shape:", X_train.shape)
print("X_test  shape:", X_test.shape)

# Peek at the first 20 feature names that made it into the vocabulary.
print("Example features:", vectorizer.get_feature_names_out()[:20])


In [None]:
# Baseline model: logistic regression on the TF-IDF features, kept as a
# reference point for anything tried later.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
import matplotlib.pyplot as plt

# max_iter is raised well above the default to give the solver room to converge.
# n_jobs is deliberately not passed: it only parallelizes per-class fits, so it
# does nothing for this two-class problem.
# NOTE(review): newer scikit-learn releases appear to deprecate the argument —
# confirm against the installed version.
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, train_labels)

# Predict on the held-out test split to see how well the model generalizes.
test_pred = clf.predict(X_test)

# Accuracy plus macro-F1; macro-F1 weights both classes equally, which matters
# when the classes are imbalanced.
acc = accuracy_score(test_labels, test_pred)
f1 = f1_score(test_labels, test_pred, average='macro')
print("test accuracy:", acc)
print("macro-F1:", f1)

# labels=[0, 1] pins target_names to explicit label ids (0=ham, 1=spam), the
# same mapping the confusion-matrix cell uses.
report = classification_report(
    test_labels,
    test_pred,
    labels=[0, 1],
    target_names=["ham", "spam"],
    digits=3,
)
print("\nclassification report:\n", report)

In [None]:
# Confusion matrix on the test split, to see which kind of mistakes the
# baseline makes. Label ids are fixed as 0=ham, 1=spam.
cm = confusion_matrix(test_labels, test_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["ham", "spam"],
)

# Draw integer counts in the cells, then title the axes the display drew on.
disp.plot(values_format="d")
disp.ax_.set_title("TF-IDF + LogisticRegression — Confusion Matrix")
plt.show()