In [None]:
pip install pandas scikit-learn sentence-transformers
# installing  required libraries



In [None]:
# loading the dataset
import pandas as pd

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep="\t", header=None, names=["label", "text"])

print(df.head())
print("Total messages:", len(df))


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Total messages: 5572


In [None]:
#Create a balanced dataset (better accuracy)
df_small = pd.concat([
    df[df["label"] == "ham"].head(60),
    df[df["label"] == "spam"].head(60)
]).reset_index(drop=True)

print(df_small["label"].value_counts())


label
ham     60
spam    60
Name: count, dtype: int64


In [None]:
#Split dataset into more than 10 chunks

chunks = []
chunk_size = 10

for i in range(0, len(df_small), chunk_size):
    chunk_text = " ".join(df_small["text"][i:i+chunk_size])
    chunks.append(chunk_text)

print("Number of chunks:", len(chunks))  # 12


Number of chunks: 12


In [None]:
import pandas as pd

# Load dataset
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep="\t", header=None, names=["label", "text"])

# Take small balanced dataset (120 messages)
df_small = pd.concat([
    df[df["label"] == "ham"].head(60),
    df[df["label"] == "spam"].head(60)
]).reset_index(drop=True)

# Create chunks
chunks = []
chunk_size = 10

for i in range(0, len(df_small), chunk_size):
    chunk_text = " ".join(df_small["text"][i:i+chunk_size])
    chunks.append(chunk_text)

# Print ONE line per chunk
print("Total chunks:", len(chunks))
print()

for i, chunk in enumerate(chunks):
    first_line = chunk.split(".")[0]
    print(f"Chunk {i+1}: {first_line}")


Total chunks: 12

Chunk 1: Go until jurong point, crazy
Chunk 2: Eh u remember how 2 spell his name
Chunk 3: I'm back &amp; we're packing the car now, I'll let you know if there's room Ahhh
Chunk 4: Hello! How's you and how did saturday go? I was just texting to see if you'd decided to do anything tomo
Chunk 5: What you thinked about me
Chunk 6: Its a part of checking IQ Sorry my roommates took forever, it ok if I come by now? Ok lar i double check wif da hair dresser already he said wun cut v short
Chunk 7: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005
Chunk 8: SMS
Chunk 9: URGENT! Your Mobile No
Chunk 10: URGENT! We are trying to contact you
Chunk 11: Ur ringtone service has changed! 25 Free credits! Go to club4mobiles
Chunk 12: Fancy a shag? I do


In [None]:
#Generate labels for each chunk
labels = []

for i in range(0, len(df_small), chunk_size):
    part = df_small["label"][i:i+chunk_size]
    spam_count = (part == "spam").sum()
    ham_count = (part == "ham").sum()
    labels.append(1 if spam_count > ham_count else 0)

print("Chunk labels:", labels)


Chunk labels: [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


In [None]:
# Generate embeddings
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)

print("Embedding shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding shape: (12, 384)


In [None]:
#Normalize embeddings
from sklearn.preprocessing import normalize

X = normalize(embeddings)


In [None]:
#Train–test split
import numpy as np

y = np.array(labels)

X_train = X[:10]
y_train = y[:10]

X_test = X[10:]
y_test = y[10:]

print("Test labels:", y_test)


Test labels: [1 1]


In [None]:
#Apply KNN with cosine similarity
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k, metric="cosine")
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    acc = knn.score(X_test, y_test)
    print(f"k = {k}, Predictions = {preds}, Accuracy = {acc}")


k = 1, Predictions = [1 1], Accuracy = 1.0
k = 2, Predictions = [1 1], Accuracy = 1.0
k = 3, Predictions = [1 1], Accuracy = 1.0
k = 4, Predictions = [1 1], Accuracy = 1.0
k = 5, Predictions = [1 1], Accuracy = 1.0
k = 6, Predictions = [1 1], Accuracy = 1.0
k = 7, Predictions = [1 1], Accuracy = 1.0
k = 8, Predictions = [0 0], Accuracy = 0.0
k = 9, Predictions = [0 0], Accuracy = 0.0
k = 10, Predictions = [0 0], Accuracy = 0.0


In [None]:
#HAM (0) → Normal, personal, genuine messages

#SPAM (1) → Promotional, prize, lottery, offer messages

In [None]:
#“Chunks 1 to 6 contain HAM messages, and chunks 7 to 12 contain SPAM messages.”