In [None]:
!pip install urllib3
import pandas as pd
import torch
import evaluate
import accelerate
from transformers import TextClassificationPipeline,AutoModelForSequenceClassification, TrainingArguments,Trainer
import numpy as np
from datasets import load_metric, Dataset


In [91]:
# data from https://www.kaggle.com/datasets/balaka18/email-spam-classification-dataset-csv
# create a label column with 0 for ham and 1 for spam
# 
file_path="/Users/raphaelderbier/Documents/spam.csv"

df = pd.read_csv(file_path,encoding = "ISO-8859-1")

df['labels'] = df.Category.map({'ham':0, 'spam':1})

df.rename(columns = {'Message':'content'}, inplace = True)

**text**

In [94]:

# create embeddings of the content
# 
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [95]:
sentences=df['content'].values
embeddings = model.encode(sentences, convert_to_tensor=True)
df['embeddings']=embeddings.tolist()


In [96]:
df

Unnamed: 0,Category,content,labels,embeddings
0,ham,"Go until jurong point, crazy.. Available only ...",0,"[-0.016918141394853592, -0.038167908787727356,..."
1,ham,Ok lar... Joking wif u oni...,0,"[-0.013368861749768257, -0.049869999289512634,..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[-0.01543346606194973, 0.06304138898849487, 0...."
3,ham,U dun say so early hor... U c already then say...,0,"[-0.012307888828217983, 0.037198394536972046, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"[0.07770048826932907, -0.13287170231342316, 0...."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,"[-0.08200773596763611, -0.011610500514507294, ..."
5568,ham,Will Ã¼ b going to esplanade fr home?,0,"[0.016426127403974533, -0.05299913510680199, 0..."
5569,ham,"Pity, * was in mood for that. So...any other s...",0,"[-0.06416906416416168, 0.02013995684683323, 0...."
5570,ham,The guy did some bitching but I acted like i'd...,0,"[-0.06773331016302109, 0.03347700461745262, -0..."


In [111]:
# Perform kmean clustering
from sklearn.cluster import KMeans
num_clusters = 200
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
spam_by_cluster = [[0,0] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(df['Category'][sentence_id]+": "+df['content'][sentence_id])
    spam_by_cluster[cluster_id][df['labels'][sentence_id]] += 1

for i, cluster in enumerate(clustered_sentences):
    if spam_by_cluster[i][0] != 0 and spam_by_cluster[i][1] != 0:
        print()
        print("Cluster ", i+1)
        print(spam_by_cluster[i])
        for j, text in enumerate(cluster):
            print(text)

  super()._check_params_vs_input(X, default_n_init=10)



Cluster  4
[22, 2]
ham: I'm sorry. I've joined the league of people that dont keep in touch. You mean a great deal to me. You have been a friend at all times even at great personal cost. Do have a great week.|
ham: Lol yes. Our friendship is hanging on a thread cause u won't buy stuff.
ham: Not really dude, have no friends i'm afraid :(
ham: I met you as a stranger and choose you as my friend. As long as the world stands, our friendship never ends. Lets be Friends forever!!! Gud nitz...
ham: Sorry light turned green, I meant another friend wanted  &lt;#&gt;  worth but he may not be around
ham: Talk With Yourself Atleast Once In A Day...!!! Otherwise You Will Miss Your Best FRIEND In This WORLD...!!! -Shakespeare- SHESIL  &lt;#&gt;
ham: Donât give a flying monkeys wot they think and I certainly donât mind. Any friend of mine and all that!
ham: Friendship is not a game to play, It is not a word to say, It doesn\'t start on March and ends on May, It is tomorrow, yesterday, today and 

In [105]:
spam_by_cluster

[[55, 1],
 [2, 72],
 [48, 1],
 [59, 17],
 [0, 11],
 [50, 0],
 [40, 15],
 [44, 0],
 [3, 62],
 [67, 8],
 [93, 0],
 [38, 0],
 [31, 0],
 [91, 0],
 [35, 0],
 [68, 1],
 [2, 24],
 [60, 0],
 [89, 1],
 [24, 42],
 [58, 8],
 [53, 2],
 [48, 0],
 [51, 0],
 [72, 0],
 [48, 0],
 [110, 1],
 [67, 0],
 [50, 0],
 [24, 0],
 [66, 1],
 [75, 1],
 [54, 0],
 [6, 29],
 [62, 0],
 [74, 0],
 [30, 3],
 [30, 0],
 [39, 0],
 [26, 0],
 [0, 38],
 [0, 75],
 [55, 3],
 [81, 1],
 [44, 0],
 [16, 34],
 [20, 0],
 [66, 0],
 [136, 0],
 [24, 0],
 [85, 2],
 [53, 0],
 [33, 1],
 [31, 0],
 [7, 48],
 [86, 0],
 [37, 0],
 [64, 3],
 [25, 1],
 [79, 0],
 [72, 1],
 [31, 0],
 [47, 1],
 [78, 0],
 [47, 3],
 [77, 1],
 [38, 0],
 [33, 0],
 [69, 0],
 [37, 0],
 [76, 3],
 [57, 3],
 [0, 60],
 [29, 15],
 [81, 4],
 [30, 0],
 [0, 98],
 [57, 0],
 [83, 0],
 [27, 1],
 [96, 0],
 [0, 17],
 [54, 1],
 [62, 0],
 [54, 2],
 [66, 1],
 [48, 1],
 [41, 0],
 [38, 1],
 [42, 6],
 [22, 0],
 [46, 0],
 [9, 0],
 [113, 0],
 [28, 21],
 [49, 0],
 [65, 1],
 [54, 0],
 [43, 0],
 [

In [99]:
# try UMAP for dimension reduction 
# Note Rapahel - not working yet due to UMAP not found ?
import umap
reducer = umap.UMAP()
reduced = reducer.fit_transform(embeddings.tolist())
reduced.shape