<a href="https://colab.research.google.com/github/shubhamksingh1/Clustering/blob/main/Bert/Clustering_Using_Bert_plus_KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
## Import libraries
from nltk.cluster import KMeansClusterer
import nltk
import numpy as np


In [None]:
# Load the pretrained Sentence-Transformers model once; get_embeddinngs() calls its encode() method.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [None]:
import pandas as pd

In [None]:
# Read the tab-separated dev split into a DataFrame.
# NOTE(review): hard-coded Google Drive path — assumes Drive is already mounted at /content/drive;
# no mount call is visible in this notebook, confirm before a fresh Run All.
df = pd.read_csv("/content/drive/MyDrive/DeepAligned/Data/banking/dev.tsv",sep="\t")

In [None]:
df.shape

(1000, 2)

In [None]:
df.nunique()

text     1000
label      77
dtype: int64

In [None]:
df

Unnamed: 0,text,label
0,Could you please explain why I received a mess...,beneficiary_not_allowed
1,How did you guys get your exchange rate?,exchange_rate
2,I am still waiting on my card?,card_arrival
3,Why is the purchase I made this morning still ...,pending_card_payment
4,I made a mistake and need to cancel a transfer.,cancel_transfer
...,...,...
995,Is it possible to transfer money to my credit ...,topping_up_by_card
996,I've just been married and need to update my name,edit_personal_details
997,How does someone else top up my account,topping_up_by_card
998,Why is the payment from my card still pending?,pending_card_payment


In [None]:
# Utility function for generating sentence embedding from the text
def get_embeddinngs(text):
    """Encode ``text`` with the module-level ``embedder`` and return the result.

    ``text`` is forwarded unchanged to ``embedder.encode``; whatever that call
    returns is handed back to the caller as-is.
    """
    embedding = embedder.encode(text)
    return embedding

In [None]:
%%time
# Generating sentence embedding from the text
df['emb'] = df['text'].apply(get_embeddinngs)

CPU times: user 50.3 s, sys: 116 ms, total: 50.4 s
Wall time: 50.7 s


In [None]:
df

Unnamed: 0,text,label,emb
0,Could you please explain why I received a mess...,beneficiary_not_allowed,"[-0.4830286, -0.27165222, -0.42418325, 0.13459..."
1,How did you guys get your exchange rate?,exchange_rate,"[0.6504274, 0.6175015, -0.33170068, 0.331622, ..."
2,I am still waiting on my card?,card_arrival,"[-0.10394557, 0.5288769, -0.07015563, -0.51285..."
3,Why is the purchase I made this morning still ...,pending_card_payment,"[0.22972134, -0.11898315, -0.36682656, -0.3464..."
4,I made a mistake and need to cancel a transfer.,cancel_transfer,"[-0.14311454, -0.10506384, 0.07438872, -0.8786..."
...,...,...,...
995,Is it possible to transfer money to my credit ...,topping_up_by_card,"[0.14249371, 0.30876768, -0.12601331, -0.39804..."
996,I've just been married and need to update my name,edit_personal_details,"[0.284741, 0.638832, 0.49244645, -1.2638571, 0..."
997,How does someone else top up my account,topping_up_by_card,"[0.089588344, -0.07996903, -0.5829425, -0.5621..."
998,Why is the payment from my card still pending?,pending_card_payment,"[-0.22344202, 0.28667304, 0.058500335, -0.0574..."


In [None]:
def clustering_question(df, NUM_CLUSTERS=77, seed=None):
    """Cluster the sentence embeddings stored in ``df`` with NLTK's k-means.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``emb`` column holding one embedding vector per row.
        The frame is modified in place: the ``cluster`` and ``centroid``
        columns are added (or overwritten).
    NUM_CLUSTERS : int, default 77
        Number of clusters requested from the clusterer.
    seed : int or None, default None
        When given, a ``random.Random(seed)`` instance is passed to the
        clusterer as ``rng`` so repeated runs can be reproduced. ``None``
        leaves the clusterer's own default random source in place, which is
        the behaviour of the original function.

    Returns
    -------
    tuple
        ``(df, assigned_clusters)`` — the same (mutated) DataFrame object and
        the per-row cluster assignments returned by ``KMeansClusterer.cluster``.
    """
    import random  # stdlib; imported locally so this block stays self-contained

    # Stack the per-row embedding vectors into one array for the clusterer.
    X = np.array(df['emb'].tolist())

    rng = random.Random(seed) if seed is not None else None
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=25, avoid_empty_clusters=True, rng=rng)

    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    # Fetch the centroids once instead of calling means() again for every row.
    means = kclusterer.means()

    df['cluster'] = pd.Series(assigned_clusters, index=df.index)
    df['centroid'] = df['cluster'].apply(lambda idx: means[idx])

    return df, assigned_clusters

In [None]:
%%time
# clustering_question adds the 'cluster' and 'centroid' columns to df in place and returns
# that same frame, so `data` and `df` refer to the same object after this cell.
data,cluster_label = clustering_question(df)

CPU times: user 3min 37s, sys: 990 ms, total: 3min 38s
Wall time: 3min 38s


In [None]:
data

Unnamed: 0,text,label,emb,cluster,centroid
0,Could you please explain why I received a mess...,beneficiary_not_allowed,"[-0.4830286, -0.27165222, -0.42418325, 0.13459...",31,"[-0.07055746, -0.051681355, 0.3166883, -0.0734..."
1,How did you guys get your exchange rate?,exchange_rate,"[0.6504274, 0.6175015, -0.33170068, 0.331622, ...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977..."
2,I am still waiting on my card?,card_arrival,"[-0.10394557, 0.5288769, -0.07015563, -0.51285...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570..."
3,Why is the purchase I made this morning still ...,pending_card_payment,"[0.22972134, -0.11898315, -0.36682656, -0.3464...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570..."
4,I made a mistake and need to cancel a transfer.,cancel_transfer,"[-0.14311454, -0.10506384, 0.07438872, -0.8786...",22,"[-0.18066737, 0.04175015, 0.17891014, -0.67805..."
...,...,...,...,...,...
995,Is it possible to transfer money to my credit ...,topping_up_by_card,"[0.14249371, 0.30876768, -0.12601331, -0.39804...",24,"[-0.17252655, 0.35799894, 0.0001270771, 0.0303..."
996,I've just been married and need to update my name,edit_personal_details,"[0.284741, 0.638832, 0.49244645, -1.2638571, 0...",58,"[-0.20249534, -0.023389807, 0.20843203, -0.979..."
997,How does someone else top up my account,topping_up_by_card,"[0.089588344, -0.07996903, -0.5829425, -0.5621...",28,"[0.13908267, 0.32292715, -0.3593569, -0.536101..."
998,Why is the payment from my card still pending?,pending_card_payment,"[-0.22344202, 0.28667304, 0.058500335, -0.0574...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570..."


In [None]:
from scipy.spatial import distance_matrix

In [None]:
def distance_from_centroid(row):
    """Return the distance between a row's embedding and its cluster centroid.

    ``row`` must support ``row['emb']`` and ``row['centroid']``. Both vectors
    are wrapped as one-row inputs to ``scipy.spatial.distance_matrix`` (which
    uses its default metric), and the single entry of the resulting 1x1
    matrix is returned.
    """
    point = [row['emb']]
    center = [row['centroid']]
    pairwise = distance_matrix(point, center)
    return pairwise[0][0]

In [None]:

# Compute each row's distance to its assigned cluster centroid and store it as a new column.
# NOTE(review): distance_from_centroid relies on scipy's distance_matrix with its default metric,
# while the clustering itself was run with cosine distance — confirm the mismatch is intended.
data['distance_from_centroid'] = data.apply(distance_from_centroid, axis=1)

In [None]:
data

Unnamed: 0,text,label,emb,cluster,centroid,distance_from_centroid
0,Could you please explain why I received a mess...,beneficiary_not_allowed,"[-0.4830286, -0.27165222, -0.42418325, 0.13459...",31,"[-0.07055746, -0.051681355, 0.3166883, -0.0734...",9.077115
1,How did you guys get your exchange rate?,exchange_rate,"[0.6504274, 0.6175015, -0.33170068, 0.331622, ...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.861092
2,I am still waiting on my card?,card_arrival,"[-0.10394557, 0.5288769, -0.07015563, -0.51285...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570...",9.513056
3,Why is the purchase I made this morning still ...,pending_card_payment,"[0.22972134, -0.11898315, -0.36682656, -0.3464...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570...",11.644768
4,I made a mistake and need to cancel a transfer.,cancel_transfer,"[-0.14311454, -0.10506384, 0.07438872, -0.8786...",22,"[-0.18066737, 0.04175015, 0.17891014, -0.67805...",6.793017
...,...,...,...,...,...,...
995,Is it possible to transfer money to my credit ...,topping_up_by_card,"[0.14249371, 0.30876768, -0.12601331, -0.39804...",24,"[-0.17252655, 0.35799894, 0.0001270771, 0.0303...",8.060166
996,I've just been married and need to update my name,edit_personal_details,"[0.284741, 0.638832, 0.49244645, -1.2638571, 0...",58,"[-0.20249534, -0.023389807, 0.20843203, -0.979...",11.521876
997,How does someone else top up my account,topping_up_by_card,"[0.089588344, -0.07996903, -0.5829425, -0.5621...",28,"[0.13908267, 0.32292715, -0.3593569, -0.536101...",8.807019
998,Why is the payment from my card still pending?,pending_card_payment,"[-0.22344202, 0.28667304, 0.058500335, -0.0574...",34,"[0.1423075, 0.109859206, 0.5488275, -0.6366570...",7.582754


In [None]:
# Rows whose embedding coincides exactly with their centroid (distance of exactly 0).
# This is an exact float comparison, so near-zero distances are not matched.
data[data['distance_from_centroid']==0]

Unnamed: 0,text,label,emb,cluster,centroid,distance_from_centroid
857,My disposable virtual card was rejected.,virtual_card_not_working,"[-0.59289545, -0.2356106, 0.30464807, 0.237177...",25,"[-0.59289545, -0.2356106, 0.30464807, 0.237177...",0.0


In [None]:
data['cluster'].nunique()

77

In [None]:
data['cluster'].unique()

array([31,  0, 34, 22, 16, 67,  9, 20, 74,  6, 32, 49, 54, 35,  5, 72, 57,
       48,  1, 56, 66, 73, 19, 28, 26, 41, 64, 45, 37,  2, 51, 39, 11,  8,
       65, 23, 62, 40, 33, 50, 59, 60, 55, 47, 27, 44, 24, 63, 53, 52, 68,
       15, 36, 21, 75, 43, 69, 18, 29, 46, 17, 71, 42,  3, 30, 13, 61, 70,
       10, 76,  7, 14, 38, 12, 58, 25,  4])

In [None]:
data[data['cluster']==0]

Unnamed: 0,text,label,emb,cluster,centroid,distance_from_centroid
1,How did you guys get your exchange rate?,exchange_rate,"[0.6504274, 0.6175015, -0.33170068, 0.331622, ...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.861092
9,How do you decide what the exchange rates are?,exchange_rate,"[0.12365193, 0.42714843, 0.0011985229, 0.51165...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",7.076403
33,Where do you get those exchange rates?,exchange_rate,"[0.14660046, 0.65993863, 0.057953853, 0.360156...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",6.706015
143,Which flat currencies are supported for holdin...,fiat_currency_support,"[-0.40571186, -0.34666905, 0.19465913, -0.0853...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",10.703206
169,"Do you hold money and if so, what currencies d...",fiat_currency_support,"[0.17910741, 0.19703232, 0.27463356, 0.2849488...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.441095
218,How can I find your exchange rates?,exchange_rate,"[0.05562032, 0.76798046, -0.06389927, 0.344325...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",6.082565
233,Which currencies do you handle?,supported_cards_and_currencies,"[-0.17355224, 0.07736262, 0.13603133, 0.201582...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.250682
403,"I was wondering, Can I get my pay in another c...",receiving_money,"[0.111654185, 0.35625574, -0.3151897, -0.29285...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.89782
408,What is the foreign exchange rate you will apply?,exchange_rate,"[0.19464727, 0.55706096, 0.47889304, 0.3483716...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",8.726612
446,"When exchanging, how much are the rates?",exchange_rate,"[0.11082646, 0.22000127, 0.21032327, 1.1761034...",0,"[-0.00925918, 0.2807162, -0.12237299, 0.243977...",10.062797


In [None]:
# Sort by distance before grouping so the rows inside each cluster group are ordered
# nearest-to-centroid first; res.first() then picks up the closest sentence per cluster.
res = data.sort_values('distance_from_centroid',ascending = True).groupby('cluster')

In [None]:
res.first()

Unnamed: 0_level_0,text,label,emb,centroid,distance_from_centroid
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,How can I find your exchange rates?,exchange_rate,"[0.05562032, 0.76798046, -0.06389927, 0.344325...","[-0.00925918, 0.2807162, -0.12237299, 0.243977...",6.082565
1,Where are the ATMs that I can use Mastercard?,atm_support,"[0.6710445, 0.47033647, 0.0877034, 0.13248093,...","[0.08871626, 0.54406023, 0.10083322, -0.134497...",5.955158
2,How can I exchange to EUR?,fiat_currency_support,"[-0.12618509, 0.63973844, 0.25445008, -0.94519...","[-0.019339679, -0.060871676, 0.5589173, -1.044...",7.307281
3,Which fiat currencies are supported by you?,fiat_currency_support,"[0.14031112, -0.17038345, 0.14025623, 0.748484...","[0.026631946, -0.15078142, 0.06607161, 0.67326...",4.411849
4,I wanted to take out $100 but I only got $20.,wrong_amount_of_cash_received,"[0.69798684, -0.15768874, 0.6523468, 0.4973138...","[0.55591995, -0.12315625, 0.5052437, 0.6804508...",3.408752
...,...,...,...,...,...
72,How do I delete my account now,terminate_account,"[-0.3464338, 0.14136748, -0.41264075, -0.40642...","[-0.0690281, 0.14912488, -0.15044056, -0.69068...",6.438104
73,I am seeing in the App a some cash withdrawal ...,cash_withdrawal_not_recognised,"[-0.2656738, 0.37610006, -0.38063765, -0.94580...","[-0.123678766, -0.02601047, -0.36307427, -0.35...",6.222218
74,What is the process for activating my card and...,activate_my_card,"[-0.2936064, 0.12593982, -0.1384906, -0.568097...","[-0.4041655, 0.18490705, -0.09548601, -0.31607...",6.751153
75,Where is my virtual card?,getting_virtual_card,"[-1.1014155, 0.44338357, 0.20208961, -0.218128...","[-0.71287525, 0.091823846, 0.08165267, -0.0626...",6.239634


In [None]:
# Print every cluster: its id, the member sentences with their `label` values, and then
# the members' distances to the centroid (same row order in both printouts).
for name,group in res:
    print(name)
    print(group[['text','label']])
    print(group[['distance_from_centroid']])

0
                                                  text                                    label
218                How can I find your exchange rates?                            exchange_rate
800  What are the charges if I exchange foreign cur...                          exchange_charge
33              Where do you get those exchange rates?                            exchange_rate
9       How do you decide what the exchange rates are?                            exchange_rate
534  What kind of charges should I expect to exchan...                          exchange_charge
664                   How does the exchange rate work?                            exchange_rate
233                    Which currencies do you handle?           supported_cards_and_currencies
169  Do you hold money and if so, what currencies d...                    fiat_currency_support
408  What is the foreign exchange rate you will apply?                            exchange_rate
645                          What exch

In [None]:
# Spot-check a single cluster. NOTE(review): cluster ids come from a k-means run that is given
# no seed, so id 49 is not guaranteed to hold the same sentences after a re-run.
print(res.get_group(49)[['text','label']])

                                                  text                           label
281  Why did you charge me a fee when I made a tran...            transfer_fee_charged
181  I was charged a fee for the transfer, but can ...            transfer_fee_charged
102  Where did my extra fee for the transfer come f...            transfer_fee_charged
98         How much would I be charged for a transfer?  top_up_by_bank_transfer_charge
13   why was a extra fee added to my account for tr...            transfer_fee_charged
464  Why was I charged a random fee for a transfer ...            transfer_fee_charged
923  I noticed a charge for a transfer.  Why is thi...            transfer_fee_charged
927  There is a fee from a transfer that is not sup...            transfer_fee_charged
21         Will I be fined for topping up by transfer?  top_up_by_bank_transfer_charge
877  I don't understand why I was charged an additi...        card_payment_fee_charged
453             What is the handling fee fo

In [None]:
res.count()

Unnamed: 0_level_0,text,label,emb,centroid,distance_from_centroid
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,21,21,21,21,21
1,7,7,7,7,7
2,10,10,10,10,10
3,6,6,6,6,6
4,3,3,3,3,3
...,...,...,...,...,...
72,8,8,8,8,8
73,26,26,26,26,26
74,19,19,19,19,19
75,12,12,12,12,12
