In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings("ignore")


In [2]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min = 1e-9)


def embeding_calc(sentences):

    # tokenizing
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')

    # passing though model
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings



In [4]:
result = embeding_calc(['balanced sound','good sound'])

In [5]:
len(result)

2

In [6]:
score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))

In [7]:
score[0][0]

0.7417778

In [8]:
def similarity_calc(input1,input2):
    result = embeding_calc([input1,input2])
    score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))
    return score[0][0]

In [9]:
similarity_calc('balanced sound','poor bose')

0.16619003

# Data processing

In [10]:
df = pd.read_csv('/home/heptagon/Desktop/nps_review_analysis/bank_detail_review_sentiments.csv')

In [11]:
df.shape

(1717, 4)

In [12]:
df.head()

Unnamed: 0,comments,topic,sentiment,type
0,I truly enjoyed Ms Jennifer at the Jasper bran...,"special lady,different account",POSITIVE,complaint
1,Nice service and I can testify they have one o...,"nice service,best staff,online bank",POSITIVE,appreciation
2,The staff there are always helpful and willing...,prompt decision,POSITIVE,information
3,Great bank very good customer service In and o...,good customer,POSITIVE,appreciation
4,The Nederland branch is the best I am always t...,"important person,best interest",POSITIVE,appreciation


In [13]:
all_keyword = []
for key_str in tqdm(df.topic):
#     print(key_str)
    splited_str = key_str.split(',')
    if len(splited_str)>0:
        all_keyword += splited_str
#     break

all_keyword

100%|███████████████████████████████████| 1717/1717 [00:00<00:00, 377482.96it/s]


['special lady',
 'different account',
 'nice service',
 'best staff',
 'online bank',
 'prompt decision',
 'good customer',
 'important person',
 'best interest',
 'wonderful relief',
 'wonderful reminder',
 'sweet lady',
 'courteous time',
 'helpful thank',
 'sincere gratitude',
 'special time',
 'fond holiday',
 'preferred location',
 'great agent',
 'excellent customer',
 'grateful assistance',
 'great staff',
 'easy opening',
 'great company',
 'wiht customer',
 'ready sign',
 'easy talk',
 'friendly point',
 'pleased timeliness',
 'little direction',
 'smooth process',
 'prompt customer',
 'financial need',
 'excellent rate',
 'great experience',
 'new customer',
 'last month',
 'easy work',
 'fast response',
 'fast releasing',
 'interested daughter',
 'enough help',
 'many year',
 'friendly make',
 'overall experience',
 'pleased customer',
 'short notice',
 'smooth hassle',
 'personal loan',
 'dugood bank',
 'low interest',
 'fast sign',
 'free pleasure',
 'tara baloney',
 'fav

In [14]:
freq_count = {}
for key in all_keyword:
    if key in freq_count:
        freq_count[key] +=1
    else:
        freq_count[key] = 1

In [15]:
len(freq_count)

1933

In [16]:
freq_count

{'special lady': 1,
 'different account': 6,
 'nice service': 1,
 'best staff': 2,
 'online bank': 1,
 'prompt decision': 1,
 'good customer': 10,
 'important person': 1,
 'best interest': 4,
 'wonderful relief': 1,
 'wonderful reminder': 1,
 'sweet lady': 1,
 'courteous time': 1,
 'helpful thank': 6,
 'sincere gratitude': 1,
 'special time': 2,
 'fond holiday': 1,
 'preferred location': 1,
 'great agent': 1,
 'excellent customer': 20,
 'grateful assistance': 1,
 'great staff': 6,
 'easy opening': 2,
 'great company': 3,
 'wiht customer': 1,
 'ready sign': 4,
 'easy talk': 3,
 'friendly point': 1,
 'pleased timeliness': 1,
 'little direction': 1,
 'smooth process': 5,
 'prompt customer': 1,
 'financial need': 12,
 'excellent rate': 2,
 'great experience': 63,
 'new customer': 4,
 'last month': 2,
 'easy work': 27,
 'fast response': 3,
 'fast releasing': 1,
 'interested daughter': 1,
 'enough help': 1,
 'many year': 29,
 'friendly make': 1,
 'overall experience': 4,
 'pleased customer':

In [17]:
unique_keywords = freq_count.keys()
unique_keywords

dict_keys(['special lady', 'different account', 'nice service', 'best staff', 'online bank', 'prompt decision', 'good customer', 'important person', 'best interest', 'wonderful relief', 'wonderful reminder', 'sweet lady', 'courteous time', 'helpful thank', 'sincere gratitude', 'special time', 'fond holiday', 'preferred location', 'great agent', 'excellent customer', 'grateful assistance', 'great staff', 'easy opening', 'great company', 'wiht customer', 'ready sign', 'easy talk', 'friendly point', 'pleased timeliness', 'little direction', 'smooth process', 'prompt customer', 'financial need', 'excellent rate', 'great experience', 'new customer', 'last month', 'easy work', 'fast response', 'fast releasing', 'interested daughter', 'enough help', 'many year', 'friendly make', 'overall experience', 'pleased customer', 'short notice', 'smooth hassle', 'personal loan', 'dugood bank', 'low interest', 'fast sign', 'free pleasure', 'tara baloney', 'favorite place', 'happy holiday', 'welcome atmu

In [None]:
clustures = []

used_keywords = []

for i in tqdm(unique_keywords):
    current_custer = [i]
    
    if (i in used_keywords):
        continue
    
    used_keywords.append(i)
    
    
    for j in unique_keywords:
        if  (j in used_keywords):
            continue
    
        current_score = similarity_calc(i,j)
        if current_score>0.7:
            current_custer.append(j)
            used_keywords.append(j)
            
    
    clustures.append(current_custer)
        
#     break

  1%|▎                                     | 14/1933 [07:48<14:36:24, 27.40s/it]

In [None]:
len(clustures)

In [None]:
clustures[:10]

In [None]:
final_result = []
for cp in clustures:
    cluster_key_count = {}
    for key_c in cp:
        cluster_key_count[key_c] = freq_count[key_c]
    
    items = []
    for i in list(tuple(cluster_key_count.keys())):
        items.append(i.split()[0])
    dict1 = {}
    for item in items:
        if not item in dict1:
            dict1[item] = items.count(item)
    dict1 = sorted(dict1.keys(), reverse=True)[0]

    curr_cul_keys = list(cluster_key_count.keys())
    curr_cul_values = list(cluster_key_count.values())
    sum_of_similar_words = np.sum(curr_cul_values)
    highest_count_key = curr_cul_keys[np.argmax(curr_cul_values)]
    highest_count_value = np.max(curr_cul_values)
    final_result.append([cluster_key_count,sum_of_similar_words,highest_count_key,len(cluster_key_count), dict1])
    
#     break
        

In [None]:
final_result

In [None]:
df = pd.DataFrame(final_result,columns=["similar_words_and_count","count_of_similar_words","most_used_similar_word","total_similar_words", "key_expressions"])

In [None]:
sorted_df = df.sort_values('total_similar_words',ascending=False).reset_index(drop=True)

In [None]:
sorted_df

In [None]:
sorted_df.to_csv("/home/heptagon/Desktop/nps_review_analysis/most_keywords.csv", index=False)

In [None]:
print((sorted_df.most_used_similar_word).to_list()[:20])