In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings("ignore")


In [2]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min = 1e-9)


def embeding_calc(sentences):

    # tokenizing
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')

    # passing though model
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings



In [4]:
result = embeding_calc(['balanced sound','good sound'])

In [5]:
len(result)

2

In [6]:
score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))

In [7]:
score[0][0]

0.7417778

In [8]:
def similarity_calc(input1,input2):
    result = embeding_calc([input1,input2])
    score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))
    return score[0][0]

In [9]:
similarity_calc('balanced sound','poor bose')

0.16619003

# Data processing

In [10]:
df = pd.read_csv('/home/heptagon/Desktop/nps_review_analysis/dataset/BankPanacea_detail_review_topic.csv')

In [11]:
df.shape

(337, 4)

In [12]:
df.head()

Unnamed: 0,comments,topic,sentiment,type
0,Smooth easy The loan was exactly as described ...,"easy loan,hidden fee",NEGATIVE,appreciation
1,The company is doing an amazing job at how pro...,"amazing job,professional customer",POSITIVE,appreciation
2,Parker Beller went out of his way to accommoda...,"medical school,hectic schedule,available time",NEGATIVE,information
3,Know your customer instructions need update an...,"better clarity,financial transaction,slow proc...",POSITIVE,information
4,All customer happily receive all answer happil...,great life,POSITIVE,appreciation


In [13]:
all_keyword = []
for key_str in tqdm(df.topic):
#     print(key_str)
    splited_str = key_str.split(',')
    if len(splited_str)>0:
        all_keyword += splited_str
#     break

all_keyword

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 337/337 [00:00<00:00, 254589.42it/s]


['easy loan',
 'hidden fee',
 'amazing job',
 'professional customer',
 'medical school',
 'hectic schedule',
 'available time',
 'better clarity',
 'financial transaction',
 'slow process',
 'current balance',
 'long time',
 'welcome package',
 'postal mail',
 'last name',
 'closed account',
 'great life',
 'pleasant response',
 'wonderful answer',
 'faster parker',
 'professional offering',
 'entire process',
 'efficient service',
 'grateful help',
 'highest professionalism',
 'medical office',
 'high yield',
 'medical student',
 'amazed account',
 'online bank',
 'personal banker',
 'primary contact',
 'private student',
 'great option',
 'monthly payment',
 'better interest',
 'primary care',
 'recommend service',
 'private loan',
 'multiple loan',
 'primary care',
 'monthly payment',
 'lower interest',
 'best experience',
 'myriad company',
 'easy effortless',
 'excellent service',
 'medical field',
 'financial service',
 'early career',
 'easy hope',
 'good thing',
 'closed loan'

In [14]:
freq_count = {}
for key in all_keyword:
    if key in freq_count:
        freq_count[key] +=1
    else:
        freq_count[key] = 1

In [15]:
len(freq_count)

773

In [16]:
freq_count

{'easy loan': 2,
 'hidden fee': 2,
 'amazing job': 2,
 'professional customer': 1,
 'medical school': 21,
 'hectic schedule': 1,
 'available time': 3,
 'better clarity': 1,
 'financial transaction': 2,
 'slow process': 1,
 'current balance': 1,
 'long time': 2,
 'welcome package': 1,
 'postal mail': 1,
 'last name': 1,
 'closed account': 1,
 'great life': 1,
 'pleasant response': 1,
 'wonderful answer': 1,
 'faster parker': 1,
 'professional offering': 1,
 'entire process': 10,
 'efficient service': 1,
 'grateful help': 3,
 'highest professionalism': 1,
 'medical office': 1,
 'high yield': 1,
 'medical student': 30,
 'amazed account': 1,
 'online bank': 2,
 'personal banker': 24,
 'primary contact': 1,
 'private student': 3,
 'great option': 3,
 'monthly payment': 5,
 'better interest': 1,
 'primary care': 24,
 'recommend service': 1,
 'private loan': 4,
 'multiple loan': 1,
 'lower interest': 2,
 'best experience': 3,
 'myriad company': 1,
 'easy effortless': 1,
 'excellent service': 

In [17]:
unique_keywords = freq_count.keys()
unique_keywords

dict_keys(['easy loan', 'hidden fee', 'amazing job', 'professional customer', 'medical school', 'hectic schedule', 'available time', 'better clarity', 'financial transaction', 'slow process', 'current balance', 'long time', 'welcome package', 'postal mail', 'last name', 'closed account', 'great life', 'pleasant response', 'wonderful answer', 'faster parker', 'professional offering', 'entire process', 'efficient service', 'grateful help', 'highest professionalism', 'medical office', 'high yield', 'medical student', 'amazed account', 'online bank', 'personal banker', 'primary contact', 'private student', 'great option', 'monthly payment', 'better interest', 'primary care', 'recommend service', 'private loan', 'multiple loan', 'lower interest', 'best experience', 'myriad company', 'easy effortless', 'excellent service', 'medical field', 'financial service', 'early career', 'easy hope', 'good thing', 'closed loan', 'amazing customer', 'great bank', 'dental office', 'polite people', 'smooth

In [18]:
clustures = []

used_keywords = []

for i in tqdm(unique_keywords):
    current_custer = [i]
    
    if (i in used_keywords):
        continue
    
    used_keywords.append(i)
    
    
    for j in unique_keywords:
        if  (j in used_keywords):
            continue
    
        current_score = similarity_calc(i,j)
        if current_score>0.7:
            current_custer.append(j)
            used_keywords.append(j)
            
    
    clustures.append(current_custer)
        
#     break

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 773/773 [31:27<00:00,  2.44s/it]


In [19]:
len(clustures)

358

In [20]:
clustures[:10]

[['easy loan',
  'private loan',
  'helpful loan',
  'personal loan',
  'quick loan',
  'entire loan',
  'future loan',
  'many loan',
  'extra loan',
  'responsive loan',
  'satisfied loan',
  'open loan',
  'pleasant loan',
  'easy banking',
  'medium loan',
  'awesome loan',
  'better loan',
  'simple loan',
  'individual loan',
  'transparent loan',
  'lower loan',
  'best loan',
  'easiest loan'],
 ['hidden fee', 'unexpected expense'],
 ['amazing job',
  'wonderful work',
  'amazing experience',
  'excellent experience',
  'great job',
  'amazing service',
  'great work',
  'great experience',
  'incredible experience',
  'fantastic work',
  'incredible job',
  'remarkable experience',
  'great process',
  'pleasant work',
  'nice work',
  'amazing work',
  'awesome experience',
  'brilliant fact'],
 ['professional customer',
  'professional offering',
  'amazing customer',
  'great customer',
  'professional service',
  'wonderful customer',
  'excellent customer',
  'proffesiona

In [42]:
final_result = []
for cp in clustures:
    cluster_key_count = {}
    for key_c in cp:
        cluster_key_count[key_c] = freq_count[key_c]
    
    items = []
    for i in list(tuple(cluster_key_count.keys())):
        items.append(i.split()[0])
    dict1 = {}
    for item in items:
        if not item in dict1:
            dict1[item] = items.count(item)
    dict1 = sorted(dict1.keys(), reverse=True)[0]

    curr_cul_keys = list(cluster_key_count.keys())
    curr_cul_values = list(cluster_key_count.values())
    sum_of_similar_words = np.sum(curr_cul_values)
    highest_count_key = curr_cul_keys[np.argmax(curr_cul_values)]
    highest_count_value = np.max(curr_cul_values)
    final_result.append([cluster_key_count,sum_of_similar_words,highest_count_key,len(cluster_key_count), dict1])
    
#     break
        

In [43]:
final_result

[[{'easy loan': 2,
   'private loan': 4,
   'helpful loan': 1,
   'personal loan': 22,
   'quick loan': 1,
   'entire loan': 2,
   'future loan': 2,
   'many loan': 1,
   'extra loan': 1,
   'responsive loan': 1,
   'satisfied loan': 1,
   'open loan': 1,
   'pleasant loan': 1,
   'easy banking': 1,
   'medium loan': 1,
   'awesome loan': 1,
   'better loan': 1,
   'simple loan': 1,
   'individual loan': 1,
   'transparent loan': 1,
   'lower loan': 1,
   'best loan': 1,
   'easiest loan': 1},
  50,
  'personal loan',
  23,
  'transparent'],
 [{'hidden fee': 2, 'unexpected expense': 2},
  4,
  'hidden fee',
  2,
  'unexpected'],
 [{'amazing job': 2,
   'wonderful work': 2,
   'amazing experience': 5,
   'excellent experience': 12,
   'great job': 2,
   'amazing service': 5,
   'great work': 4,
   'great experience': 26,
   'incredible experience': 1,
   'fantastic work': 1,
   'incredible job': 1,
   'remarkable experience': 1,
   'great process': 1,
   'pleasant work': 1,
   'nice wor

In [44]:
df = pd.DataFrame(final_result,columns=["similar_words_and_count","count_of_similar_words","most_used_similar_word","total_similar_words", "associated_words"])

In [45]:
sorted_df = df.sort_values('total_similar_words',ascending=False).reset_index(drop=True)

In [46]:
sorted_df

Unnamed: 0,similar_words_and_count,count_of_similar_words,most_used_similar_word,total_similar_words,associated_words
0,"{'easy loan': 2, 'private loan': 4, 'helpful l...",50,personal loan,23,transparent
1,"{'financial transaction': 2, 'financial servic...",33,financial situation,23,whole
2,"{'amazing job': 2, 'wonderful work': 2, 'amazi...",69,great experience,18,wonderful
3,"{'professional customer': 1, 'professional off...",39,excellent customer,15,wonderful
4,"{'efficient service': 1, 'excellent service': ...",23,excellent service,14,wonderful
...,...,...,...,...,...
353,{'early interest': 1},1,early interest,1,early
354,{'nice change': 1},1,nice change,1,nice
355,{'online bot': 1},1,online bot,1,online
356,{'efficient keeping': 1},1,efficient keeping,1,efficient


In [50]:
sorted_df.to_csv("/home/heptagon/Desktop/nps_review_analysis/dataset/BankPanacea_detail_associated_words.csv", index=False)

In [48]:
print((sorted_df.most_used_similar_word).to_list()[:20])

['personal loan', 'financial situation', 'great experience', 'excellent customer', 'excellent service', 'grateful help', 'medical student', 'excellent communication', 'good thing', 'personal banker', 'online bank', 'wonderful experience', 'easy application', 'hard credit', 'whole process', 'recommend panacea', 'financial need', 'financial institution', 'efficient process', 'high interest']


In [49]:
print((sorted_df.associated_words).to_list()[:20])

['transparent', 'whole', 'wonderful', 'wonderful', 'wonderful', 'willing', 'young', 'unparalleled', 'true', 'straightforward', 'regular', 'wonderful', 'straightforward', 'tremendous', 'whole', 'recommend', 'financial', 'financial', 'tedious', 'highest']


In [52]:
# Problem Statement : Now can you find out the keywords or expressions related to these keywords
#   Ex: Battery: Good, bad, slow charging etc, for wach keyword

# we are getting list of keywords so
# 1. currently taking 1st most word from the keywords
# 2. Thinking : To apply a specific Algo on entire datasets