In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings("ignore")


In [2]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min = 1e-9)


def embeding_calc(sentences):

    # tokenizing
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')

    # passing though model
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings



In [4]:
result = embeding_calc(['balanced sound','good sound'])

In [5]:
len(result)

2

In [6]:
score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))

In [7]:
score[0][0]

0.7417778

In [8]:
def similarity_calc(input1,input2):
    result = embeding_calc([input1,input2])
    score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))
    return score[0][0]

In [9]:
similarity_calc('balanced sound','poor bose')

0.16619003

# Data processing

In [10]:
df = pd.read_csv('/home/heptagon/Desktop/nps_review_analysis/key_phrases_new.csv')

In [11]:
df.shape

(93, 4)

In [12]:
df.head()

Unnamed: 0,comments,topic,type,sentiment
0,The one is very excellent I have used many Blu...,"balanced sound,poor bose,technical team",appreciation,POSITIVE
1,I have been using it from months extensivelyIt...,"portable speaker,sound quality,high portabilit...",information,POSITIVE
2,Review after a day of full usagea Bose seems t...,"full usagea,sweet spot,ii generation,heavy bas...",information,NEGATIVE
3,So the clarity is crisp you can hear the sound...,sound quality,information,POSITIVE
4,Costly but the best blutooth speaker You will ...,best blutooth,appreciation,POSITIVE


In [13]:
all_keyword = []
for key_str in tqdm(df.topic):
#     print(key_str)
    splited_str = key_str.split(',')
    if len(splited_str)>0:
        all_keyword += splited_str
#     break

all_keyword

100%|███████████████████████████████████████| 93/93 [00:00<00:00, 237413.43it/s]


['balanced sound',
 'poor bose',
 'technical team',
 'portable speaker',
 'sound quality',
 'high portability',
 'sound quality',
 'perfect trip',
 'full volume',
 'full week',
 'daily music',
 'worth money',
 'perfect condition',
 'full price',
 'individual perception',
 'full usagea',
 'sweet spot',
 'ii generation',
 'heavy bass',
 'equaliser source',
 'sound punchyb',
 'old song',
 'bt connectivity',
 'bt version',
 'basic equilizer',
 'timesoverall verdict',
 'basic job',
 'sound quality',
 'best blutooth',
 'much product',
 'worth shipment',
 'outer box',
 'thin plastic',
 'rough handling',
 'outer cardboard',
 'poor experience',
 'beautiful box',
 'good product',
 'perfect box',
 'original factory',
 'good deal',
 'little beauty',
 'vividh bharti',
 'marvellous sound',
 'transparent sound',
 'good sound',
 'little genius',
 'sound clarity',
 'transparent quality',
 'tiny speaker',
 'treble bass',
 'foolish compare',
 'bose sound',
 'twice music',
 'loud election',
 'sound produc

In [14]:
freq_count = {}
for key in all_keyword:
    if key in freq_count:
        freq_count[key] +=1
    else:
        freq_count[key] = 1

In [15]:
len(freq_count)

264

In [16]:
freq_count

{'balanced sound': 2,
 'poor bose': 1,
 'technical team': 1,
 'portable speaker': 7,
 'sound quality': 27,
 'high portability': 1,
 'perfect trip': 1,
 'full volume': 1,
 'full week': 1,
 'daily music': 1,
 'worth money': 2,
 'perfect condition': 2,
 'full price': 1,
 'individual perception': 1,
 'full usagea': 1,
 'sweet spot': 1,
 'ii generation': 1,
 'heavy bass': 1,
 'equaliser source': 1,
 'sound punchyb': 1,
 'old song': 1,
 'bt connectivity': 1,
 'bt version': 1,
 'basic equilizer': 1,
 'timesoverall verdict': 1,
 'basic job': 1,
 'best blutooth': 1,
 'much product': 1,
 'worth shipment': 1,
 'outer box': 1,
 'thin plastic': 1,
 'rough handling': 1,
 'outer cardboard': 1,
 'poor experience': 1,
 'beautiful box': 1,
 'good product': 1,
 'perfect box': 1,
 'original factory': 1,
 'good deal': 1,
 'little beauty': 1,
 'vividh bharti': 1,
 'marvellous sound': 1,
 'transparent sound': 1,
 'good sound': 2,
 'little genius': 1,
 'sound clarity': 2,
 'transparent quality': 1,
 'tiny spe

In [17]:
unique_keywords = freq_count.keys()
unique_keywords

dict_keys(['balanced sound', 'poor bose', 'technical team', 'portable speaker', 'sound quality', 'high portability', 'perfect trip', 'full volume', 'full week', 'daily music', 'worth money', 'perfect condition', 'full price', 'individual perception', 'full usagea', 'sweet spot', 'ii generation', 'heavy bass', 'equaliser source', 'sound punchyb', 'old song', 'bt connectivity', 'bt version', 'basic equilizer', 'timesoverall verdict', 'basic job', 'best blutooth', 'much product', 'worth shipment', 'outer box', 'thin plastic', 'rough handling', 'outer cardboard', 'poor experience', 'beautiful box', 'good product', 'perfect box', 'original factory', 'good deal', 'little beauty', 'vividh bharti', 'marvellous sound', 'transparent sound', 'good sound', 'little genius', 'sound clarity', 'transparent quality', 'tiny speaker', 'treble bass', 'foolish compare', 'bose sound', 'twice music', 'loud election', 'sound product', 'worthy product', 'excellent quality', 'exceptional quality', 'full sound',

In [55]:
clustures = []

used_keywords = []

for i in tqdm(unique_keywords):
    current_custer = [i]
    
    if (i in used_keywords):
        continue
    
    used_keywords.append(i)
    
    
    for j in unique_keywords:
        if  (j in used_keywords):
            continue
    
        current_score = similarity_calc(i,j)
        if current_score>0.7:
            current_custer.append(j)
            used_keywords.append(j)
            
    
    clustures.append(current_custer)
        
#     break

100%|█████████████████████████████████████████| 264/264 [05:34<00:00,  1.27s/it]


In [56]:
len(clustures)

160

In [57]:
clustures[:10]

[['balanced sound',
  'sound quality',
  'good sound',
  'sound clarity',
  'bose sound',
  'sound product',
  'full sound',
  'clear sound',
  'great sound',
  'crisp sound',
  'natural sound',
  'best sound',
  'quick sound',
  'sound type',
  'audio quality',
  'instrument sound',
  'sound system'],
 ['poor bose', 'good bose'],
 ['technical team'],
 ['portable speaker',
  'expensive speaker',
  'audio tech',
  'bose speaker',
  'small speaker',
  'connect speaker'],
 ['high portability'],
 ['perfect trip'],
 ['full volume', 'high volume', 'treble volume', 'moderate volume'],
 ['full week'],
 ['daily music',
  'twice music',
  'natural music',
  'overall music',
  'useful music',
  'amazing music',
  'bose music'],
 ['worth money',
  'worth investment',
  'good amount',
  'worth extent',
  'worth amount']]

In [123]:
final_result = []
for cp in clustures:
    cluster_key_count = {}
    for key_c in cp:
        cluster_key_count[key_c] = freq_count[key_c]
    
    items = []
    for i in list(tuple(cluster_key_count.keys())):
        items.append(i.split()[0])
    dict1 = {}
    for item in items:
        if not item in dict1:
            dict1[item] = items.count(item)
    dict1 = sorted(dict1.keys(), reverse=True)[0]

    curr_cul_keys = list(cluster_key_count.keys())
    curr_cul_values = list(cluster_key_count.values())
    sum_of_similar_words = np.sum(curr_cul_values)
    highest_count_key = curr_cul_keys[np.argmax(curr_cul_values)]
    highest_count_value = np.max(curr_cul_values)
    final_result.append([cluster_key_count,sum_of_similar_words,highest_count_key,len(cluster_key_count), dict1])
    
#     break
        

In [124]:
final_result

[[{'balanced sound': 2,
   'sound quality': 27,
   'good sound': 2,
   'sound clarity': 2,
   'bose sound': 1,
   'sound product': 1,
   'full sound': 1,
   'clear sound': 2,
   'great sound': 4,
   'crisp sound': 1,
   'natural sound': 1,
   'best sound': 1,
   'quick sound': 1,
   'sound type': 1,
   'audio quality': 1,
   'instrument sound': 1,
   'sound system': 1},
  50,
  'sound quality',
  17,
  'sound'],
 [{'poor bose': 1, 'good bose': 1}, 2, 'poor bose', 2, 'poor'],
 [{'technical team': 1}, 1, 'technical team', 1, 'technical'],
 [{'portable speaker': 7,
   'expensive speaker': 1,
   'audio tech': 1,
   'bose speaker': 2,
   'small speaker': 1,
   'connect speaker': 1},
  13,
  'portable speaker',
  6,
  'small'],
 [{'high portability': 1}, 1, 'high portability', 1, 'high'],
 [{'perfect trip': 1}, 1, 'perfect trip', 1, 'perfect'],
 [{'full volume': 1,
   'high volume': 2,
   'treble volume': 1,
   'moderate volume': 1},
  5,
  'high volume',
  4,
  'treble'],
 [{'full week': 1}

In [125]:
df = pd.DataFrame(final_result,columns=["similar_words_and_count","count_of_similar_words","most_used_similar_word","total_similar_words", "key_expressions"])

In [126]:
sorted_df = df.sort_values('total_similar_words',ascending=False).reset_index(drop=True)

In [127]:
sorted_df

Unnamed: 0,similar_words_and_count,count_of_similar_words,most_used_similar_word,total_similar_words,key_expressions
0,"{'balanced sound': 2, 'sound quality': 27, 'go...",50,sound quality,17,sound
1,"{'good product': 1, 'worthy product': 1, 'exce...",15,bose product,11,worthy
2,"{'heavy bass': 1, 'treble bass': 1, 'equalized...",10,much bass,8,treble
3,"{'full price': 1, 'different price': 1, 'corre...",7,full price,7,ok
4,"{'daily music': 1, 'twice music': 1, 'natural ...",7,daily music,7,useful
...,...,...,...,...,...
155,{'previous version': 1},1,previous version,1,previous
156,{'good predecessor': 1},1,good predecessor,1,good
157,{'absolute beast': 1},1,absolute beast,1,absolute
158,{'trebble base': 1},1,trebble base,1,trebble


In [128]:
sorted_df.to_csv("/home/heptagon/Desktop/nps_review_analysis/most_keywords.csv", index=False)

In [129]:
print((sorted_df.most_used_similar_word).to_list()[:20])

['sound quality', 'bose product', 'much bass', 'full price', 'daily music', 'portable speaker', 'sound punchyb', 'exceptional quality', 'worth money', 'high volume', 'good deal', 'much product', 'top speaker', 'satisfied speaker', 'good clarity', 'outer box', 'bt connectivity', 'good battery', 'portable home', 'original factory']
