In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings("ignore")


In [2]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min = 1e-9)


def embeding_calc(sentences):

    # tokenizing
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')

    # passing though model
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings



In [6]:
result = embeding_calc(['balanced sound','good sound'])
result

tensor([[ 1.3816e-01, -2.8183e-01,  4.8961e-02, -5.4833e-01, -4.4829e-01,
         -9.4074e-02,  1.8008e-02, -2.3150e-01, -1.0735e-01, -1.7051e-01,
          8.0055e-02,  1.6925e-01,  3.1769e-01, -1.9897e-01,  7.5435e-02,
          2.0244e-01,  4.0411e-01,  3.6702e-01, -2.6712e-01,  2.2276e-01,
         -8.3083e-02,  1.1955e-01, -2.5500e-01,  1.7094e-01, -2.5472e-01,
          3.7326e-01, -4.4425e-01, -1.0340e-01,  1.6339e-01, -1.6743e-01,
          2.5092e-01,  1.5527e-02,  4.6127e-01, -4.6026e-01, -4.9595e-01,
         -2.3598e-01,  2.2898e-01, -1.0325e-01, -5.1157e-01,  2.9932e-01,
         -5.6495e-02,  2.5054e-01,  3.5081e-02, -2.2856e-01, -1.1837e-01,
         -5.3377e-02, -4.1719e-01,  1.4727e-01, -2.5829e-02,  1.5994e-01,
         -3.5641e-01,  1.6821e-01,  5.6168e-02,  1.6326e-01, -4.2926e-01,
          3.2207e-02,  2.7500e-01,  6.7067e-01, -2.5572e-01,  5.8163e-02,
          7.0676e-02, -2.5975e-01, -2.1105e-01,  2.2047e-01, -9.3347e-02,
          6.2570e-01,  1.9075e-01,  8.

In [5]:
len(result)

2

In [6]:
score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))

In [7]:
score[0][0]

0.7417778

In [8]:
def similarity_calc(input1,input2):
    result = embeding_calc([input1,input2])
    score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))
    return score[0][0]

In [9]:
similarity_calc('balanced sound','poor bose')

0.16619003

# Data processing

In [10]:
df = pd.read_csv('/home/heptagon/Desktop/nps_review_analysis/dataset/amazon_mobile_review_topic.csv')
df.head()

Unnamed: 0,comments,topic,sentiment,type
0,The phone looks like it was brand new everythi...,"phone look, no scratch, scratch shipping, arri...",POSITIVE,information
1,This shipped so fast and my son was so excited...,"shipped fast excited son, great condition phon...",POSITIVE,appreciation
2,I am very pleased with my phone It was in grea...,pleased great working affordable shape,POSITIVE,appreciation
3,First off this is NOT a verizon phone I put th...,"verizon phone, verizon sim card, cell service,...",NEGATIVE,information
4,This phone is advertised as Verizon it DOES NO...,"verizon account, huge waste time, time money",NEGATIVE,information


In [11]:
all_keyword = []
for key_str in tqdm(df.topic):
#     print(key_str)
    splited_str = key_str.split(',')
    if len(splited_str)>0:
        all_keyword += splited_str
#     break

all_keyword

100%|█████████████████████████████████████| 314/314 [00:00<00:00, 258186.92it/s]


['phone look',
 ' no scratch',
 ' scratch shipping',
 ' arrived time',
 ' great unlocked work',
 ' perfect mexican carrier',
 ' sure using problem',
 ' u mobile',
 'shipped fast excited son',
 ' great condition phone',
 ' money thanks',
 ' fast shipping',
 'pleased great working affordable shape',
 'verizon phone',
 ' verizon sim card',
 ' cell service',
 ' serial number',
 ' agian phone',
 'verizon account',
 ' huge waste time',
 ' time money',
 'nice phone paid price',
 ' sent review',
 'gizmo trader',
 ' scratch good company',
 ' described better product',
 'button mess',
 ' mess lot',
 'great seller',
 ' work version',
 ' many version problem',
 'apple product',
 ' product rep verizon store',
 ' phone verizon',
 ' verizon system',
 ' closer inspection',
 ' home button',
 ' correct center',
 ' center body phone',
 ' dealing phone process',
 ' refund purchase',
 ' different supplier',
 'great minor scratch',
 ' fine battery short life',
 ' daily overall good major problem charger',
 

In [12]:
freq_count = {}
for key in all_keyword:
    if key in freq_count:
        freq_count[key] +=1
    else:
        freq_count[key] = 1

In [13]:
len(freq_count)

993

In [14]:
freq_count

{'phone look': 1,
 ' no scratch': 4,
 ' scratch shipping': 1,
 ' arrived time': 1,
 ' great unlocked work': 1,
 ' perfect mexican carrier': 1,
 ' sure using problem': 1,
 ' u mobile': 1,
 'shipped fast excited son': 1,
 ' great condition phone': 1,
 ' money thanks': 1,
 ' fast shipping': 1,
 'pleased great working affordable shape': 1,
 'verizon phone': 1,
 ' verizon sim card': 1,
 ' cell service': 1,
 ' serial number': 1,
 ' agian phone': 1,
 'verizon account': 1,
 ' huge waste time': 1,
 ' time money': 1,
 'nice phone paid price': 1,
 ' sent review': 1,
 'gizmo trader': 1,
 ' scratch good company': 1,
 ' described better product': 1,
 'button mess': 1,
 ' mess lot': 1,
 'great seller': 1,
 ' work version': 1,
 ' many version problem': 1,
 'apple product': 1,
 ' product rep verizon store': 1,
 ' phone verizon': 1,
 ' verizon system': 1,
 ' closer inspection': 1,
 ' home button': 1,
 ' correct center': 1,
 ' center body phone': 1,
 ' dealing phone process': 1,
 ' refund purchase': 1,
 

In [15]:
unique_keywords = freq_count.keys()
unique_keywords

dict_keys(['phone look', ' no scratch', ' scratch shipping', ' arrived time', ' great unlocked work', ' perfect mexican carrier', ' sure using problem', ' u mobile', 'shipped fast excited son', ' great condition phone', ' money thanks', ' fast shipping', 'pleased great working affordable shape', 'verizon phone', ' verizon sim card', ' cell service', ' serial number', ' agian phone', 'verizon account', ' huge waste time', ' time money', 'nice phone paid price', ' sent review', 'gizmo trader', ' scratch good company', ' described better product', 'button mess', ' mess lot', 'great seller', ' work version', ' many version problem', 'apple product', ' product rep verizon store', ' phone verizon', ' verizon system', ' closer inspection', ' home button', ' correct center', ' center body phone', ' dealing phone process', ' refund purchase', ' different supplier', 'great minor scratch', ' fine battery short life', ' daily overall good major problem charger', ' phone work', ' free second one', 

In [16]:
clustures = []

used_keywords = []

for i in tqdm(unique_keywords):
    current_custer = [i]
    
    if (i in used_keywords):
        continue
    
    used_keywords.append(i)
    
    
    for j in unique_keywords:
        if  (j in used_keywords):
            continue
    
        current_score = similarity_calc(i,j)
        if current_score>0.7:
            current_custer.append(j)
            used_keywords.append(j)
            
    
    clustures.append(current_custer)
        
#     break

100%|███████████████████████████████████████| 993/993 [1:27:40<00:00,  5.30s/it]


In [17]:
len(clustures)

571

In [18]:
clustures[:10]

[['phone look',
  ' agian phone',
  ' phone verizon',
  ' phone work',
  'phone type',
  ' phone screen',
  'first phone',
  'received phone',
  'phone work',
  'phone place',
  'needed phone',
  ' phone people',
  ' use phone',
  'phone line',
  ' another phone',
  'good phone',
  ' phone overall look',
  'phone use',
  ' basic phone',
  'cell phone',
  ' ok phone',
  'second phone',
  'phone feature',
  ' phone apps',
  ' application phone',
  ' apps phone',
  ' internet phone',
  ' wanted phone',
  ' actual phone',
  ' phone trip',
  ' place phone look',
  'phone screen',
  ' call cell phone',
  ' purpose phone',
  ' cell phone'],
 [' no scratch',
  'great minor scratch',
  'scratch work',
  ' no scrape',
  ' no scratch front',
  ' no scratch crack',
  ' terrible scratch',
  ' no scratch programming work'],
 [' scratch shipping'],
 [' arrived time', ' time order', 'time trip'],
 [' great unlocked work', ' great work', 'great work'],
 [' perfect mexican carrier'],
 [' sure using prob

In [19]:
final_result = []
for cp in clustures:
    cluster_key_count = {}
    for key_c in cp:
        cluster_key_count[key_c] = freq_count[key_c]
    
    items = []
    for i in list(tuple(cluster_key_count.keys())):
        items.append(i.split()[0])
    dict1 = {}
    for item in items:
        if not item in dict1:
            dict1[item] = items.count(item)
    dict1 = sorted(dict1.keys(), reverse=True)[0]

    curr_cul_keys = list(cluster_key_count.keys())
    curr_cul_values = list(cluster_key_count.values())
    sum_of_similar_words = np.sum(curr_cul_values)
    highest_count_key = curr_cul_keys[np.argmax(curr_cul_values)]
    highest_count_value = np.max(curr_cul_values)
    final_result.append([cluster_key_count,sum_of_similar_words,highest_count_key,len(cluster_key_count), dict1])
    
#     break
        

In [20]:
final_result

[[{'phone look': 1,
   ' agian phone': 1,
   ' phone verizon': 1,
   ' phone work': 2,
   'phone type': 1,
   ' phone screen': 4,
   'first phone': 1,
   'received phone': 1,
   'phone work': 4,
   'phone place': 1,
   'needed phone': 1,
   ' phone people': 2,
   ' use phone': 2,
   'phone line': 1,
   ' another phone': 1,
   'good phone': 1,
   ' phone overall look': 1,
   'phone use': 1,
   ' basic phone': 1,
   'cell phone': 1,
   ' ok phone': 1,
   'second phone': 1,
   'phone feature': 1,
   ' phone apps': 1,
   ' application phone': 1,
   ' apps phone': 1,
   ' internet phone': 1,
   ' wanted phone': 1,
   ' actual phone': 1,
   ' phone trip': 1,
   ' place phone look': 1,
   'phone screen': 1,
   ' call cell phone': 1,
   ' purpose phone': 1,
   ' cell phone': 1},
  44,
  ' phone screen',
  35,
  'wanted'],
 [{' no scratch': 4,
   'great minor scratch': 1,
   'scratch work': 1,
   ' no scrape': 2,
   ' no scratch front': 1,
   ' no scratch crack': 1,
   ' terrible scratch': 1,
 

In [21]:
df = pd.DataFrame(final_result,columns=["similar_words_and_count","count_of_similar_words","most_used_similar_word","total_similar_words", "associated_words"])

In [22]:
sorted_df = df.sort_values('total_similar_words',ascending=False).reset_index(drop=True)

In [23]:
sorted_df

Unnamed: 0,similar_words_and_count,count_of_similar_words,most_used_similar_word,total_similar_words,associated_words
0,"{'phone look': 1, ' agian phone': 1, ' phone v...",44,phone screen,35,wanted
1,"{' great condition phone': 1, 'phone perfect':...",28,awesome phone mine,25,thrilled
2,"{'great seller': 1, ' excellent price': 1, ' s...",22,great seller brand,21,use
3,"{'apple product': 1, 'advertised apple iphone'...",24,apple store,18,phone
4,"{' old charger': 1, ' either charger': 1, ' ch...",18,new charger,12,plugged
...,...,...,...,...,...
566,{' mine home apple': 1},1,mine home apple,1,mine
567,{' stand use': 1},1,stand use,1,stand
568,{' iphone birthday': 1},1,iphone birthday,1,iphone
569,{' n thing wife': 1},1,n thing wife,1,n


In [24]:
sorted_df.to_csv("/home/heptagon/Desktop/nps_review_analysis/dataset/amazon_mobile_review_associated_words.csv", index=False)

In [25]:
print((sorted_df.most_used_similar_word).to_list()[:20])

[' phone screen', 'awesome phone mine', ' great seller brand', ' apple store', ' new charger', 'good condition', 'great condition work', 'iphone manner phone work', ' paid return seller', 'purchasing experience', ' no scratch', ' sim card', ' new phone', ' download apps', ' telephone company', ' new speedy shipping', ' bad battery', ' great product', ' phone seller', ' deal price']


In [26]:
print((sorted_df.associated_words).to_list()[:20])

['wanted', 'thrilled', 'use', 'phone', 'plugged', 'owned', 'new', 'verizon', 'returned', 'unlocked', 'terrible', 'verizon', 'update', 'system', 'telephone', 'super', 'worse', 'product', 'seller', 'plenty']


In [27]:
'''

['basic earbuds', 'sound quality', 'battery backup', 'basic product', 'no bass', 'product amazon',
'great price', 'noise cancellation', 'good quality', 'connectivity issue', 'decent decent sound experience',
'price point', 'bluetooth connectivity', 'awesome product rangebecause', 'low quality', 'affordable price',
'replacement issue', 'value money product', 'good deal', 'bulky size']

credit_union_of_new_jersey_detail_review
['credit union', 'great experience', 'loan process', 'great customer service', 'informative respectful
courteous efficient', 'person phone', 'staff work', 'account number', 'auto loan', 'best local bank',
'sure email', 'someone assistance', 'account change', 'prior appointment', 'bank guy', 'excellent amazing
attentive customer service', 'mortgage issue', 'helpful kind', 'person office', 'minute experience']

carousel_checks_detail_review
['new check', 'check order', 'carousel check', 'easy order', 'great price', 'customer service', 'great quality',
'fast delivery', 'bank address', 'orignial photo', 'great service', 'good price', 'correct information',
'check company', 'check photo', 'check cent', 'super fast delivery', 'duplicate check paper', 'time frame',
'repeat customer']

leaf_commercial_capital_inc_detail_review
['great job', 'leaf year', 'equipment business', 'simple loan', 'commercial financing', 'use leaf',
'loan representative', 'pleasure work', 'great company work', 'happy leaf', 'u equipment lease',
'funding department', 'u time', 'step way', 'easy quick business', 'leaf team', 'leaf time',
'usual eric amazing job', 'sure happy transaction', 'end lease']
    
nexo_detail_review
['crypto currency', 'current crypto environment', 'nexo support manager', 'nexo support', 'customer service',
 'quick useful customer service', 'nexo place', 'account use', 'support trust', 'best platform',
 'excellent customer service', 'easy use', 'nexo claim', 'best customer service', 'great experience nexo',
 'great support', 'external wallet', 'consideration experience', 'satisfied nexo point', 'service team']

By Current Algo
['basic earbuds', 'sound quality', 'battery backup', 'basic product', 'no bass', 'product amazon',
'great price', 'noise cancellation', 'good quality', 'connectivity issue', 'decent decent sound experience',
'price point', 'bluetooth connectivity', 'awesome product rangebecause', 'low quality', 'affordable price',
'replacement issue', 'value money product', 'good deal', 'bulky size']
 '''

"\n\n['basic earbuds', 'sound quality', 'battery backup', 'basic product', 'no bass', 'product amazon',\n'great price', 'noise cancellation', 'good quality', 'connectivity issue', 'decent decent sound experience',\n'price point', 'bluetooth connectivity', 'awesome product rangebecause', 'low quality', 'affordable price',\n'replacement issue', 'value money product', 'good deal', 'bulky size']\n\ncredit_union_of_new_jersey_detail_review\n['credit union', 'great experience', 'loan process', 'great customer service', 'informative respectful\ncourteous efficient', 'person phone', 'staff work', 'account number', 'auto loan', 'best local bank',\n'sure email', 'someone assistance', 'account change', 'prior appointment', 'bank guy', 'excellent amazing\nattentive customer service', 'mortgage issue', 'helpful kind', 'person office', 'minute experience']\n\ncarousel_checks_detail_review\n['new check', 'check order', 'carousel check', 'easy order', 'great price', 'customer service', 'great quality'

In [28]:
# Problem Statement : Now can you find out the keywords or expressions related to these keywords
#   Ex: Battery: Good, bad, slow charging etc, for wach keyword


    


#  https://www.analyticsvidhya.com/blog/2020/06/nlp-project-information-extraction/#h2_11

# two main task is in priority 1. keyword algo 2. associated words
# Plan : To Improve Main Keyword Algo and get associated words
# Try old algo (may get improved keywords) and few new ago also - Need 2 days to improve algo
# will try 2/3 algo to get associated words : also need 2/3 days

# we are getting list of keywords so
# 1. Implemanted : Currently taking 1st most word from the keywords
# 2. Working : To apply a specific Algo on entire datasets
# 3. Working : Also thinking the pattern matching, it will return entire sentence not a specific words. (as i noted on amazon website)
#     way : Extract sentense associated with keywords then apply the that method
