In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings("ignore")


In [2]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min = 1e-9)


def embeding_calc(sentences):

    # tokenizing
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')

    # passing though model
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings



In [4]:
result = embeding_calc(['balanced sound','good sound'])

In [5]:
len(result)

2

In [6]:
score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))

In [7]:
score[0][0]

0.7417778

In [8]:
def similarity_calc(input1,input2):
    result = embeding_calc([input1,input2])
    score = cosine_similarity(result[0].reshape(1, -1),result[1].reshape(1, -1))
    return score[0][0]

In [9]:
similarity_calc('balanced sound','poor bose')

0.16619003

# Data processing

In [30]:
df = pd.read_csv('/home/heptagon/Desktop/nps_review_analysis/dataset/network_capital_funding_detail_review_topic.csv')
df.head()

Unnamed: 0,comments,topic,sentiment,type
0,Working with Eddie Claus was a great experienc...,"great experience,previous loan,left company",NEGATIVE,information
1,Deiago Lorenzo excellent service My wife just ...,"excellent service,whole process,whole progress",POSITIVE,appreciation
2,Our experience with Network Capital was awesom...,knowledgeable job,POSITIVE,information
3,I recently expirenced an online home loan refi...,"online home,nervous process,technical skill,pa...",POSITIVE,information
4,Chris our processor was friendly personable an...,"hard team,stressful side,quick process",POSITIVE,information


In [13]:
all_keyword = []
for key_str in tqdm(df.topic):
#     print(key_str)
    splited_str = key_str.split(',')
    if len(splited_str)>0:
        all_keyword += splited_str
#     break

all_keyword

100%|█████████████████████████████████████| 315/315 [00:00<00:00, 333469.40it/s]


['great experience',
 'previous loan',
 'left company',
 'excellent service',
 'whole process',
 'whole progress',
 'knowledgeable job',
 'online home',
 'nervous process',
 'technical skill',
 'patient question',
 'best way',
 'entire process',
 'hard team',
 'stressful side',
 'quick process',
 'many year',
 'married home',
 'quick mortgage',
 'close contact',
 'great guy',
 'recommend service',
 'crazy money',
 'top notch',
 'long relationship',
 'front line',
 'best network',
 'overall experience',
 'much norm',
 'appraiser return',
 'round trip',
 'expensive amount',
 'original visit',
 'great word',
 'due way',
 'available answer',
 'upbeat attitude',
 'pleasing voice',
 'many year',
 'best knowledge',
 'great human',
 'nice lady',
 'final process',
 'future assistance',
 'shortest time',
 'best outcome',
 'accurate information',
 'good work',
 'new property',
 'whole package',
 'fantastic work',
 'much leg',
 'indispensable process',
 'great rate',
 'courteous deliver',
 'whole 

In [14]:
freq_count = {}
for key in all_keyword:
    if key in freq_count:
        freq_count[key] +=1
    else:
        freq_count[key] = 1

In [15]:
len(freq_count)

751

In [16]:
freq_count

{'great experience': 10,
 'previous loan': 1,
 'left company': 1,
 'excellent service': 9,
 'whole process': 19,
 'whole progress': 1,
 'knowledgeable job': 1,
 'online home': 1,
 'nervous process': 1,
 'technical skill': 1,
 'patient question': 1,
 'best way': 1,
 'entire process': 15,
 'hard team': 1,
 'stressful side': 1,
 'quick process': 1,
 'many year': 3,
 'married home': 1,
 'quick mortgage': 1,
 'close contact': 1,
 'great guy': 3,
 'recommend service': 1,
 'crazy money': 1,
 'top notch': 3,
 'long relationship': 1,
 'front line': 1,
 'best network': 1,
 'overall experience': 3,
 'much norm': 1,
 'appraiser return': 1,
 'round trip': 1,
 'expensive amount': 1,
 'original visit': 1,
 'great word': 1,
 'due way': 1,
 'available answer': 2,
 'upbeat attitude': 1,
 'pleasing voice': 1,
 'best knowledge': 1,
 'great human': 1,
 'nice lady': 1,
 'final process': 1,
 'future assistance': 1,
 'shortest time': 1,
 'best outcome': 1,
 'accurate information': 1,
 'good work': 2,
 'new pr

In [17]:
unique_keywords = freq_count.keys()
unique_keywords

dict_keys(['great experience', 'previous loan', 'left company', 'excellent service', 'whole process', 'whole progress', 'knowledgeable job', 'online home', 'nervous process', 'technical skill', 'patient question', 'best way', 'entire process', 'hard team', 'stressful side', 'quick process', 'many year', 'married home', 'quick mortgage', 'close contact', 'great guy', 'recommend service', 'crazy money', 'top notch', 'long relationship', 'front line', 'best network', 'overall experience', 'much norm', 'appraiser return', 'round trip', 'expensive amount', 'original visit', 'great word', 'due way', 'available answer', 'upbeat attitude', 'pleasing voice', 'best knowledge', 'great human', 'nice lady', 'final process', 'future assistance', 'shortest time', 'best outcome', 'accurate information', 'good work', 'new property', 'whole package', 'fantastic work', 'much leg', 'indispensable process', 'great rate', 'courteous deliver', 'courteous professional', 'full detail', 'right angel', 'whole ti

In [18]:
clustures = []

used_keywords = []

for i in tqdm(unique_keywords):
    current_custer = [i]
    
    if (i in used_keywords):
        continue
    
    used_keywords.append(i)
    
    
    for j in unique_keywords:
        if  (j in used_keywords):
            continue
    
        current_score = similarity_calc(i,j)
        if current_score>0.7:
            current_custer.append(j)
            used_keywords.append(j)
            
    
    clustures.append(current_custer)
        
#     break

100%|█████████████████████████████████████████| 751/751 [46:03<00:00,  3.68s/it]


In [19]:
len(clustures)

384

In [20]:
clustures[:10]

[['great experience',
  'overall experience',
  'great job',
  'wonderful experience',
  'excellent experience',
  'current experience',
  'marvelous job',
  'best experience',
  'extraordinary experience',
  'incredible job',
  'great service',
  'several experience',
  'positive experience',
  'awesome experience',
  'wonderful job',
  'great result',
  'excellent job',
  'fantastic experience',
  'amazing experience',
  'pleasant experience',
  'great thing',
  'entire experience',
  'helpful experience',
  'awesome job',
  'successful experience',
  'good experience',
  'fantastic job',
  'genuine experience'],
 ['previous loan',
  'qualified loan',
  'senior loan',
  'best loan',
  'perfect loan',
  'total loan',
  'unorganized loan',
  'complete loan',
  'sure loan',
  'necessary loan',
  'interested loan',
  'new loan',
  'abreast loan',
  'previous lender',
  'current lender',
  'previous refinance',
  'informed loan',
  'conventional loan',
  'previous mortgage'],
 ['left comp

In [21]:
final_result = []
for cp in clustures:
    cluster_key_count = {}
    for key_c in cp:
        cluster_key_count[key_c] = freq_count[key_c]
    
    items = []
    for i in list(tuple(cluster_key_count.keys())):
        items.append(i.split()[0])
    dict1 = {}
    for item in items:
        if not item in dict1:
            dict1[item] = items.count(item)
    dict1 = sorted(dict1.keys(), reverse=True)[0]

    curr_cul_keys = list(cluster_key_count.keys())
    curr_cul_values = list(cluster_key_count.values())
    sum_of_similar_words = np.sum(curr_cul_values)
    highest_count_key = curr_cul_keys[np.argmax(curr_cul_values)]
    highest_count_value = np.max(curr_cul_values)
    final_result.append([cluster_key_count,sum_of_similar_words,highest_count_key,len(cluster_key_count), dict1])
    
#     break
        

In [22]:
final_result

[[{'great experience': 10,
   'overall experience': 3,
   'great job': 9,
   'wonderful experience': 5,
   'excellent experience': 2,
   'current experience': 1,
   'marvelous job': 1,
   'best experience': 3,
   'extraordinary experience': 1,
   'incredible job': 1,
   'great service': 4,
   'several experience': 1,
   'positive experience': 2,
   'awesome experience': 1,
   'wonderful job': 2,
   'great result': 1,
   'excellent job': 1,
   'fantastic experience': 1,
   'amazing experience': 2,
   'pleasant experience': 4,
   'great thing': 1,
   'entire experience': 1,
   'helpful experience': 1,
   'awesome job': 1,
   'successful experience': 2,
   'good experience': 4,
   'fantastic job': 1,
   'genuine experience': 1},
  67,
  'great experience',
  28,
  'wonderful'],
 [{'previous loan': 1,
   'qualified loan': 1,
   'senior loan': 2,
   'best loan': 4,
   'perfect loan': 1,
   'total loan': 1,
   'unorganized loan': 1,
   'complete loan': 1,
   'sure loan': 1,
   'necessary loa

In [23]:
df = pd.DataFrame(final_result,columns=["similar_words_and_count","count_of_similar_words","most_used_similar_word","total_similar_words", "associated_words"])

In [24]:
sorted_df = df.sort_values('total_similar_words',ascending=False).reset_index(drop=True)

In [25]:
sorted_df

Unnamed: 0,similar_words_and_count,count_of_similar_words,most_used_similar_word,total_similar_words,associated_words
0,"{'great experience': 10, 'overall experience':...",67,great experience,28,wonderful
1,"{'previous loan': 1, 'qualified loan': 1, 'sen...",24,best loan,19,unorganized
2,"{'quick mortgage': 1, 'future mortgage': 1, 'n...",16,quick mortgage,16,right
3,"{'professional job': 1, 'best job': 2, 'profes...",16,best job,15,well
4,"{'good work': 2, 'fantastic work': 3, 'good th...",35,great work,15,wonderful
...,...,...,...,...,...
379,{'best lot': 1},1,best lot,1,best
380,{'amazing people': 2},2,amazing people,1,amazing
381,{'difficult circumstance': 1},1,difficult circumstance,1,difficult
382,{'difficult set': 1},1,difficult set,1,difficult


In [26]:
sorted_df.to_csv("/home/heptagon/Desktop/nps_review_analysis/dataset/network_capital_funding_detail_review_words.csv", index=False)

In [27]:
print((sorted_df.most_used_similar_word).to_list()[:20])

['great experience', 'best loan', 'quick mortgage', 'best job', 'great work', 'excellent service', 'whole process', 'easy thank', 'great company', 'great team', 'thankful way', 'current refinance', 'easiest refi', 'courteous deliver', 'stressful process', 'great rate', 'recommend network', 'good communication', 'best knowledge', 'good job']


In [28]:
print((sorted_df.associated_words).to_list()[:20])

['wonderful', 'unorganized', 'right', 'well', 'wonderful', 'satisfied', 'whole', 'special', 'wonderful', 'whole', 'thankful', 'twice', 'several', 'sincere', 'stressful', 'highest', 'tremendous', 'streamlined', 'sure', 'knowledgeable']


In [29]:
# Problem Statement : Now can you find out the keywords or expressions related to these keywords
#   Ex: Battery: Good, bad, slow charging etc, for wach keyword

# we are getting list of keywords so
# 1. currently taking 1st most word from the keywords
# 2. Thinking : To apply a specific Algo on entire datasets
# 3. Also thinking the pattern matching but, it will return entire sentence not a specific words.


#  https://www.analyticsvidhya.com/blog/2020/06/nlp-project-information-extraction/#h2_11

###  Associated words using patterns

In [38]:
import spacy

In [41]:
def rule3(text): 
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)    
    sent = []
    for token in doc: 
#         print("Token : ",token)

        if token.pos_=='ADP':    # look for prepositions
            phrase = ''      
            
            if token.head.pos_=='NOUN':   # if its head word is a noun
                phrase += token.head.text    # append noun and preposition to phrase
                phrase += ' '+token.text

                for right_tok in token.rights:    # check the nodes to the right of the preposition
                    if (right_tok.pos_ in ['NOUN','PROPN']):    # append if it is a noun or proper noun
                        phrase += ' '+right_tok.text
                if len(phrase)>5:
                    sent.append(phrase)
                
    return sent

In [42]:
for text in df.comments:
    print(rule3(text))

[]
['asset to operation']
['experience with Capital']
['refinance with Fundinding', 'step of way', 'questions about documents', 'employees of Funding']
['progress of refinance', 'side of things']
['transactions over years', 'contact with']
[]
['cost of Appraisal', 'amount of visit']
[]
['transactions over years']
['knowledge on']
['pleasure of', 'processing of loan']
['process for', 'refinance for property']
['efforts in', 'purchase of property', 'property in Beach', 'package of loans']
[]
[]
['dealings with']
[]
['approval for repairs', 'part of reason', 'capital of']
['results of refinance']
['spite of obstacles', 'bulls with bone', 'things about']
[]
['Experience with Lily', 'fault of']
['others behind scene', 'Thanks for Help']
['difference in lives']
[]
[]
['time in life']
['process of Capital', 'asset to company']
['processing of loan']
['lack of communication', 'calls from Melissa']
['processing with financer', 'execution of process']
[]
['need of services']
[]
[]
['things about

['services of capital', 'team of M', 'group of guys', 'firm of capital', 'thanks on behalf', 'behalf of Cassell']
['interactions with Zac']
['information With mortgages']
['transition in matter']
['interest as']
[]
['try with rating']
[]
[]
['none of companies', 'representative for company']
['experience for loan', 'place of work', 'hold of Stephan']
['process of']
[]
