In [1]:
import os
import pandas as pd
import numpy as np
import datetime

import tqdm
import string
import re
import itertools
from collections import defaultdict

import warnings
warnings.filterwarnings("error")
warnings.filterwarnings("ignore")

In [2]:
import spacy
import textacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
# Read the data and drop the null values

review_df = pd.read_csv("/home/heptagon/Desktop/nps_review_analysis/dataset/review_new.csv")
review_df = review_df.dropna().reset_index(drop=True)
df = review_df['comments']
df

0      the sous quality is world class but with the s...
1      The one is very excellent, I have used many Bl...
2                       quality sound with premium price
3      I have been using it from 3 months, extensivel...
4      Review after a  day of full usagea. Bose seems...
                             ...                        
109                            Amazing clarity of sound.
110                                                 Good
111    This is my second Bose revolve after owning fi...
112    great sound , portable , good battery life , v...
113    Item provides a great sound, wish the buttons ...
Name: comments, Length: 114, dtype: object

In [7]:
def get_clean_data(df):

#         df = self.read_dataset()
    clean_text = []
    for i in range(0, len(df)):
        review = re.sub('[^a-zA-Z]', ' ', str(df[i]))
        review = " ".join(re.split("\s+", review, flags=re.UNICODE))

        if review != ' ':
            clean_text.append(review)

    return clean_text
    
clean_list = get_clean_data(df)
clean_list

['the sous quality is world class but with the sterio the mono is only you can use for background for stereo order two and then feel the experienceas it is expensive too',
 'The one is very excellent I have used many Bluetooth speakers JBL charge JBL flip Transit by Soans Harman Kardon etc pure balanced sound is what you get The problems I faced are The Bose Connect app is very poor The Bose Revolve ii does not support VOIP calls WhatsApp Zoom says Bose india Bose India customer service s response was inadequate The technical team apparently lack product knowledge ',
 'quality sound with premium price',
 'I have been using it from months extensivelyIts a portable speaker so expect it to be like one My priority was great sound quality high portability this I m glad this is not chunky at all The sound quality is great gets too loud for indoor but for outdoor it s perfect for all the trips or camping Connectivity is fine but not impressive you will feel some lack of tech somewhere but it 

In [13]:
df_final = pd.DataFrame(clean_list, columns=['comments'])
df_final

Unnamed: 0,comments
0,the sous quality is world class but with the s...
1,The one is very excellent I have used many Blu...
2,quality sound with premium price
3,I have been using it from months extensivelyIt...
4,Review after a day of full usagea Bose seems t...
...,...
109,Amazing clarity of sound
110,Good
111,This is my second Bose revolve after owning fi...
112,great sound portable good battery life very ha...


In [7]:
def get_lemmatize_data(clean_list):

    # list of Stop Words Excluding (not, no)
    stop_words = stopwords.words('english')
    stop_words.remove("not")
    stop_words.remove("no")

    lemmatizer = WordNetLemmatizer()        # Apply Lemmatization
    corpus_list = []

#         data_list = self.get_clean_data()

    for i in range(0, len(clean_list)):
        review = str(clean_list[i]).lower()
        review = review.split()

        review = [lemmatizer.lemmatize(word) for word in review if not word in stop_words]
        review = ' '.join(review)
        corpus_list.append(review)

    # print(corpus_list)
    return corpus_list

key_corpus_list = get_lemmatize_data(clean_list)
key_corpus_list

['sou quality world class sterio mono use background stereo order two feel experienceas expensive',
 'one excellent used many bluetooth speaker jbl charge jbl flip transit soans harman kardon etc pure balanced sound get problem faced bose connect app poor bose revolve ii not support voip call whatsapp zoom say bose india bose india customer service response inadequate technical team apparently lack product knowledge',
 'quality sound premium price',
 'using month extensivelyits portable speaker expect like one priority great sound quality high portability glad not chunky sound quality great get loud indoor outdoor perfect trip camping connectivity fine not impressive feel lack tech somewhere disappoints well would say required battery backup highly impressive full volume although hardly played full go full week charge listening hour daily music quality soothing get tired listening even hour degree sound also play part well look brand value top notch one fav feature tripod mount got wor

### Part of speech with diff patterns

In [8]:
# Get Keywords & Apply Part of Speech  

def get_keywords_phreses(key_phreses):

    nlp = spacy.load("en_core_web_sm")
    key_list_phreses = []

    for data in range(0, len(key_phreses)):
        doc = nlp(key_phreses[data])

        patterns_phrases = [[{"POS":"ADJ"}, {"POS":"NOUN"}] or [{"POS":"ADJ"}]]

        combined_phrases_pattern = textacy.extract.matches.token_matches(doc, patterns=patterns_phrases)

        key_phrase_combined = [str(combined_phrase) for combined_phrase in combined_phrases_pattern]
#         print(key_phrase_combined)
        
        combined_str = ",".join(key_phrase_combined)
        key_list_phreses.append(combined_str)

    return key_list_phreses

key_phreses_list = get_keywords_phreses(key_corpus_list)
key_phreses_list


# {"POS":"NOUN"}, {"POS":"PRON"}, {"POS":"VERB"}, {"POS":"ADV"}
# (token.text, '-', token.pos_, '-', token.dep_, token.ent_type_)

['',
 'balanced sound,poor bose,technical team',
 '',
 'portable speaker,sound quality,high portability,sound quality,perfect trip,full volume,full week,daily music,worth money,perfect condition,full price,individual perception',
 'full usagea,sweet spot,ii generation,heavy bass,equaliser source,sound punchyb,old song,bt connectivity,bt version,basic equilizer,timesoverall verdict,basic job',
 '',
 'sound quality',
 'best blutooth',
 'much product,worth shipment,outer box,thin plastic,rough handling,outer cardboard,poor experience,beautiful box',
 'good product,perfect box,original factory,good deal',
 'little beauty,vividh bharti,marvellous sound,transparent sound,good sound,little genius,sound clarity,transparent quality,tiny speaker,treble bass,foolish compare,bose sound',
 'twice music,loud election,sound product,worth money',
 'worthy product,excellent quality,exceptional quality',
 'sound quality,full sound',
 'clear sound',
 'satisfied speaker,bossy bose,bossy crispy,balanced au

In [24]:
def get_most_keywords_list(lit):
        res = [i for i in lit if i]
        res = ",".join([str(item) for item in res]).split(',')

        temp = defaultdict(int)    # Get Most frequent 20 Keywords

        for sub in res:
            temp[sub] += 1
        out = (dict(itertools.islice(dict(temp).items(), len(res))))
        out = {k: v for k, v in sorted(
            out.items(), key=lambda item: item[1], reverse=True)}
        out = out.keys()
#         print(list(out)[:20])
        return list(out)

get_most_keywords_list(key_phreses_list)

['sound quality',
 'portable speaker',
 'great sound',
 'bose revolve',
 'soundlink revolve',
 'bose product',
 'portable home',
 'balanced sound',
 'worth money',
 'perfect condition',
 'good sound',
 'sound clarity',
 'clear sound',
 'top notch',
 'high volume',
 'first bose',
 'bose app',
 'good battery',
 'top speaker',
 'much bass',
 'bose speaker',
 'good quality',
 'deep bass',
 'excellent product',
 'poor bose',
 'technical team',
 'high portability',
 'perfect trip',
 'full volume',
 'full week',
 'daily music',
 'full price',
 'individual perception',
 'full usagea',
 'sweet spot',
 'ii generation',
 'heavy bass',
 'equaliser source',
 'sound punchyb',
 'old song',
 'bt connectivity',
 'bt version',
 'basic equilizer',
 'timesoverall verdict',
 'basic job',
 'best blutooth',
 'much product',
 'worth shipment',
 'outer box',
 'thin plastic',
 'rough handling',
 'outer cardboard',
 'poor experience',
 'beautiful box',
 'good product',
 'perfect box',
 'original factory',
 'good

In [17]:
from itertools import combinations

nlp = spacy.load("en_core_web_md")    # Set globals


def similarity_filter(titles):
    # Remove similar titles
    all_summary_pairs = list(combinations(titles, 2))
    similar_titles = []
    for pair in all_summary_pairs:
        title1 = nlp(pair[0])
        title2 = nlp(pair[1])
        similarity = title1.similarity(title2)
        if similarity > 0.99:
            similar_titles.append(pair)

    titles_to_remove = []
    for a_title in similar_titles:
        # Get the index of the first title in the pair
        index_for_removal = titles.index(a_title[0])
        titles_to_remove.append(index_for_removal)

    # Get indices of similar titles and remove them
    similar_title_counts = set(titles_to_remove)
    similar_titles = [
        x[1] for x in enumerate(titles) if x[0] in similar_title_counts
    ]

    # Exit the recursion if there are no longer any similar titles
    if len(similar_title_counts) == 0:
        return titles

    # Continue the recursion if there are still titles to remove
    else:
        # Remove similar titles from the next input
        for title in similar_titles:
            idx = titles.index(title)
            titles.pop(idx)
            
        return similarity_filter(titles)[:20]
    
    
print(similarity_filter(get_most_keywords_list(key_phreses_list)))

['portable speaker', 'bose revolve', 'soundlink revolve', 'bose product', 'portable home', 'worth money', 'perfect condition', 'sound clarity', 'clear sound', 'top notch', 'high volume', 'first bose', 'bose app', 'good battery', 'top speaker', 'deep bass', 'poor bose', 'technical team', 'high portability', 'perfect trip']


In [None]:
"""

['portable speaker', 'bose revolve', 'soundlink revolve', 'bose product', 'portable home', 'worth money',
 'perfect condition', 'sound clarity', 'clear sound', 'top notch', 'high volume', 'first bose', 'bose app',
 'good battery', 'top speaker', 'deep bass', 'poor bose', 'technical team', 'high portability', 'perfect trip']
 
['sound quality', 'bose product', 'portable speaker', 'much bass', 'daily music', 'full price', 'worth money',
 'sound punchyb', 'high volume', 'exceptional quality', 'portable home', 'soundlink revolve', 'bose revolve',
 'top speaker', 'good deal', 'much product', 'good battery', 'first bose', 'good clarity', 'satisfied speaker']

['sound quality', 'top nouch', 'long lasting', 'mind blowing', 'day sale', 'bass and treble', 'battery life',
'portable speaker', 'soundlink revolve', 'value for money', 'phone calls', 'connect app', 'quality of sound']

"""

#### Get the keywords : expressions related to these keywords. Ex: Battery: Good, bad, slow charging etc, for watch keyword

In [12]:
tp_list = key_phreses_list.copy()

# tp_list = tp_list.replace(" ", pd.NA)
tp_list = [str(sent or None) for sent in tp_list]
tp_list

['None',
 'balanced sound,poor bose,technical team',
 'None',
 'portable speaker,sound quality,high portability,sound quality,perfect trip,full volume,full week,daily music,worth money,perfect condition,full price,individual perception',
 'full usagea,sweet spot,ii generation,heavy bass,equaliser source,sound punchyb,old song,bt connectivity,bt version,basic equilizer,timesoverall verdict,basic job',
 'None',
 'sound quality',
 'best blutooth',
 'much product,worth shipment,outer box,thin plastic,rough handling,outer cardboard,poor experience,beautiful box',
 'good product,perfect box,original factory,good deal',
 'little beauty,vividh bharti,marvellous sound,transparent sound,good sound,little genius,sound clarity,transparent quality,tiny speaker,treble bass,foolish compare,bose sound',
 'twice music,loud election,sound product,worth money',
 'worthy product,excellent quality,exceptional quality',
 'sound quality,full sound',
 'clear sound',
 'satisfied speaker,bossy bose,bossy crispy

In [13]:
df_final['topic'] = tp_list
df_final

Unnamed: 0,comments,topic
0,the sous quality is world class but with the s...,
1,The one is very excellent I have used many Blu...,"balanced sound,poor bose,technical team"
2,quality sound with premium price,
3,I have been using it from months extensivelyIt...,"portable speaker,sound quality,high portabilit..."
4,Review after a day of full usagea Bose seems t...,"full usagea,sweet spot,ii generation,heavy bas..."
...,...,...
109,Amazing clarity of sound,amazing clarity
110,Good,
111,This is my second Bose revolve after owning fi...,"bose revolve,excellent friend,black finish,gre..."
112,great sound portable good battery life very ha...,"great sound,good battery,happy product"


In [14]:
df_final = df_final.replace(to_replace='None', value=np.nan).dropna()
df_final

Unnamed: 0,comments,topic
1,The one is very excellent I have used many Blu...,"balanced sound,poor bose,technical team"
3,I have been using it from months extensivelyIt...,"portable speaker,sound quality,high portabilit..."
4,Review after a day of full usagea Bose seems t...,"full usagea,sweet spot,ii generation,heavy bas..."
6,So the clarity is crisp you can hear the sound...,sound quality
7,Costly but the best blutooth speaker You will ...,best blutooth
...,...,...
108,If you looking for portable speaker and you ha...,portable speaker
109,Amazing clarity of sound,amazing clarity
111,This is my second Bose revolve after owning fi...,"bose revolve,excellent friend,black finish,gre..."
112,great sound portable good battery life very ha...,"great sound,good battery,happy product"


In [15]:
# df_final.to_csv("/home/heptagon/Desktop/nps_review_analysis/movie_reviews_about.csv", index=False)

In [16]:
about_lst = [i for i in key_phreses_list if i]
len(about_lst)

93

In [17]:
about_lst

['balanced sound,poor bose,technical team',
 'portable speaker,sound quality,high portability,sound quality,perfect trip,full volume,full week,daily music,worth money,perfect condition,full price,individual perception',
 'full usagea,sweet spot,ii generation,heavy bass,equaliser source,sound punchyb,old song,bt connectivity,bt version,basic equilizer,timesoverall verdict,basic job',
 'sound quality',
 'best blutooth',
 'much product,worth shipment,outer box,thin plastic,rough handling,outer cardboard,poor experience,beautiful box',
 'good product,perfect box,original factory,good deal',
 'little beauty,vividh bharti,marvellous sound,transparent sound,good sound,little genius,sound clarity,transparent quality,tiny speaker,treble bass,foolish compare,bose sound',
 'twice music,loud election,sound product,worth money',
 'worthy product,excellent quality,exceptional quality',
 'sound quality,full sound',
 'clear sound',
 'satisfied speaker,bossy bose,bossy crispy,balanced auto,equalized ba

##### Apply Hugging face Zero-Shot-Classification 

In [18]:
from transformers import pipeline

In [19]:
# Apply Zero Shot Classifier using hugging face

def type_classifier_model(corpus_list, type_list):

    classifier = pipeline("zero-shot-classification", model='typeform/distilbert-base-uncased-mnli')
    res = classifier(sequences=corpus_list, candidate_labels=type_list, multi_class=True) 

    type_list = []
    for data in res:
        review_type = dict(zip(data['labels'], data['scores']))
        type_list.append(max(review_type, key = review_type.get))

    return type_list

In [20]:
type_list = ['appreciation', 'information', 'complaint']

type_classifier_list = type_classifier_model(about_lst, type_list)
type_classifier_list

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


['appreciation',
 'information',
 'information',
 'information',
 'appreciation',
 'information',
 'information',
 'information',
 'information',
 'appreciation',
 'information',
 'information',
 'appreciation',
 'information',
 'appreciation',
 'appreciation',
 'appreciation',
 'information',
 'information',
 'complaint',
 'information',
 'appreciation',
 'information',
 'appreciation',
 'appreciation',
 'information',
 'information',
 'information',
 'appreciation',
 'information',
 'information',
 'information',
 'information',
 'appreciation',
 'appreciation',
 'information',
 'appreciation',
 'information',
 'appreciation',
 'appreciation',
 'information',
 'appreciation',
 'complaint',
 'appreciation',
 'information',
 'appreciation',
 'complaint',
 'appreciation',
 'information',
 'appreciation',
 'appreciation',
 'appreciation',
 'information',
 'appreciation',
 'appreciation',
 'appreciation',
 'information',
 'information',
 'appreciation',
 'information',
 'information',
 'i

##### Sentiment Classifier

In [21]:
# Apply Sentiment Classifier Model

def get_sentiment_classifier_model(key_corpys_list):
    sentiment_classifier = pipeline("sentiment-analysis")
    sentiment_res = sentiment_classifier(key_corpys_list)

    sentiment_label_list = []
    for i in sentiment_res:
        sentiment_label_list.append(i['label'])
    
    return sentiment_label_list

In [22]:
sentiment_classifier_list = get_sentiment_classifier_model(about_lst)
sentiment_classifier_list

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


['POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',

In [23]:
df_final['type'] = type_classifier_list
df_final['sentiment'] = sentiment_classifier_list

In [24]:
df_final

Unnamed: 0,comments,topic,type,sentiment
1,The one is very excellent I have used many Blu...,"balanced sound,poor bose,technical team",appreciation,POSITIVE
3,I have been using it from months extensivelyIt...,"portable speaker,sound quality,high portabilit...",information,POSITIVE
4,Review after a day of full usagea Bose seems t...,"full usagea,sweet spot,ii generation,heavy bas...",information,NEGATIVE
6,So the clarity is crisp you can hear the sound...,sound quality,information,POSITIVE
7,Costly but the best blutooth speaker You will ...,best blutooth,appreciation,POSITIVE
...,...,...,...,...
108,If you looking for portable speaker and you ha...,portable speaker,appreciation,POSITIVE
109,Amazing clarity of sound,amazing clarity,appreciation,POSITIVE
111,This is my second Bose revolve after owning fi...,"bose revolve,excellent friend,black finish,gre...",appreciation,POSITIVE
112,great sound portable good battery life very ha...,"great sound,good battery,happy product",appreciation,POSITIVE


In [39]:
# df_final.to_csv("/home/heptagon/Desktop/nps_review_analysis/key_phrases_new.csv", index=False)

In [46]:
my_list = ['sound quality',
 'portable speaker',
 'great sound',
 'bose revolve',
 'soundlink revolve',
 'bose product',
 'portable home',
 'balanced sound',
 'worth money',
 'perfect condition',
 'good sound',
 'sound clarity',
 'clear sound',
 'top notch',
 'high volume',
 'first bose',
 'bose app',
 'good battery',
 'top speaker',
 'much bass']

a = []
for i in range(len(my_list)):
    data = my_list[i].split()
    a.append(data[1])
a = list(set(a))
a



['sound',
 'bose',
 'product',
 'volume',
 'money',
 'app',
 'revolve',
 'speaker',
 'quality',
 'home',
 'notch',
 'condition',
 'bass',
 'clarity',
 'battery']

In [26]:
l1 = [1,9,9,8,2]
l1.append([2,3,4,5])
print(l1)

[1, 9, 9, 8, 2, [2, 3, 4, 5]]
