In [2]:
import os
import pandas as pd
import numpy as np
import datetime

import tqdm
import string
import re
import itertools
from collections import defaultdict

import warnings
# Suppress every warning for the rest of the session.
# (A preceding "error" filter was removed: filterwarnings() puts each new rule
# at the front of the filter list, so this catch-all "ignore" rule always won
# and the "error" rule could never fire.)
warnings.filterwarnings("ignore")

In [3]:
import spacy
import textacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Load the raw reviews, discard every row that has a missing value in any
# column, and renumber the surviving rows from 0.
review_df = (
    pd.read_csv("/home/heptagon/Desktop/nps_review_analysis/review.csv")
    .dropna()
    .reset_index(drop=True)
)

# The free-text column that the cleaning step consumes.
df = review_df['comments']

In [5]:
# Echo the comments Series as the cell output.
df

0      Amazon Basics Earbuds arrived in an excellent ...
1      Zero Bass. If u r a bass lover then just skip ...
2      Using it from one week and feels it's an avera...
3      Headphones are good. However Audio quality is ...
4      Well for the price is got it , it is quite an ...
                             ...                        
277    Nice product sound quality is excellent and bl...
278             Decent product, simply you can go for it
279    Sound clarity is superb. You can hear low to h...
280    Really useful for music lovers. Quality is sup...
281                                 Very very nice sound
Name: comments, Length: 282, dtype: object

In [6]:
def get_clean_data(df):
    """Strip every non-letter character from each review and tidy whitespace.

    Parameters
    ----------
    df : iterable
        Reviews to clean (for example a pandas Series or a list). Each item
        is converted with ``str()`` first, so non-string values are accepted.

    Returns
    -------
    list of str
        One cleaned string per review that still contains at least one
        letter. Reviews that end up empty are dropped, so the result may be
        shorter than the input.
    """
    clean_text = []
    # Iterate over the values themselves instead of ``df[i]``: on a Series,
    # ``df[i]`` is a label lookup and fails when the index is not 0..n-1.
    for raw in df:
        # Anything that is not an ASCII letter becomes a space ...
        letters_only = re.sub(r'[^a-zA-Z]', ' ', str(raw))
        # ... then whitespace runs collapse to one space and the edges are
        # trimmed (split() with no argument does both).
        review = " ".join(letters_only.split())
        if review:
            clean_text.append(review)
    return clean_text
    
# Clean every review, then echo the resulting list as the cell output.
clean_list = get_clean_data(df)
clean_list

['Amazon Basics Earbuds arrived in an excellent and typical Amazon Basics Box and it contained the Matte Charging Case along with the EarBuds which is partly Matte and partly Glossy along with a USB Type C cable and few spare ear tips The build quality is amazing as usual and feels sturdy even though I m not a huge fan of matte designed products The most important thing to note is this Earbuds are so big and bulky which might not fit into someone who are having small or medium ears I suppose Although it was light weight I felt little uncomfortable while the pTron Earbuds were of a perfect fit to my ears which was neither bulky nor long and bigger when compared this one But having changed the ear tips to smaller one provided it felt better but it was still uncomfortable due to its bulky size Features There is a lot of touch controls available to for increasing and decreasing the volume playing the next track pause and play etc But it doesn t work flawlessly It was always a hit and miss 

In [7]:
# One-column frame holding the cleaned comments; later cells attach more columns to it.
df_final = pd.DataFrame(clean_list, columns=['comments'])
df_final

Unnamed: 0,comments
0,Amazon Basics Earbuds arrived in an excellent ...
1,Zero Bass If u r a bass lover then just skip t...
2,Using it from one week and feels it s an avera...
3,Headphones are good However Audio quality is a...
4,Well for the price is got it it is quite an ba...
...,...
275,Nice product sound quality is excellent and bl...
276,Decent product simply you can go for it
277,Sound clarity is superb You can hear low to hi...
278,Really useful for music lovers Quality is supe...


In [8]:
def get_lemmatize_data(clean_list):
    """Lower-case each cleaned review, drop stop words and lemmatize the rest.

    Parameters
    ----------
    clean_list : iterable of str
        Cleaned review texts; each item is passed through ``str()``.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input review, in
        input order.
    """
    # English stop words, except "not" and "no", which stay in the text.
    # A set gives constant-time membership tests inside the token loop, and
    # set difference does not raise if either word is missing from the list.
    stop_words = set(stopwords.words('english')) - {"not", "no"}

    lemmatizer = WordNetLemmatizer()

    corpus_list = []
    for text in clean_list:
        tokens = str(text).lower().split()
        kept = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
        corpus_list.append(' '.join(kept))

    return corpus_list

# Lemmatize the cleaned reviews, then echo the result as the cell output.
key_corpus_list = get_lemmatize_data(clean_list)
key_corpus_list

['amazon basic earbuds arrived excellent typical amazon basic box contained matte charging case along earbuds partly matte partly glossy along usb type c cable spare ear tip build quality amazing usual feel sturdy even though not huge fan matte designed product important thing note earbuds big bulky might not fit someone small medium ear suppose although light weight felt little uncomfortable ptron earbuds perfect fit ear neither bulky long bigger compared one changed ear tip smaller one provided felt better still uncomfortable due bulky size feature lot touch control available increasing decreasing volume playing next track pause play etc work flawlessly always hit miss kind samsung smartphone little annoying beginning using often felt ok eventhough little delay response get used using daily basis liked sound effect used indicate earbuds establishing bluetooth connectivity also get disconnected pleasant never liked usual connected disconnected voice message one feature really like ama

Part-of-speech tagging with different patterns

In [9]:
# Get Keywords & Apply Part of Speech  

def get_keywords_phreses(key_phreses, patterns=None):
    """Extract part-of-speech phrase matches from each text.

    Parameters
    ----------
    key_phreses : iterable of str
        Texts to analyse.
    patterns : list of token patterns, optional
        Patterns handed to ``textacy.extract.matches.token_matches``.
        Defaults to a single adjective + noun pattern.

    Returns
    -------
    list of str
        For every input text, its matched phrases joined with ``","``
        (an empty string when nothing matched).
    """
    if patterns is None:
        # This used to be written `[[ADJ, NOUN] or [ADJ]]`. `or` between two
        # non-empty lists returns the first one, so only ADJ+NOUN was ever
        # matched. That effective behaviour is kept as the default; pass
        # `patterns` explicitly to match additional shapes.
        patterns = [[{"POS": "ADJ"}, {"POS": "NOUN"}]]

    nlp = spacy.load("en_core_web_sm")

    key_list_phreses = []
    for text in key_phreses:
        doc = nlp(text)
        matches = textacy.extract.matches.token_matches(doc, patterns=patterns)
        key_list_phreses.append(",".join(str(span) for span in matches))

    return key_list_phreses

# One comma-joined phrase string per lemmatized review; echoed as the cell output.
key_phreses_list = get_keywords_phreses(key_corpus_list)
key_phreses_list


# {"POS":"NOUN"}, {"POS":"PRON"}, {"POS":"VERB"}, {"POS":"ADV"}

['basic earbuds,spare ear,huge fan,important thing,small medium,light weight,uncomfortable ptron,perfect fit,bulky size,next track,ok eventhough,little delay,daily basis,sound effect,disconnected voice,basic earbuds,disappointing thing,pleasant experience,sound quality,happy time,long press,next episode,secondary tecno,total turn,particular buzzing,favourite song,first buzzing,general sound,good aspect,actual ease,sound quality,clear receiver,mic ptron,fair distance,manageable people,first impression,full charge,full charge,half hour,good voice,mic performance,competitive pricing,frequent disconnection,big conclusion,good build,competitive price,basic product,defective product,initial impression,common defect,backup call,valuable purchase,good deal,extended time,affordable price',
 'calm music,vocal background,clear crackling,low bass,due crackle,good sound,small bass,truke brand',
 'average product,poor performance,weak connection,strong battery',
 'audio quality,mobile call,average m

In [33]:
def get_most_keywords_list(lit, top_n=20):
    """Return the most frequent key phrases across all reviews.

    Parameters
    ----------
    lit : iterable of str
        Comma-separated phrase strings, one per review. Empty entries are
        skipped.
    top_n : int, optional
        Maximum number of phrases to return (default 20).

    Returns
    -------
    list of str
        Up to ``top_n`` distinct phrases, most frequent first. Phrases with
        equal counts keep the order in which they were first seen.
    """
    counts = defaultdict(int)
    for entry in lit:
        if not entry:
            continue
        for phrase in str(entry).split(','):
            # Ignore empty fragments so that input without any phrase yields
            # [] rather than [''].
            if phrase:
                counts[phrase] += 1

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:top_n]

# Show the most frequent key phrases over all reviews.
get_most_keywords_list(key_phreses_list)

['sound quality',
 'good battery',
 'good sound',
 'basic product',
 'good product',
 'worth price',
 'light weight',
 'affordable price',
 'good quality',
 'worth money',
 'audio quality',
 'awesome product',
 'good price',
 'good bass',
 'amazing product',
 'low quality',
 'great product',
 'fit ear',
 'good earbuds',
 'worst product']

In [None]:
# NOTE(review): this cell only holds pasted list literals and computes nothing.
# The first list is identical to the get_most_keywords_list output shown above.
['sound quality', 'good battery', 'good sound', 'basic product', 'good product', 'worth price',
 'light weight', 'affordable price', 'good quality', 'worth money', 'audio quality', 'awesome product',
 'good price', 'good bass', 'amazing product', 'low quality', 'great product', 'fit ear',
 'good earbuds', 'worst product']



# NOTE(review): presumably the result of a different run or version — confirm.
# 'affordable pricesound quality' looks like two phrases fused without a separator.
['sound quality', 'good battery', 'good earbuds', 'basic earbuds', 'first impression', 'full charge',
 'basic product', 'defective product', 'good sound', 'new one', 'great price', 'affordable pricesound quality',
 'spare ear', 'huge fan', 'important thing', 'small medium', 'light weight', 'uncomfortable ptron',
 'perfect fit', 'bulky size']


In [None]:
# Attach each review's comma-joined key phrases as the 'topic' column.
df_final['topic'] = key_phreses_list

In [None]:
# Write df_final to CSV without the index column.
# NOTE(review): the file name says "movie_reviews" although the data shown
# above are earbud reviews — confirm the intended name.
df_final.to_csv("/home/heptagon/Desktop/nps_review_analysis/movie_reviews_about.csv", index=False)

In [None]:
# Keep only the phrase strings that are non-empty, then report how many remain.
about_lst = [phrases for phrases in key_phreses_list if phrases != '']
len(about_lst)

##### Apply Hugging Face zero-shot classification

In [None]:
from transformers import pipeline

In [None]:
# Apply Zero Shot Classifier using hugging face

def type_classifier_model(corpus_list, type_list):
    """Label each text with its best-scoring candidate type (zero-shot).

    Parameters
    ----------
    corpus_list : list of str or str
        Text(s) to classify.
    type_list : list of str
        Candidate labels to score each text against.

    Returns
    -------
    list of str
        The highest-scoring label for each input text, in input order
        (an empty list for an empty input list).
    """
    # Nothing to classify: skip loading the model altogether.
    if isinstance(corpus_list, (list, tuple)) and not corpus_list:
        return []

    classifier = pipeline("zero-shot-classification", model='typeform/distilbert-base-uncased-mnli')
    # `multi_label` is the current name of the deprecated `multi_class` flag.
    results = classifier(sequences=corpus_list, candidate_labels=type_list, multi_label=True)

    # A single string input yields one dict instead of a list of dicts.
    if isinstance(results, dict):
        results = [results]

    # Use a separate name for the output; the `type_list` parameter is left untouched.
    predicted_types = []
    for result in results:
        scores_by_label = dict(zip(result['labels'], result['scores']))
        predicted_types.append(max(scores_by_label, key=scores_by_label.get))

    return predicted_types

In [None]:
# Candidate labels for the zero-shot classifier.
type_list = ['appreciation', 'information', 'complaint']

# Classify only the non-empty phrase strings collected in about_lst.
type_classifier_list = type_classifier_model(about_lst, type_list)
type_classifier_list

##### Sentiment Classifier

In [None]:
# Apply Sentiment Classifier Model

def get_sentiment_classifier_model(key_corpys_list):
    """Run the default "sentiment-analysis" pipeline over the given texts.

    Parameters
    ----------
    key_corpys_list : list of str
        Texts to score.

    Returns
    -------
    list
        The 'label' field of every prediction, in the order returned by the
        pipeline.
    """
    sentiment_classifier = pipeline("sentiment-analysis")
    predictions = sentiment_classifier(key_corpys_list)
    return [prediction['label'] for prediction in predictions]

In [None]:
# Sentiment labels for the non-empty phrase strings in about_lst.
sentiment_classifier_list = get_sentiment_classifier_model(about_lst)
sentiment_classifier_list

In [None]:
# Number of sentiment labels produced.
len(sentiment_classifier_list)

In [None]:
# Same assignment as in the earlier cell; it overwrites 'topic' with the same list.
df_final['topic'] = key_phreses_list

In [None]:
# Display-only: shows a copy with NaN rows dropped and the index renumbered.
# The result is not assigned, so df_final itself is left unchanged.
df_final.dropna().reset_index(drop=True)

In [None]:
# The classifier outputs were computed from about_lst, i.e. only for reviews
# whose 'topic' is non-empty, so they can be shorter than df_final and a plain
# column assignment would raise a length-mismatch error. Align them to the
# rows that have a topic; rows without one get NaN.
has_topic = df_final['topic'] != ''
topic_rows = df_final.index[has_topic]
df_final['type'] = pd.Series(type_classifier_list, index=topic_rows)
df_final['sentiment'] = pd.Series(sentiment_classifier_list, index=topic_rows)

In [None]:
# Write the enriched review table. `key_phrase_df` is not defined anywhere in
# the visible notebook (NameError on a fresh run); df_final is the frame that
# carries the comments/topic/type/sentiment columns built above.
df_final.to_csv("/workspace/nps_analysis/key_phrases.csv", index=False)