In [1]:
import os
import pandas as pd
import numpy as np
import datetime

import string
import re

import warnings
warnings.filterwarnings("error")
warnings.filterwarnings("ignore")
with warnings.catch_warnings():
     warnings.simplefilter("error")

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
import spacy
import textacy

In [6]:
nltk.download('all')
!python3 -m spacy download en_core_web_sm

In [7]:
# Read the data and drop the null values

review_df = pd.read_csv("/workspace/nps_analysis/reviews_data.csv")
review_df = review_df.dropna().reset_index(drop=True)

In [8]:
df = review_df['CleanedText']

In [13]:
# Remove Special Chatactors, Convert into the lower case and Stop Words & Apply Lemmatization

def clean_text_and_lemmatize(df):
    stop_words = stopwords.words('english')    # list of Stop Words Excluding (not, no)
    stop_words.remove("not")
    stop_words.remove("no")

    lemmatizer = WordNetLemmatizer()        # Apply Lemmatization
    corpus_key = []
    for i in range(0, len(df)):
        review = re.sub('[^a-zA-Z]', ' ', str(df[i]))
        review = review.lower()
        review = review.split()
        
        review = [lemmatizer.lemmatize(word) for word in review if not word in stop_words]
        review = ' '.join(review)
        corpus_key.append(review)
    return corpus_key

key_corpus_list = clean_text_and_lemmatize(df)
key_corpus_list

['material really accordance brand name storage good back padding good miss laptop sleeve',
 'overall bag good day bag not good rainy day get rain cover bag',
 'good material zipper quality',
 '',
 'bought something great deal great travelling purpose rather daily useage',
 'reasonable n useful',
 'good product price',
 'super',
 'good product gift',
 'good expected weight le',
 'best best',
 'good product',
 'good product true price',
 'product goodcons no rain protection cover',
 'good',
 'mrp r price buying good quality material bagoverall good',
 'bag good capacity good travelling take even school good tear one month two month request not product choose another product please buy product event going buy take careful like travelling take school said not take district school take school get automatically tear four month aur month product please travelling buy take school',
 'travel',
 'nice school bag',
 'light weight comfortable',
 'show raincover actually water resistant material n

In [14]:
# Remove Empty String from Corpus List

def convert_empty_string_to_none(corpus_data):
    return [str(data or None) for data in corpus_data]
    
clean_corpus_list = convert_empty_string_to_none(key_corpys_list.copy())


['material really accordance brand name storage good back padding good miss laptop sleeve',
 'overall bag good day bag not good rainy day get rain cover bag',
 'good material zipper quality',
 'None',
 'bought something great deal great travelling purpose rather daily useage',
 'reasonable n useful',
 'good product price',
 'super',
 'good product gift',
 'good expected weight le',
 'best best',
 'good product',
 'good product true price',
 'product goodcons no rain protection cover',
 'good',
 'mrp r price buying good quality material bagoverall good',
 'bag good capacity good travelling take even school good tear one month two month request not product choose another product please buy product event going buy take careful like travelling take school said not take district school take school get automatically tear four month aur month product please travelling buy take school',
 'travel',
 'nice school bag',
 'light weight comfortable',
 'show raincover actually water resistant materi

Part of speech with diff patterns

In [15]:
# Get Keywords & Apply Part of Speech  

def get_keywords_phreses(key_phreses):

    nlp = spacy.load("en_core_web_sm")
    key_list_phreses = []

    for data in range(0, len(key_phreses)):
        doc = nlp(key_phreses[data])

        patterns_phrases = [[{"POS":"ADJ"}, {"POS":"NOUN"}] or [{"POS":"ADJ"}]]

        combined_phrases_pattern = textacy.extract.matches.token_matches(doc, patterns=patterns_phrases)

        key_phrase_combined = [str(combined_phrase) for combined_phrase in combined_phrases_pattern]

        combined_str = ",".join(key_phrase_combined)
        key_list_phreses.append(combined_str)

    return key_list_phreses

key_phreses_list = get_keywords_phreses(key_corpys_list)
key_phreses_list


# {"POS":"NOUN"}, {"POS":"PRON"}, {"POS":"VERB"}, {"POS":"ADV"}

['',
 'overall bag,good day,rainy day',
 'good material',
 '',
 'great deal,daily useage',
 '',
 'good product',
 '',
 'good product',
 'expected weight',
 '',
 'good product',
 'good product,true price',
 '',
 '',
 'good quality',
 'good capacity,good tear',
 '',
 'nice school',
 'light weight',
 'resistant material',
 'outer fabric,resistant water',
 'amazing product',
 'small packet',
 'good product,net way',
 '',
 'enough brand,light weight',
 'good product,good storage',
 'better bag',
 'good buy',
 '',
 'best loglasting',
 'lightweight zipper,good con,many pocket',
 'disappointed fact',
 'laptop pouch',
 '',
 'good product',
 '',
 'good performance',
 'blue bag,perfect need,easy pack,heavy textbook,great product',
 '',
 '',
 'durable bag',
 '',
 '',
 'white color',
 'overall bag,perfect choice,expensive quality',
 'kid school,good quality',
 '',
 '',
 '',
 '',
 '',
 'good price',
 'nice water',
 '',
 'good quality,available laptop',
 'good size',
 '',
 'good condition',
 '',
 'go

##### Apply Hugging face Zero-Shot-Classification 

In [16]:
from transformers import pipeline

In [38]:
# Apply Zero Shot Claddifier with hugging face

def type_classifier_model(corpus_list, type_list):

    classifier = pipeline("zero-shot-classification", model='typeform/distilbert-base-uncased-mnli')
    res = classifier(sequences=corpus_list, candidate_labels=type_list, multi_class=True) 

    type_list = []
    for data in res:
        review_type = dict(zip(data['labels'], data['scores']))
        type_list.append(max(review_type, key = review_type.get))

    return type_list

The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


In [1]:
type_list = ['appreciation', 'information', 'complaint']

type_classifier_list = type_classifier_model(clean_corpus_list, type_list)
type_classifier_list

##### Sentiment Classifier

In [None]:
# Apply Sentiment Classifier Model

def get_sentiment_classifier_model(key_corpys_list):
    sentiment_classifier = pipeline("sentiment-analysis")
    sentiment_res = sentiment_classifier(key_corpys_list)

    sentiment_label_list = []
    for i in sentiment_res:
        sentiment_label_list.append(i['label'])
    
    return sentiment_label_list

[{'label': 'POSITIVE', 'score': 0.9834417700767517},
 {'label': 'NEGATIVE', 'score': 0.9653053879737854},
 {'label': 'POSITIVE', 'score': 0.999840497970581},
 {'label': 'POSITIVE', 'score': 0.748120903968811},
 {'label': 'POSITIVE', 'score': 0.9865512847900391},
 {'label': 'POSITIVE', 'score': 0.9989253878593445},
 {'label': 'POSITIVE', 'score': 0.9998437166213989},
 {'label': 'POSITIVE', 'score': 0.998783528804779},
 {'label': 'POSITIVE', 'score': 0.9998724460601807},
 {'label': 'POSITIVE', 'score': 0.9939724802970886},
 {'label': 'POSITIVE', 'score': 0.9998223185539246},
 {'label': 'POSITIVE', 'score': 0.9998588562011719},
 {'label': 'POSITIVE', 'score': 0.9998652935028076},
 {'label': 'NEGATIVE', 'score': 0.9762635231018066},
 {'label': 'POSITIVE', 'score': 0.9998161196708679},
 {'label': 'POSITIVE', 'score': 0.9984976053237915},
 {'label': 'NEGATIVE', 'score': 0.8609068989753723},
 {'label': 'POSITIVE', 'score': 0.9982280135154724},
 {'label': 'POSITIVE', 'score': 0.999603688716888

In [None]:
sentiment_classifier_list = get_sentiment_classifier_model(key_corpys_list)
sentiment_classifier_list

In [None]:
key_phrase_df = review_df.copy()

key_phrase_df['topic'] = key_phreses_list
key_phrase_df['type'] = type_classifier_list
key_phrase_df['sentiment'] = sentiment_classifier_list

In [None]:
key_phrase_df.to_csv("/workspace/nps_analysis/key_phrases.csv", index=False)