In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
import string
import re

import warnings
# Silence all warnings for the rest of the notebook.
# The original cell registered an "error" filter, immediately overrode it with
# "ignore" (later filters take precedence), and then set "error" inside a
# catch_warnings() block whose changes are discarded on exit. The net effect
# was "ignore", which is what this single call states explicitly.
warnings.filterwarnings("ignore")

In [2]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Download NLTK resources only when they are missing, so that re-running the
# notebook does not hit the network (the unconditional download was interrupted
# in the recorded run). 'wordnet' is included because WordNetLemmatizer is used
# further down to build the corpus.
for nltk_resource_path, nltk_package in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        nltk.data.find(nltk_resource_path)
    except LookupError:
        nltk.download(nltk_package)

KeyboardInterrupt: 

In [None]:
# Open the binary Excel workbook once (pyxlsb engine); the cells below parse
# individual sheets from this handle via pd.read_excel(csat, <sheet name>).
csat  = pd.ExcelFile("../csat/Csat Raw Apr'22 to Oct'22.xlsb", engine='pyxlsb')

In [None]:
# combined_rolled = []

# for month in  tqdm(csat.sheet_names):
#     temp = pd.read_excel(csat, month)
#     combined_rolled.append(temp)
    
# rolled_df = pd.concat(combined_rolled).reset_index(drop=True)

In [None]:
# Parse the four sheets of the workbook into one DataFrame each.
# NOTE(review): `june` is loaded here but is not passed to extract_comment()
# below, so June feedback does not reach the corpus — confirm this is intended.
april_may, june, july, aug_to_oct = (
    pd.read_excel(csat, sheet_name)
    for sheet_name in ("April & May", "June", "July", "Aug to Oct")
)

In [None]:
aug_to_oct.head()

In [None]:
def extract_comment(april_may, july, aug_to_oct):
    """Collect the free-text feedback of three survey frames into one Series.

    Parameters
    ----------
    april_may : DataFrame with a "Comment" column.
    july, aug_to_oct : DataFrames with a
        "Q7 - Do you have any other feedback for us?" column.

    Returns
    -------
    pandas.Series
        The three columns stacked in argument order, with missing values
        dropped and a fresh 0..n-1 index.

    NOTE(review): no June frame is accepted, so June feedback is not part of
    the result — confirm this is intended.
    """
    feedback_col = "Q7 - Do you have any other feedback for us?"
    pieces = [
        april_may["Comment"],
        july[feedback_col],
        aug_to_oct[feedback_col],
    ]
    stacked = pd.concat(pieces, ignore_index=True)
    return stacked.dropna().reset_index(drop=True)

In [None]:
    # Build the combined, NaN-free Series of comments used by every later cell.
    # NOTE(review): this cell is indented; it only runs because IPython strips a
    # cell's leading indentation — consider dedenting so it also works as a script.
    df = extract_comment(april_may, july, aug_to_oct)
    df

In [None]:
df_comment = df.copy()

In [None]:
# Sanity checks: remaining null count, Series shape, and the first five comments.
print(df.isnull().sum())
print(df.shape)

for comment_label in range(5):
    print(df[comment_label])

In [None]:
# English stop-word list from NLTK, with "not" taken out of it so that the
# word is NOT filtered from the comments (presumably to keep negations such as
# "not working" intact — confirm).
stop_words = stopwords.words('english')
stop_words.remove("not")
print(stop_words)

In [None]:
# Remove special characters, convert to lower case, drop stop words and apply lemmatization.

lemmatizer = WordNetLemmatizer()
stop_word_set = set(stop_words)            # O(1) membership instead of scanning the list per token
non_alpha_pattern = re.compile('[^a-zA-Z]')  # compiled once, reused for every comment

corpus = []
# Iterate over the values directly: the original `df[i]` is a label lookup that
# only works while the Series carries a clean 0..n-1 index.
for raw_comment in df:
    tokens = non_alpha_pattern.sub(' ', str(raw_comment)).lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_word_set]
    corpus.append(' '.join(kept))

In [None]:
corpus[1]

In [None]:
## Create TF-IDF Vectorizer Model & Get the Values as Vectors
# Single-word vocabulary capped at 5000 features; .toarray() turns the sparse
# result into a dense (documents x features) array.
# NOTE(review): `tfidf_v` and `data` are reassigned by the TF-IDF cells further
# down, so this result is only used by the inspection cell that follows.

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,1))
data = tfidf_v.fit_transform(corpus).toarray()

In [None]:
print(data.shape)
print(data)

### 1. Extract Keyword using TF-IDF

In [None]:
## Create TF-IDF Vectorizer Model for "topic" terms and rank them by mean score
# NOTE(review): the result is named "unigram", but ngram_range=(1, 2) also
# produces two-word terms — confirm whether (1, 1) was intended.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_v = TfidfVectorizer(max_features=6951, ngram_range=(1,2), max_df=0.1)
data = tfidf_v.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(tfidf_v, "get_feature_names_out"):
    unigram_features = list(tfidf_v.get_feature_names_out())
else:
    unigram_features = list(tfidf_v.get_feature_names())

# Mean TF-IDF score of every term across all comments (sparse mean -> flat array).
unigram_mean_scores = np.asarray(data.mean(axis=0)).ravel()

# One row per term, indexed by the term, highest mean score first.
avg_unigram = pd.DataFrame(
    {'score': unigram_mean_scores, 'word': unigram_features},
    index=unigram_features,
).sort_values('score', ascending=False)
avg_unigram

In [None]:
## Create TF-IDF Vectorizer Model with up to 3 combined words (Tri-Gram)
# NOTE(review): ngram_range=(1, 3) yields one-, two- and three-word terms, not
# only tri-grams — confirm whether (3, 3) was intended.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_v = TfidfVectorizer(ngram_range=(1,3), max_df=0.1)
data = tfidf_v.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(tfidf_v, "get_feature_names_out"):
    trigram_features = list(tfidf_v.get_feature_names_out())
else:
    trigram_features = list(tfidf_v.get_feature_names())

# Mean TF-IDF score of every term across all comments (sparse mean -> flat array).
trigram_mean_scores = np.asarray(data.mean(axis=0)).ravel()

# One row per term, indexed by the term, sorted by TF-IDF score (highest first).
avg_trigram = pd.DataFrame(
    {'score': trigram_mean_scores, 'word': trigram_features},
    index=trigram_features,
).sort_values('score', ascending=False)

In [None]:
print(avg_trigram)

In [None]:
unigram_list = avg_unigram['word'].tolist()
unigram_list

In [None]:
# Relate Unigram as Topic & Tri-Gram as Sub Topics using TF-IDF

MAX_SUBTOPICS_PER_TOPIC = 5

unigram_list = avg_unigram['word'].tolist()
trigram_list = avg_trigram['word'].tolist()

def convert(lst):
    """Split every phrase in `lst` into its list of whitespace-separated tokens."""
    return [item.split() for item in lst]

# The tri-gram vectorizer is fitted with ngram_range=(1, 3), so its vocabulary
# also holds one- and two-word terms. Indexing those with [1]/[2] raised
# IndexError; only genuine three-token phrases are usable as subtopics.
trigram_split = [tokens for tokens in convert(trigram_list) if len(tokens) == 3]

# Single pass over the tri-grams (already ordered by score): for every word,
# remember the first MAX_SUBTOPICS_PER_TOPIC phrases that contain it. This
# replaces the nested scan that rebuilt a DataFrame with pd.concat per match.
subtopics_by_word = {}
for tokens in trigram_split:
    phrase = " ".join(tokens)
    for word in set(tokens):
        matched_phrases = subtopics_by_word.setdefault(word, [])
        if len(matched_phrases) < MAX_SUBTOPICS_PER_TOPIC:
            matched_phrases.append(phrase)

# One (topic, subtopic) row per match, topics in descending TF-IDF order.
topic_rows = [
    {'topic': topic, 'subtopic': phrase}
    for topic in unigram_list
    for phrase in subtopics_by_word.get(topic, [])
]
check = pd.DataFrame(topic_rows, columns=['topic', 'subtopic'])

# Collapse to one row per topic with its subtopics joined into a single string.
check_new = check.groupby(['topic'], as_index=False, sort=False).agg({'subtopic': ', '.join})
check_new



In [None]:
# Attach the raw comments as an extra column.
# NOTE(review): the assignment aligns on the row index, so topic row i simply
# receives comment i; the comment is not selected because it mentions the
# topic — confirm this is what the export should contain.
# NOTE(review): the column name 'commnets' looks like a typo of 'comments'; it
# is left unchanged here because it ends up in the exported CSV header.
check_new['commnets'] = df_comment

In [None]:
check_new.to_csv("../csat/extract_keyword_TFIDF.csv",index=False)

### 2. Extract Keyword using RAKE

In [None]:
from rake_nltk import Rake


def listToString(keywords, separator=", "):
    """Join a list of keyword strings into one printable string.

    This helper was called below but only existed as commented-out code, so
    the cell failed with NameError.
    """
    return separator.join(keywords)


# NOTE(review): `corpus` has already had stop words and punctuation removed,
# which RAKE normally relies on to find phrase boundaries — consider feeding
# the raw comments instead; confirm.
r = Rake()

lst_topic = []       # per comment: first word of every ranked phrase
lst_subtopic = []    # per comment: first five words of every ranked phrase

for text in corpus:
    r.extract_keywords_from_text(text)

    keywordTopics = []
    keywordSubTopics = []
    # Each ranked entry is a (score, phrase) pair, best score first.
    for _score, phrase in r.get_ranked_phrases_with_scores():
        phrase_words = phrase.split()
        keywordTopics.append(" ".join(phrase_words[:1]))
        keywordSubTopics.append(" ".join(phrase_words[:5]))

    lst_topic.append(keywordTopics)
    lst_subtopic.append(keywordSubTopics)

for subtopics in lst_subtopic:
    print(listToString(subtopics))

### 3. Extract Keyword using TextRank with Spacy

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Load the small English spaCy pipeline used by TextRank4Keyword below.
# NOTE(review): the import cell above already runs this exact load, so this
# cell repeats it.
nlp = spacy.load('en_core_web_sm')

In [None]:
class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style ranking.

    Words that pass a part-of-speech / stop-word filter become graph nodes,
    words co-occurring inside a sliding window become edges, and a damped
    PageRank-like iteration assigns every word a weight.

    Relies on the module-level spaCy pipeline ``nlp`` and on ``STOP_WORDS``.
    Typical use: ``analyze(text)`` followed by ``get_keywords(n)``.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # word -> weight, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag spaCy's stop words plus the given extra words as stop words.

        Note: this mutates the shared ``nlp.vocab``, so the flags persist
        beyond this instance.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of `doc`, the words to keep.

        A token is kept when its POS tag is in `candidate_pos` and it is not a
        stop word; kept words are lower-cased when `lower` is True.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with candidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an integer id in first-seen order."""
        vocab = OrderedDict()
        next_id = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = next_id
                    next_id += 1
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the unique ordered word pairs found within each window.

        For every word, it is paired with the following ``window_size - 1``
        words of the same sentence. Pairs are returned in first-seen order.
        """
        token_pairs = list()
        seen_pairs = set()  # O(1) duplicate check; the list keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                window_end = min(i + window_size, len(sentence))
                for j in range(i + 1, window_end):
                    pair = (word, sentence[j])
                    if pair not in seen_pairs:
                        seen_pairs.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return a + a.T with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Get the column-normalized, symmetrized co-occurrence matrix."""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be supplied together with
        # `where`: otherwise the columns whose sum is 0 (words without any
        # pair) are left as uninitialized memory instead of zeros.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Print the top `number` keywords and return them.

        Returns
        -------
        list of (word, weight) tuples, highest weight first; an empty list
        when analyze() has not produced any weights yet.
        """
        if not self.node_weight:
            return []
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        top_keywords = ranked[:max(number, 0)]
        for key, value in top_keywords:
            print(key + ' - ' + str(round(value, 2)))
        return top_keywords

    def analyze(self, text, candidate_pos=['NOUN', 'PROPN'], window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text.

        Parses `text` with spaCy, builds the word graph and stores the
        resulting word weights in ``self.node_weight``.

        Parameters
        ----------
        text : str to analyze.
        candidate_pos : POS tags of the words to keep (not modified).
        window_size : co-occurrence window length in words.
        lower : lower-case the kept words when True.
        stopwords : extra stop words to flag (not modified).
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or `steps` is reached
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            current_total = pr.sum()
            if abs(previous_total - current_total) < self.min_diff:
                break
            previous_total = current_total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [None]:
tr4w = TextRank4Keyword()

df_spacy_new = pd.DataFrame(['c1'])

# Rank the nouns / proper nouns of every cleaned comment.
for row in corpus:
    tr4w.analyze(row, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    # get_keywords() prints the ranked keywords itself; printing its return
    # value as well only added a stray "None" line after every comment.
    keyword = tr4w.get_keywords()
    



In [None]:
# 4. Extract Keyword using YAKE
# 5. Extract Keyword using Spacy
# 6. Extract Keyword using TextRank

### 4. Extract Keyword using YAKE

In [None]:
import yake

In [None]:
df_yake_new = pd.DataFrame(columns=['topic', 'subtopic'])

In [None]:
# The extractor is constructed without any per-row arguments, so one instance
# is built up front and reused instead of being re-created for every comment.
kw_extractor = yake.KeywordExtractor()

for row in corpus:
    keywords = kw_extractor.extract_keywords(row)

    print(keywords)




### 5. Extract Keyword using keyBERT

In [None]:
keywords_list = ['tcs',
                 'customer calls',
                 'neu coins',
                 'tata group',
                 'payment',
                 'tata neu app',
                 'cash back',
                 'fraud',
                 'customer service',
                 'croma',
                 'upi',
                 'tata neu',
                 'refund',
                 'amazon',
                 'bigbasket',
                 'app'
                ]

In [None]:
def get_comments_keyword_df(keywords_list, df, case_sensitive=True):
    """List, for every keyword, the comments that contain it.

    Parameters
    ----------
    keywords_list : iterable of str
        Keywords to look for (plain substring match).
    df : DataFrame
        Must provide a 'Comments' column.
    case_sensitive : bool, default True
        When False, keyword and comment are compared in lower case.

    Returns
    -------
    DataFrame with columns 'keyword' and 'comments', one row per
    (keyword, matching comment); a keyword without any match yields a single
    row whose 'comments' value is NaN.
    """
    # Skip missing comments so that their "None"/"nan" text can never match.
    comments = [comment for comment in df['Comments'] if not pd.isna(comment)]
    searchable = [
        str(comment) if case_sensitive else str(comment).lower()
        for comment in comments
    ]

    dictionary = {
        "keyword": [],
        "comments": []
    }
    # Iterate over the parameter; the original looped over an unrelated global
    # named `keywords`, rebuilt the frame on every pass and returned nothing.
    for word in keywords_list:
        needle = word if case_sensitive else word.lower()
        matches = [
            comment for comment, text in zip(comments, searchable) if needle in text
        ]
        dictionary["keyword"].append(word)
        dictionary["comments"].append(matches)

    dataframe = pd.DataFrame(data=dictionary)
    return dataframe.explode('comments').reset_index(drop=True)


In [20]:
def fun(n):
    """Return n + (n-2) + (n-4) + ... over the positive terms, or 0 if n <= 0."""
    # Walk down from n in steps of 2 while the term is still positive.
    terms = []
    while n > 0:
        terms.append(n)
        n -= 2

    # Add the terms up starting from the smallest one.
    total = 0
    for term in reversed(terms):
        total = term + total
    return total
    
    
fun(10)

30