# Verbatim analysis (using word cloud and n-grams)

## ====================================================

#### Author : Shivani
#### last updated : 16-09-2020

## ====================================================

In [None]:
#importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_colwidth", 200)

In [None]:
#reading dataset 
data_all = pd.read_excel("Comment_analysis_lockdown.xlsx", sheet_name = "Data_lockdown")

In [None]:
data_all.info()

In [None]:
data_filt = data_all.drop_duplicates()
data_filt.info()

In [None]:
data_filt["Agent_related"].unique().tolist()

In [None]:
# Keep only the rows whose Agent_related value is "yes" or "Yes"
is_agent_related = data_filt["Agent_related"].isin(["yes", "Yes"])
data_ld = data_filt[is_agent_related]

# Then narrow that subset to the "Lockdown" response phase
in_lockdown = data_ld["response_phase"] == "Lockdown"
data_ld = data_ld[in_lockdown]

In [None]:
data_ld.info()

In [None]:
data_ld[data_ld['csat_flag'] == 0 ]['journeynode'].value_counts()

# limit of 50 count, remove others

In [None]:
# Journey-node counts for rows with csat_flag == 0 whose comment contains "skill".
# na=False treats missing comments as non-matches so the mask stays purely boolean.
flag_is_zero = data_ld['csat_flag'] == 0
mentions_skill = data_ld['csat_comments'].str.contains('skill', na=False)
data_ld[flag_is_zero & mentions_skill]['journeynode'].value_counts()


# PRE PROCESSING 

In [None]:
#dropping duplicates and saving into new variable 
documents = data_ld['csat_comments'].drop_duplicates().dropna()

In [None]:
#converting it to a dataframe 
news_df = pd.DataFrame({'document':documents})

In [None]:
# Cleaning the raw comments

# Replace punctuation and newlines with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text unchanged.
# The original "+-." range is spelled out as "+,\-." (same four characters).
# Digits and apostrophes are not in the class, so they are kept.
news_df['clean_doc'] = news_df['document'].str.replace(
    r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~\n]', " ", regex=True
)

# Drop short words (three characters or fewer)
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

# Make all text lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda text: text.lower())

In [None]:
import re

In [None]:
# Correcting misspelt words that stretch a character (e.g. "soooo")
def reduce_lengthening(text):
    """Return *text* with every run of three or more identical characters cut to two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: reduce_lengthening(x))

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
import nltk
nltk.download('punkt')
# WordNetLemmatizer (used in the lemmatisation cell) reads the WordNet corpus,
# which the 'punkt' download does not provide
nltk.download('wordnet')

In [None]:
# Lemmatisation
# import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

news_df['text_lemmatized'] = news_df.clean_doc.apply(lemmatize_text)

In [None]:
news_df

# TF - IDF

In [None]:
# Map every token to the set of document positions in which it occurs
DF = {}

processed_text = news_df["text_lemmatized"]

for doc_index, tokens in enumerate(processed_text):
    for w in tokens:
        # setdefault replaces a bare try/except that would also have hidden unrelated errors
        DF.setdefault(w, set()).add(doc_index)

In [None]:
# Replace each token's set of document positions with its size (the document frequency)
for token, doc_ids in DF.items():
    DF[token] = len(doc_ids)
# DF

In [None]:
total_vocab = [x for x in DF]

In [None]:
N = len(news_df)
total_vocab_size = len(DF)

In [None]:
def doc_freq(word):
    """Return the value stored in ``DF`` for *word*, or 0 when the word is absent."""
    # dict.get replaces a bare ``except: pass`` that silently turned every error
    # (even a missing ``DF``) into a frequency of 0
    return DF.get(word, 0)

In [None]:
# Calculating TF-IDF

from collections import Counter

# Maps (document position, token) -> tf * idf
tf_idf = {}

for doc in range(N):
    tokens = processed_text.iloc[doc]

    counter = Counter(tokens)
    words_count = len(tokens)

    # sorted() visits the distinct tokens in the same alphabetical order np.unique gave.
    # The debugging print() calls were dropped: one of them fired once per unique token.
    for token in sorted(counter):
        tf = counter[token] / words_count
        # +1 smoothing in numerator and denominator avoids a division by zero
        idf = np.log((N + 1) / (doc_freq(token) + 1))

        tf_idf[doc, token] = tf * idf

In [None]:
# #Merging the td idf according to weigths 

# alpha = 0.3

# for i in tf_idf:
#     tf_idf[i] *= alpha

In [None]:
# One row per tf_idf entry: the (document, token) key and its score
a = pd.DataFrame({
    'keys': list(tf_idf.keys()),
    'values': list(tf_idf.values()),
})

In [None]:
a['keys'][0][1]

In [None]:
a["words"] = pd.DataFrame(a['keys'].values.tolist(), index=a.index)[1]

In [None]:
a.describe()

In [None]:
a[a["values"] > a["values"].quantile(0.90)]['words']

In [None]:
# Keep only the listed journey nodes (original note: "records more than 30")
top_journey_nodes = [
    'PRE DELIVERY', 'REFUNDS', 'ORDER MODIFICATION', 'PAYMENTS', 'RETURNS',
    'PRE PURCHASE', 'OFFERS AND PROMOTIONS', 'POST DELIVERY',
]
data_ld_top_jn = data_ld[data_ld['journeynode'].isin(top_journey_nodes)]

## JOURNEY NODE FILTER

In [None]:
data_ld_top_jn['journeynode'].value_counts()

In [None]:
# Rows in the PAYMENTS journey node with csat_flag == 0
is_payments = data_ld_top_jn['journeynode'] == "PAYMENTS"
flag_is_zero = data_ld_top_jn["csat_flag"] == 0
data_ld_jn = data_ld_top_jn[is_payments & flag_is_zero]
data_ld_jn['journeynode'].value_counts()

In [None]:
# data_ld_jn.to_csv("JN_Data\\Data_POST DELIVERY.csv",index = False)

In [None]:
#dropping duplicates and saving into new variable 
documents = data_ld_jn['csat_comments'].drop_duplicates().dropna()

In [None]:
#converting it to a dataframe 
news_df = pd.DataFrame({'document':documents})

In [None]:
# Cleaning the raw comments

# Replace punctuation and newlines with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text unchanged.
# The original "+-." range is spelled out as "+,\-." (same four characters).
# Digits and apostrophes are not in the class, so they are kept.
news_df['clean_doc'] = news_df['document'].str.replace(
    r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~\n]', " ", regex=True
)

# Drop short words (three characters or fewer)
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

# Make all text lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda text: text.lower())

In [None]:
import re

In [None]:
# Correcting misspelt words that stretch a character (e.g. "soooo")
def reduce_lengthening(text):
    """Return *text* with every run of three or more identical characters cut to two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: reduce_lengthening(x))

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
import nltk
nltk.download('punkt')

In [None]:
# Lemmatisation
# import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

news_df['text_lemmatized'] = news_df.clean_doc.apply(lemmatize_text)

In [None]:
news_df["tokens"] = news_df["clean_doc"].apply( lambda x : word_tokenize(x) )

In [None]:
# stemmer=PorterStemmer()

# def stem_sentences(sentence):
#     tokens = sentence.split()
#     stemmed_tokens = [stemmer.stem(token) for token in tokens]
#     return ' '.join(stemmed_tokens)

# news_df['Stem_words'] = news_df['text_lemmatized'].apply(stem_sentences)

In [None]:
news_df

In [None]:
# The cell that created `stemmer` is commented out, so build the stemmer here
# before using it; otherwise this line raises NameError.
stemmer = PorterStemmer()
stemmer.stem("really")

# WORD CLOUD

In [None]:
import collections
# import numpy as np
# import pandas as pd
import matplotlib.cm as cm
# import matplotlib.pyplot as plt
from matplotlib import rcParams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

In [None]:
# news_df["text_lemmatized"]

In [None]:
# Lemmatized token lists of the journey-node comments
list_words = news_df["text_lemmatized"]

# Words whose TF-IDF score is above the 75th percentile of all scores
in_tf_idf = a[a["values"] > a["values"].quantile(0.75)]["words"].tolist()

# The list above holds one entry per (document, word) pair, so scanning it for
# every token is slow; a set gives the same membership answer in O(1).
top_tf_idf_words = set(in_tf_idf)

# Keep only the tokens that appear among those high-scoring words
final_list_words = []

for tokens in list_words:
    final_list_words.extend(word for word in tokens if word in top_tf_idf_words)

# print(final_list_words)

In [None]:
#Storing the final list of filtered words 

all_headlines = ' '.join(x.lower() for x in final_list_words )

In [None]:
# Work on a copy: assigning STOPWORDS directly aliases the library's shared set,
# so every add() would change it for the rest of the session.
stopwords = set(STOPWORDS)

# Extra words to exclude from the cloud
extra_stopwords = [
    'flipkart','customer','support','service','customer care','will','executive','agent','care','issue','still','even',
    'representative','agents','consultant','till','call','problem','return', 'order', 'product','please','give','delivery',
    'reply','time','response','without','chat','resolved','resolution','feedback','team','worst','poor','deliver',
    'delivered','email','gives','payment','later','option','solve','help','good','resolve','sending','card','solution',
]
stopwords.update(extra_stopwords)


wordcloud = WordCloud(stopwords=stopwords, background_color="white", max_words=60).generate(all_headlines)


In [None]:
rcParams['figure.figsize'] = 15, 25
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Tally the tokens that are not stop words
filtered_words = []
for token in all_headlines.split():
    if token not in stopwords:
        filtered_words.append(token)
counted_words = collections.Counter(filtered_words)

# Split the ten most frequent (word, count) pairs into two parallel lists for plotting
top_ten = counted_words.most_common(10)
words = [word for word, _ in top_ten]
counts = [count for _, count in top_ten]

In [None]:

colors = cm.rainbow(np.linspace(0, 1, 10))
rcParams['figure.figsize'] = 20, 10

plt.title('Top words in the headlines vs their count')
plt.xlabel('Count')
plt.ylabel('Words')
plt.barh(words, counts, color=colors)

## METHOD 1 - NGRAM & POLARITY

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram counts over the cleaned comments.
# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter and reject a set.
word_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='word', stop_words=sorted(stopwords))

sparse_matrix = word_vectorizer.fit_transform(news_df['clean_doc'])

# Column sums give each bigram's total count; the sparse .sum() avoids adding
# the matrix row by row with the Python builtin.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; use it only on old versions
if hasattr(word_vectorizer, "get_feature_names_out"):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()

pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use it only on old versions
if hasattr(word_vectorizer, "get_feature_names_out"):
    phrase_names = word_vectorizer.get_feature_names_out()
else:
    phrase_names = word_vectorizer.get_feature_names()

# Bigram frequency table, most frequent first
temp = pd.DataFrame(frequencies, index=phrase_names, columns=['frequency'])
temp1 = temp.sort_values(by="frequency", ascending=False)

In [None]:
temp1[temp1['frequency'] > 1]

In [None]:
temp1.index.name = 'phrases'
temp1.reset_index(inplace=True)

In [None]:
# temp1.to_csv("Ngram_resultv2.csv", index = False)

In [None]:
# Comments that contain "understand", keeping only the raw text column.
# na=False keeps the mask boolean if an entry is not a string.
# .copy() makes an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
mentions_understand = news_df['document'].str.contains('understand', na=False)
df = news_df.loc[mentions_understand, ['document']].copy()

In [None]:
# df = news_df

In [None]:
from textblob import TextBlob

In [None]:
df['polarity'] = df['document'].map(lambda text: TextBlob(text).sentiment.polarity)
df['review_len'] = df['document'].astype(str).apply(len)
df['word_count'] = df['document'].apply(lambda x: len(str(x).split()))

In [None]:
t = pd.DataFrame(df['polarity'].value_counts())


In [None]:
t.reset_index(inplace=True)

In [None]:
# t.to_csv("polarity.csv", index = False)

In [None]:
# print('5 random reviews with the highest positive sentiment polarity: \n')
# cl = df.loc[df.polarity >= 0.9, ['clean_doc']].sample(5).values
# for c in cl:
#     print(c[0])

In [None]:
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
neutral = df.loc[df.polarity == 0, ['document']]
# Sample 5 to match the header (the original took 3), capped at the rows
# available because sample() raises when asked for more than exist.
cl = neutral.sample(min(5, len(neutral))).values
for c in cl:
    print(c[0])

In [None]:
print('5 reviews with the most negative polarity: \n')
# Strictly negative only: the original `<= -0` is the same as `<= 0` and let
# neutral comments through.
# nsmallest returns the lowest-polarity rows first, matching "most negative",
# and simply returns fewer rows when fewer than 5 qualify.
cl = df.loc[df.polarity < 0].nsmallest(5, 'polarity')[['document']].values
for c in cl:
    print(c[0])