In [None]:
# NPS comment analysis using NLTK
# Author: Eric G. Suchanek, PhD
# (c) 2022 BestBuy, All Rights Reserved
# Confidential, do not share

In [35]:
# library imports
import pandas as pd
import matplotlib.pyplot as plt 
import re

# nltk
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

In [36]:
# N-gram watch lists (bigrams, trigrams, tetragrams) derived from the
# nps National Extract from 6/1/22.

# Placeholder string; presumably stands in for missing comment text -- TODO confirm.
null_str = "xyxyxz"

bad_tetras = [
    'hard drive hard drive',
    'hard drive even though',
    'hard drive fix problem',
    'hard drive told would',
    'hard drive came back',
    'hard drive first time',
    'hard drive days later',
    'transfer data hard drive',
    'hard drive data transfer',
    'new laptop hard drive',
]

bad_bigrams = [
    'hard drive',
    'next day',
    'even though',
    'total tech',
    'screen protector',
    'waste time',
    'fix problem',
    'got home',
    'sim card',
    'told would',
    'power cord',
    'came back',
    'make appointment',
    'made appointment',
    'solve problem',
]

bad_trigrams = [
    'external hard drive',
    'new hard drive',
    'old hard drive',
    'hard drive back',
    'hard drive installed',
    'data hard drive',
    'hard drive failing',
    'hard drive salvaged',
    'hard drive copied',
    'replace hard drive',
    'replacing hard drive',
    'hard drive replaced',
    'wipe hard drive',
    'hard drive cloned',
    'solidstate hard drive',
]
    
# Load the cleaned NPS extract from disk.
input_filename = '../data/clean/NPS_Natl_cleaned.csv'
nps_df = pd.read_csv(input_filename)

nps_comments = nps_df['NPSCommentCleaned']

# Keep only the rows with NPS_Code == 0 (labelled 'Detractor' in the sample
# output below) and work on an independent copy so nps_df is left untouched.
det_mask = nps_df['NPS_Code'] == 0
det_df = nps_df[det_mask].copy()

# Fill missing values with the shared null_str placeholder instead of a
# separate hand-typed literal, so the data carries a single sentinel value
# and the regex search in find_matches always receives strings.
det_df = det_df.fillna(null_str)


In [37]:
# given an input list and list of ngrams to find, return a list representing a boolean
# mask that can be applied to a dataframe to extract the relevant fields

def find_matches(input_list, ngram_list):
    """Build a boolean mask marking which sentences contain any given n-gram.

    Args:
        input_list: iterable of strings to search.
        ngram_list: iterable of patterns. Each one is passed to ``re.search``,
            so it is interpreted as a regular expression.

    Returns:
        A list of bool with one entry per element of ``input_list``: True when
        at least one pattern matches that sentence, False otherwise. An empty
        ``ngram_list`` yields all False.
    """
    # Materialise the patterns once: an empty collection must give False for
    # every sentence, and a one-shot iterator must still be usable for every
    # sentence rather than only the first.
    patterns = list(ngram_list)

    # any() stops at the first matching pattern, like the early break it replaces.
    return [
        any(re.search(pattern, sentence) is not None for pattern in patterns)
        for sentence in input_list
    ]

In [38]:
# Trigram pass: keep the detractor rows whose cleaned NPS comment matches at
# least one entry of bad_trigrams, and write them out as CSV.
det_strings = det_df['NPSCommentCleaned'].tolist()
bad_mask = find_matches(det_strings, bad_trigrams)

det_found = det_df[bad_mask]
det_found.to_csv(
    '../data/clean/NPS_detractors_trigrams.csv',
    index=False,
)
print(f'Found: {det_found.shape[0]} trigram hits on a total length of {len(det_strings)}')

det_found.head()


Found: 95 trigram hits on a total length of 9206


Unnamed: 0,Location,Workforce,NPS® Breakdown,respid2,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
981,1492,Precinct,Detractor,6981758,0,thought it shoddy that even though am total te...,I Thought it shoddy that even though I am a to...,0.015476,0.613095,have decided not to renew total tech additiona...,I have decided not to renew total tech . Addit...,0.5,0.525
1623,1503,Precinct,Detractor,6979722,0,both technicians including the supervisor lied...,"Both technicians, including the supervisor lie...",-0.019134,0.291814,see the first box of text this was basically o...,See the first box of text . This was basically...,-0.193333,0.535833
2188,845,Precinct,Detractor,6977887,0,because brought my computer to fix certain pro...,Because I brought my computer to fix a certain...,0.08513,0.653528,they still need more knowledge to work on comp...,They still need more knowledge to work on comp...,0.64,0.75
2938,269,Precinct,Detractor,6975084,0,my computer is running better but not great af...,My computer is running better but not great . ...,0.115783,0.513258,xyxyxz,xyxyxz,0.0,0.0
4088,431,Precinct,Detractor,6970467,0,geek squad did not know how to restore my data...,Geek Squad did not know how to restore my data...,-0.197917,0.635417,xyxyxz,xyxyxz,0.0,0.0


In [39]:
# Bigram pass: keep the detractor rows whose cleaned NPS comment matches at
# least one entry of bad_bigrams, and write them out as CSV.
det_strings = det_df['NPSCommentCleaned'].tolist()
bad_mask = find_matches(det_strings, bad_bigrams)

det_found = det_df[bad_mask]
det_found.to_csv(
    '../data/clean/NPS_detractors_bigrams.csv',
    index=False,
)
print(f'Found: {det_found.shape[0]} bigram hits on a total length of {len(det_strings)}')

det_found.head()


Found: 959 bigram hits on a total length of 9206


Unnamed: 0,Location,Workforce,NPS® Breakdown,respid2,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
25,106,Precinct,Detractor,6985252,0,made an appointment to increase the ram in two...,I made an appointment to increase the RAM in t...,0.0,0.366667,made an appointment to increase the ram in two...,I made an appointment to increase the RAM in t...,0.0,0.366667
52,2516,Precinct,Detractor,6985160,0,what was original told would take only couple ...,What I was original told would take only a cou...,0.1875,0.875,when check in with the laptop was told it woul...,"When I check in with the laptop, I was told it...",0.143733,0.537948
216,522,Precinct,Detractor,6984549,0,my email icons were not on my home screen when...,My email icons were not on my Home Screen when...,0.0,0.0,call me if you want more,Call me if you want more! 218-838-8241,0.5,0.5
302,805,Precinct,Detractor,6984173,0,went to get screen protector put on my phone t...,I went to get a screen protector put on my pho...,0.193182,0.393939,xyxyxz,xyxyxz,0.0,0.0
394,254,Precinct,Detractor,6983809,0,initially all of the work requested had not be...,Initially all of the work I requested had not ...,0.177778,0.198413,dion sp turned very bad experience into,Dion (sp?) turned a very bad experience into a...,-0.91,0.866667


In [40]:
# Tetragram pass: keep the detractor rows whose cleaned NPS comment matches at
# least one entry of bad_tetras, and write them out as CSV.
det_strings = det_df['NPSCommentCleaned'].tolist()
bad_mask = find_matches(det_strings, bad_tetras)

det_found = det_df[bad_mask]
det_found.to_csv(
    '../data/clean/NPS_detractors_fourgrams.csv',
    index=False,
)
print(f'Found: {det_found.shape[0]} tetragram hits on a total length of {len(det_strings)}')

det_found.head()


Found: 3 tetragram hits on a total length of 9206


Unnamed: 0,Location,Workforce,NPS® Breakdown,respid2,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
18235,57,Precinct,Detractor,6916404,0,unable to fix the problem or come up with solu...,Unable to fix the problem or come up with solu...,-0.001178,0.388047,already commented earlier in survey,Already commented earlier in survey.,0.0,0.5
33037,235,Precinct,Detractor,6861240,0,unfortunately hard drive data transfer is cert...,Unfortunately hard drive data transfer is cert...,-0.098333,0.208333,unfortunately hard drive data transfer is cert...,Unfortunately hard drive data transfer is cert...,-0.036458,0.330208
76328,414,Precinct,Detractor,6699130,0,the tech overwrote my hard drive even though t...,the tech overwrote my hard drive even though t...,0.308939,0.399242,the tech was not good the geek squad manager w...,The Tech was not good . The Geek Squad manager...,-0.35,0.6
