In [27]:
#Import Libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import nltk
import string
import pandas as pd
import re
import os 

# Download Spacy package if needed.
#python -m spacy download en_core_web_sm


# Download Tokenizer package if needed.
#nltk.download('punkt')

# Import and load NLTK
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 

In [28]:
# Configure Pandas Viewing Window
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 5000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_column',None)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', -1)

In [81]:
# Set range to search for prices within.
lower_threshold = 30
upper_threshold = 450


# Set Constants to search for.l
integers = ["1", "2", "3", "4", "5", "6", "7", "8", "9" ]

units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen"
    ]

tens = ["", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

scales = ["hundred", "thousand", "million", "billion", "trillion"]


doc_check_keys = ["a", "and", "dollars", "cents", "cent", "dollar"]



# Build dictionary with mode price of each package. 
no_package_moce_price = {'NO PRODUCTS': 4.99}

single_package_mode_price = {'VIDEO ONLY': 49.99, 'CDV ONLY': 29.99, 'XM ONLY': 30.00, 'HSD ONLY': 39.99
                            , 'XH ONLY' : 24.99
    
}

double_package_mode_price = {'HSD/XH': 39.99 , 'HSD/CDV': 59.99 , 'CDV/XH': 19.99, 'XH/XM': 24.99,
                            'VIDEO/CDV': 74.99 , 'CDV/XM': 25.00, 'VIDEO/XM': 54.99, 'HSD/XM': 39.99 , 
                            'VIDEO/XH': 29.99 , 'VIDEO/HSD': 49.99 
}

triple_package_mode_price = {'VIDEO/CDV/XH': 24.99 , 'VIDEO/HSD/XM': 89.99 , 'HSD/CDV/XM': 59.99 ,
                            'CDV/XH/XM': 24.10 , 'VIDEO/HSD/XH': 109.99 , 'HSD/XH/XM': 39.99 , 
                            'VIDEO/HSD/CDV': 119.99 , 'HSD/CDV/XH': 59.99 , 
                            'VIDEO/CDV/XM': 54.99, 'VIDEO/XH/XM': 24.99
                            
    
}

quad_package_mode_price = {'VIDEO/HSD/XH/XM': 109.99, 'VIDEO/HSD/CDV/XH': 24.99, 'HSD/CDV/XH/XM': 29.95, 
                          'VIDEO/CDV/XH/XM': 24.99, 'VIDEO/HSD/CDV/XM': 119.99
    
}

# Form large dictionary out of various packages.

master_price_dict = {19.99: 'CDV/XH',
               24.10: 'CDV/XH/XM', 
               24.99: ('XH ONLY', 'XH/XM', 'VIDEO/CDV/XH', 'VIDEO/XH/XM',
                       'VIDEO/HSD/CDV/XH', 'VIDEO/CDV/XH/XM', 'CDV/XM'), 
                29.99: ('HSD/CDV/XH/XM', 'CDV ONLY', 'VIDEO/XH', 'XM ONLY'), 
               39.99: ('HSD ONLY', 'HSD/XH', 'HSD/XM', 'HSD/XH/XM'), 
               49.99: ('VIDEO ONLY', 'VIDEO/HSD'), 
               54.99: ('VIDEO/XM', 'VIDEO/CDV/XM'), 
               59.99: ('HSD/CDV', 'HSD/CDV/XM', 'HSD/CDV/XH'), 
               74.99: 'VIDEO/CDV', 
               89.99: 'VIDEO/HSD/XM', 
               109.99: ('VIDEO/HSD/XH', 'VIDEO/HSD/XH/XM'), 
               119.99: ('VIDEO/HSD/CDV', 'VIDEO/HSD/CDV/XM')
              }
               

# Form sorted list of all potential prices. 
price_list = [19.99, 24.10, 24.99, 29.99, 39.99, 49.99, 54.99, 59.99, 74.99, 89.99, 109.99, 119.99]


# Form text transformation dictionary.
text_transformation_dic={'home security system':'home-security-system',\
     'security system':'security-system',\
     'home security':'home-security',
     'set top box':'set-top-box',\
     'xfinity mobile':'xfinity-mobile',\
     'infinity mobile':'xfinity-mobile',\
     'land line':'landline',\
     'home phone':'home-phone',\
     'hot spot':'hotspot',\
     'home phones':'home-phone',\
     'quad play':'quad-play',\
     'double play':'double-play',\
     'triple play':'triple-play',\
     'single play':'single-play',\
     'performance pro':'performance-pro',\
     'performance plus':'performance-plus',\
     'performance starter':'performance-starter',\
     'amazon prime': 'amazon-prime',\
     'sling tv':'sling-tv',\
     'direct tv':'direct-tv',\
     'high definition':'high-definition', \
     '1': 'one', \
     '2': 'two', \
     '3': 'three', \
     '4': 'four', \
     '5': 'five', \
     '6': 'six', \
     '7': 'seven', \
     '8': 'eight', \
     '9': 'nine'

                         
                        
                         
    }


# Form list of keywords that if found, will impact score of price negatively. 
estimates = ['about', 'around', 'almost', 'nearly', 'between', 'approaching']
post_num = ['bytes', 'byte', 'megabytes', 'megabyte', 'gigabytes', 'gigabyte', 
            'terabytes', 'terabyte', 'channels', 'channel', 'charge', 'credit']

# Form list of keywords that if found, will impact score of price positively. 
key_num1 = 'ninety nine'
key_num2 = 'ninety five' 
key_words1 = ['package', 'promotion', 'offer', 'offering', 'bundle', 'bundles', 'month', 
              'bundled', 'promotions', 'promotional', 'premium', 'premiums', 'equipment', 'equipments']

    
    
    
      
# Form list of keywords associated with each line of business.  
hsd=['speed','internet','megabyte','megabytes','megabits','modem','gigabytes','signal','wifi','network','router']
video=['tv','TV','channel','channels','set-top-box','dvr','hd','hbo','espn','box','boxes']
cdv=['landline','home-phone']
xm=['mobile','lines','hotspot','spot','xfinity-mobile', 'voice']
xh=['security-system','home-security-system','home-security-system','home-security','security','camera','cameras',
    'homeservices']

# Form additional categorical keywords. 
stream=['streaming','stream', 'livestreaming', 'streamlined']
ott=['netflix','hulu','roku']
hsd_tier=['blast','performance','performance-pro','performance-plus','performance-starter','gigabit']
vid_tier=['latino','dvr','hd','high-definition', 'sport', 'sports', 'sporting']
packages=['quad-play','double-play','triple-play','single-play']

# Load NLP
nlp = en_core_web_sm.load()



In [82]:
# Inputs: String, Dictionary
# Purpose: Replaces all text founds as key within dictionary with its approiate value. 
# Returns: String

def replace_all(text, dic):
    for i, j in dic.items():
        src_str  = re.compile(i, re.IGNORECASE)
        text = src_str.sub(j,text)
    return text


In [83]:
# Inputs: String, Dictionary
# Purpose: Converts alphabetical numeric representation to float. 
# Returns: Integer

def text2int(textnum, numwords={}):
    
    # Put everything in lower case, strip commas, and call missing_category_check on textnum. 
    textnum = textnum.lower()
    textnum = textnum.replace(',', '')
    textnum = missing_category_check(textnum)

    # Made True upon finding keywords from 'descriptors'
    take_ave = False 
    waiting_for_and = False
    
    # Stores values for calculating averages with 'to' and 'between' cases.
    keep = [] 

    # List of non number keywords to look for.
    descriptors = ['a', 'to', 'between','dollars', 'cents']
      
    if not numwords:   
      numwords["and"] = (1, 0)
      for idx, word in enumerate(units):
          numwords[word] = (1, idx)
      for idx, word in enumerate(tens):
          numwords[word] = (1, (idx+1) * 10 )
      for idx, word in enumerate(scales):  
          numwords[word] = (10 ** (idx * 3 or 2), 0)
        
    current = result = 0
    
    # For each word test if in 'descriptors or if a number word.'
    for word in textnum.split():
        if word not in numwords and word not in descriptors:
            continue 
            
        # 'a' in a number clause symbolizes '1'
        if word == 'a': 
            word = 'one' 
            
        # 'to' compares two values. When found will calculate average of two. 
        elif word == 'to': 
            take_ave = True 
            keep.append(current) 
            current = 0
            result = 0
            continue
            
        # 'between' compares two values in conjunction with 'and'. When found will calculate average of two. 
        elif word == 'between':
            waiting_for_and = True
            continue
        elif waiting_for_and and word == "and":
            waiting_for_and = False
            take_ave = True
            keep.append(current)
            current = 0
            result = 0
            continue
            
        # Anything after the word dollars needs to be treated as cents. 
        # Missing category_check will add cents to end on proper condition if speaker hasn't.  
        elif word == 'dollars':
            result = result + current 
            current = 0
            continue
            
        # Makes sure cents aren't being read as dollars. 
        elif word == 'cents':
            current = current * .01
            continue

        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0
            
    # if 'to' or 'between' was found in clause, calculate average. Otherwise proceed normally.        
    if take_ave and keep[0] != 0: 
        return(result + current + sum(keep)) / (len(keep) + 1)
    else:
        return result + current



In [84]:
# Inputs: String
# Purpose: 
# Takes account for cases where speaker is speaking improperly including. 
# 1.) Not say hundred after give hundreds place 
# 2.) Saying something like thirty nine ninety nine rather than thiry nine dollars and ninety nine cents
# Returns: String

def missing_category_check(textnum):
    change_made = False
    cent_added = False
    prev_word = ''
    count = 0
    words = textnum.split()
    for word in words:
        
        # If previous word is a single digit and current word is a double digit,
        # and current word is second word in phrase treat single digit as hundred. 
        if prev_word in units and word in tens and count == 1:
            word = 'hundred ' + word
            change_made = True
            words[count] = word
        
        # If previous word is a single digit or double digit and current word is double digit, 
        # and current word is in greater postion than second, treat current phrase as cents representation. 
        elif (prev_word in units or prev_word in tens) and word in tens and count > 1:
            word = 'dollars and ' + word
            cent_added = True
            change_made = True
            words[count] = word
        prev_word = word
        count += 1

    if change_made:
        s = " ".join(words)
        if cent_added:
            s = s + " cents"
        return s
    else:
        return textnum

        

In [85]:
# Inputs: Integer, integer
# Purpose: Determines how many sentences before and after the current one we can take in for context. 
# Returns: List

def check_position(i, sentence_count):
    if i > 1:
        low = i - 2
    else: 
        low = 0
    if sentence_count - 1 - i > 1:
        high = i + 3 
    else: 
        high = sentence_count
    return([low, high])
    

In [86]:
# Inputs: String, String 
# Purpose: Gauges importanace of individual price based on price and keywords found in surround context.
# Returns: Integer

def price_importance_analysis(context, token): 
    
    # Removes all punctuation from context except for hyphens and token and splits into array.
    remove = string.punctuation.replace("-", "")
    context = context.lower().translate(str.maketrans('', '', remove))
    token = token.lower().translate(str.maketrans('', '', remove))
    
    # Default score is 0. 
    score = 0
    
    # Booleans to make sure each category only gets accounted for once. 
    estimate_found = False
    post_num_found = False
    key_words1_found = False 
    packages_found = False 
    
    # Look in context and token for above words. Token has more importance than context. 
    for word in context.split(): 
        if word in estimates and not estimate_found:
            if word in token: 
                score -= 2
            else: 
                score -= 1 
            estimate_found = True 
            continue
        if word in post_num and not post_num_found:
            if word in token: 
                score -= 2
            else: 
                score -= 1
            post_num_found = True
            continue 
        if word in key_words1 and not key_words1_found: 
            score += 2
            key_words1_found = True
            continue
        if word in packages and not packages_found:
            score += 2 
            packages_found = True
            continue

    # Ninety nine and ninety five are big indicators of package offerings. 
    if key_num1 in token or key_num2 in token:
        score += 4 
    elif key_num1 in context or key_num2 in context: 
        score += 2
    
    # Commas in token imply list of numbers. 
    if ',' in token: 
        score -= 3
    

    return score


In [87]:
# Inputs: Integer
# Purpose: Makes appricate guess from price dictionary if score is above threshold.
# Returns: Float

def make_guess(price):
    
    # Finds closest value in price list to given price. 
    closest_num = min(price_list, key=lambda x:abs(x-price))
    
    # Returns packages that are most commonly priced here. 
    return master_price_dict[closest_num]
    

In [88]:
# Inputs: String
# Purpose: Extracts prices and context from transcript. 
# Returns: List

def extract_info(transcript_sentences):
    i = 0 
    guess = ''
    max_score = -10
    transcript_price = 0 
    transcript_score = 0 
    transcript_context = ''
    transcript_label = []
    transcript_score = 0 
    transcript_context = '' 
    int_price = 0 
    sentence_count = len(transcript_sentences)
    
    # Search each sentence.
    while i < sentence_count:
        phrase = ""
        context_gauge = check_position(i, sentence_count) 
        lower_limit = context_gauge[0] 
        upper_limit = context_gauge[1]
        
        # Make text transformations. 
        for word in transcript_sentences[i].split():  
            word = word.lower().replace('.', '').replace('?', '')
            
            # If keyword found, calculate integer form and convert to integer.
            if word in units or word in tens or word in scales or word in doc_check_keys or word in integers:
                if len(phrase) == 0: 
                    phrase = word
                else: 
                    phrase = phrase + " " + word
            else: 
                if phrase != "": 
                    
                    # Remove 'a' and 'and' if found at end of phrase. 
                    if phrase[-1] in ('a', 'and'): 
                        phrase = phrase.rsplit(' ', 1)[0]
                    if word not in integers: 
                        int_price = text2int(phrase) 
                    else: 
                        int_price = int(phrase)
    
                price = int_price
                
            # If price within range of thresholds take context and do importance analysis. 
                if price > lower_threshold and price < upper_threshold: 
                    outer_text = str(transcript_sentences[lower_limit:1]) + str(transcript_sentences[i+1:upper_limit])
                    score = price_importance_analysis(transcript_sentences[i], phrase)
                    context = [transcript_sentences[lower_limit:i], transcript_sentences[i].upper(), transcript_sentences[i+1:upper_limit]]
                    if score >= max_score: 
                        max_score = score 
                        transcript_price = price
                        transcript_score = score 
                        transcript_context = context
                phrase = "" 
                price = 0
                int_price = 0
        i += 1
        
    # If score above threshold, make a guess of what package the pricing is associated with. 
    if transcript_score > 0: 
        guess = make_guess(transcript_price) 
    else: 
        guess = 'NONE' 
        
        # Prices formed with scores less than threhold are marked with "*"
        transcript_price = str(transcript_price) + "*"
    return [transcript_score, transcript_price, transcript_context, guess]
    
    

In [89]:
# Inputs: 
# Purpose: Controls file handling, text transformations, and function calling. 
# Returns: Dataframe

def main():
    rows_list = [] 
    error_count = 0 
    
    # Open necessary files
    audio_directory = "/Users/mmanle240/Downloads/nlp_git/Audio_Transcription/Amazon/Transcripts2"
    for filename in os.listdir(audio_directory): 
        if filename.endswith(".txt"):
            transcript_file_path = audio_directory + "/" + filename
            transcript_file = open(transcript_file_path, "r")
            transcript_contents = transcript_file.read().lower() 
            
            # Perform text transformations 
            transcript_contents = transcript_contents.replace(',', '')
            transcript_contents = replace_all(transcript_contents, text_transformation_dic)
            transcript_sentences = tokenizer.tokenize(transcript_contents) 
            
            # Calls rest of flow to extract dataframe. 
            data = extract_info(transcript_sentences)  
            rows_list.append(data)
            transcript_file.close() 
    df = pd.DataFrame(rows_list)
    df.columns = ['Score', 'Price', 'Context', 'Guess']
    return df


In [90]:
df = main()

In [91]:
df

Unnamed: 0,Score,Price,Context,Guess
0,1,39,"[[she's looking., picks it.], SHE SAID I'LL GIVE YOU A HUNDRED MEGABYTES PLUS FOR THIRTY NINE BECAUSE I HAVE TWENTY NINE DOLLARS A MONTH NOW., [not okay., she said she would work in about twenty minutes.]]","(HSD ONLY, HSD/XH, HSD/XM, HSD/XH/XM)"
1,0,0*,,NONE
2,6,124.99,"[[alright., so thank you again for being patient.], I DO HAVE THE FIRST FULL TYPE PACKAGE FOR ONE HUNDRED TWENTY FOUR DOLLARS AND NINETY NINE CENTS TO TWENTY FOUR MONTHS., [i backdated it to december twenty fourth two thousand eighteen., you'll get the uh digital er channel lineup up two hundred and fifty channels.]]","(VIDEO/HSD/CDV, VIDEO/HSD/CDV/XM)"
3,2,80,"[[you're working now., how how much is the offer again?], UM I TRIED TO OFFER HIM THIRTY DOLLARS BUT THEY ARE ABLE TO REVIEW THE HOUSE SIXTY TWENTY AS WELL., [but on the validation um it's really a valid charge right?, yes.]]",VIDEO/CDV
4,6,124.99,"[[when you have the customer before submitting your order i want to spend a few minutes confirming with you accurately capture the services you want., i'll ask you to review the order and then if it's correct please approve it.], YOU WANT ORDERING THE X ONE FOR FOR TRIPLE TWENTY PACKAGE FOR AN INTRODUCTORY PRICE OF ONE HUNDRED TWENTY FOUR DOLLARS AND NINETY NINE CENTS PER MONTH FOR TWENTY FOUR MONTHS., [it includes digital preferred digital preferred stars stream picks digitalized outlets s v. c. did you laugh?, i hope you have a s v c. did you live out yet?]]","(VIDEO/HSD/CDV, VIDEO/HSD/CDV/XM)"
5,1,90,"[[looks like this is going to come out quite a little bit lower than probably what you were paying before but i've got a couple of areas here that i need to clear up., so is it okay to call you back?], UM MISS E A. I'M LOOKING AT IT BEING SOMEWHERE AROUND ABOUT EIGHTY OR NINETY DOLLARS A MONTH YOU KNOW IS WHAT I'M ESTIMATING IT TO BE CAN I CALL YOU BACK?, [is it the seven?, six five six four two six two nine zero yes i'll be there alright give me about maybe five or ten minutes and i when you back let me clear this air out for me okay?]]",VIDEO/HSD/XM
6,4,120,"[[okay i want to make sure that there isn't any extra options., yeah so that would be the best one.], IT'S A HUNDRED AND TWENTY DOLLARS A MONTH UM FOR THE TRIPLE-PLAY AND LET ME SEE ONE MORE THING., [what?, okay?]]","(VIDEO/HSD/CDV, VIDEO/HSD/CDV/XM)"
7,2,70,"[[a lincoln out that he said it a also write., you start your emails or just like it made it on when you got home so we can have this for you straight.], AND I JUST WANT TO SET THE EXPECTATION THAT TO YOUR ACCOUNT IS BEING BILLED IN EVERY SEVENTY EACH MONTH YOU CAN BUY THE WE DOWNLOAD EXPANDING THE X Y ABSOLUTELY., [she could manage your wife my name and password at the same time., once again.]]",VIDEO/CDV
8,6,144.99,"[[and if you mean because right now up and checking here together lives easier., the phone service.], AND THEY SAID YOU'RE BEING I ON ONE FORTY FOUR NINETY NINE SO I CAN GIVE A PROMOTIONAL PRICE FOR OUR REFER BACK ISSUE FOR ONE., [twenty four ninety nine., this already includes.]]","(VIDEO/HSD/CDV, VIDEO/HSD/CDV/XM)"
9,0,0*,,NONE


In [97]:
output_string = "" 
context_list = df['Context'].tolist() 
for context in context_list: 
    context = str(context)
    context = context.lower()
    context = context.translate(str.maketrans('', '', string.punctuation))
    output_string = output_string + context
print(output_string)

shes looking picks it she said ill give you a hundred megabytes plus for thirty nine because i have twenty nine dollars a month now not okay she said she would work in about twenty minutesalright so thank you again for being patient i do have the first full type package for one hundred twenty four dollars and ninety nine cents to twenty four months i backdated it to december twenty fourth two thousand eighteen youll get the uh digital er channel lineup up two hundred and fifty channelsyoure working now how how much is the offer again um i tried to offer him thirty dollars but they are able to review the house sixty twenty as well but on the validation um its really a valid charge right yeswhen you have the customer before submitting your order i want to spend a few minutes confirming with you accurately capture the services you want ill ask you to review the order and then if its correct please approve it you want ordering the x one for for triple twenty package for an introductory pri

In [98]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()

output_string_split = output_string.split()
for i in range(len(output_string_split)): 
    output_string_split[i] = ps.stem(output_string_split[i])


word_count_dict = {} 
for word in output_string_split: 
    if word not in word_count_dict: 
        word_count_dict[word] = 1 
    elif word in word_count_dict: 
        word_count_dict[word] += 1 

for key, value in sorted(word_count_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

megabyt: 1
minutesalright: 1
patient: 1
first: 1
type: 1
backdat: 1
decemb: 1
fourth: 1
er: 1
channelsyour: 1
tri: 1
him: 1
abl: 1
yeswhen: 1
submit: 1
spend: 1
confirm: 1
accur: 1
captur: 1
introductori: 1
outlet: 1
laugh: 1
hope: 1
yetlook: 1
littl: 1
coupl: 1
miss: 1
somewher: 1
around: 1
zero: 1
air: 1
okayokay: 1
isnt: 1
option: 1
best: 1
tripleplay: 1
more: 1
okaya: 1
lincoln: 1
write: 1
email: 1
straight: 1
set: 1
expect: 1
buy: 1
download: 1
expand: 1
y: 1
absolut: 1
could: 1
manag: 1
wife: 1
name: 1
password: 1
againand: 1
easier: 1
phone: 1
our: 1
refer: 1
issu: 1
includestwo: 1
untold: 1
region: 1
mostli: 1
replac: 1
discount: 1
contract: 1
gone: 1
product: 1
accountyeah: 1
plug: 1
new: 1
system: 1
card: 1
should: 1
knowil: 1
teen: 1
news: 1
santa: 1
mari: 1
entertain: 1
cloudi: 1
r: 1
hour: 1
xfiniti: 1
p: 1
minton: 1
monththey: 1
veri: 1
secular: 1
even: 1
pacif: 1
quarter: 1
radio: 1
most: 1
amaz: 1
support: 1
secur: 1
mandatori: 1
consid: 1
separationso: 1
finidi: 1
ed: 

In [102]:
audio_directory = "/Users/mmanle240/Downloads/nlp_git/Audio_Transcription/Amazon/Transcripts2"
all_transcripts = ""
transcript_dict = {} 
for filename in os.listdir(audio_directory): 
    if filename.endswith(".txt"):
        transcript_file_path = audio_directory + "/" + filename
        transcript_file = open(transcript_file_path, "r")
        transcript_contents = transcript_file.read().lower().translate(str.maketrans('', '', string.punctuation))
        all_transcripts = all_transcripts + transcript_contents
all_transcripts_split = all_transcripts.split()  
for i in range(len(all_transcripts_split)): 
    all_transcripts_split[i] = ps.stem(all_transcripts_split[i])
for word in all_transcripts_split: 
    if word not in transcript_dict: 
        transcript_dict[word] = 1 
    elif word in transcript_dict: 
        transcript_dict[word] += 1
for key, value in sorted(transcript_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

boot: 1
attempt: 1
soft: 1
stabil: 1
repress: 1
achiev: 1
straighten: 1
rightthank: 1
sarah: 1
race: 1
instil: 1
pace: 1
huge: 1
cynic: 1
amp: 1
vote: 1
bridg: 1
rerun: 1
moca: 1
task: 1
result: 1
sro: 1
goin: 1
gogood: 1
jennif: 1
brize: 1
attend: 1
arculli: 1
steal: 1
tarp: 1
retent: 1
ivr: 1
morgan: 1
nawawi: 1
austin: 1
fiction: 1
frustrat: 1
writer: 1
clean: 1
changeabl: 1
lettuc: 1
basket: 1
dead: 1
proof: 1
passport: 1
absenc: 1
confid: 1
screenshot: 1
instantli: 1
momentarili: 1
voluntarili: 1
airhead: 1
pam: 1
confirmatori: 1
neighbor: 1
preview: 1
profess: 1
comedian: 1
dictat: 1
immun: 1
supervis: 1
easter: 1
chine: 1
philippin: 1
realiz: 1
manual: 1
licens: 1
novat: 1
kenneth: 1
ty: 1
defect: 1
disput: 1
valley: 1
georg: 1
troop: 1
investig: 1
ansel: 1
scf: 1
forit: 1
mix: 1
harmon: 1
van: 1
si: 1
lump: 1
xt6: 1
invis: 1
dec: 1
morbid: 1
mornin: 1
backup: 1
anag: 1
taki: 1
worn: 1
bind: 1
eah: 1
cloudier: 1
compli: 1
mortgag: 1
mikey: 1
frick: 1
lew: 1
grandpar: 1
vester: 1

In [103]:
charge_keys = ['bill', 'billing', 'billed', 'tax', 'taxes', 'taxed',
               'price', 'rate', 'rates'
              ] 

gen_package_keys = ['bundle', 'bundles',  'bundled', 'promotion', 'promotions', 'promotional', 
                    'premium', 'premiums', 'service', 'services', 'serviceable', 'unlimited', 
                    'basic', 'starter', 'extreme', 'equipment', 'equipments'
                   ]

finalization_keys = ['order', 'ordering', 'approve', 'approved', 'approval', 
                     'reviewed', 'reviewing', 'validation']

additional_xh_keys = ['homeservices']

additional_stream_keys = ['livestreaming', 'streamlined']

additional_wifi_keys = ['online', 'connection', 'connections']

additional_phone_keys = ['voice']

additional_tv_keys = ['sport', 'sports', 'sporting'] 

new_keys = charge_keys + gen_package_keys + finalization_keys + additional_stream_keys + \
           additional_wifi_keys + additional_phone_keys + additional_tv_keys 

print(new_keys)
print(len(new_keys))

['bill', 'billing', 'billed', 'tax', 'taxes', 'taxed', 'price', 'rate', 'rates', 'bundle', 'bundles', 'bundled', 'promotion', 'promotions', 'promotional', 'premium', 'premiums', 'service', 'services', 'serviceable', 'unlimited', 'basic', 'starter', 'extreme', 'equipment', 'equipments', 'order', 'ordering', 'approve', 'approved', 'approval', 'reviewed', 'reviewing', 'validation', 'livestreaming', 'streamlined', 'online', 'connection', 'connections', 'voice', 'sport', 'sports', 'sporting']
43
