# Keyword Spotting

#### Ryan Bales (@ryanbales)<br>ryan@balesofdata.com

***

### Import Packages

In [1]:
import gensim
import json

### Define Constants

In [2]:
# Root folder of the debate data set, and the subfolder holding the
# transcription JSON files. Both keep their trailing slash so that file
# names can be appended directly.
data_path = "data/2016_debates/"
transcription_data_path = data_path + "transcripts/"

### Define Helper Functions

In [3]:
def get_text(file_list):
    """Load the transcript text from each transcription JSON file.

    Every file is parsed as JSON and the string stored under
    ``["results"]["transcripts"][0]["transcript"]`` is collected.

    Args:
        file_list: Iterable of paths to transcription JSON files.

    Returns:
        A list with one transcript string per input path, in input order.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a file does not contain valid JSON.
        KeyError, IndexError: If the expected keys/entries are missing.
    """
    data = []

    for file_path in file_list:
        # JSON text is UTF-8 by specification; decode it explicitly rather
        # than relying on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            transcript = json.load(f)

        data.append(transcript["results"]["transcripts"][0]["transcript"])

    return data

### Load Transcription Data

In [4]:
# Full paths of the four transcription files, in the order they are
# reported later on.
files = [
    transcription_data_path + file_name
    for file_name in (
        "debate_1.mp3.json",
        "debate_2.mp3.json",
        "debate_3.mp3.json",
        "vp_debate.mp3.json",
    )
]

# One transcript string per file.
docs = get_text(files)

### Preprocess Documents (Remove Stop Words and Run Stemming)

In [5]:
# Cleaning steps applied, in this order, to every sentence.
preprocessing = gensim.parsing.preprocessing
cleaning_steps = (
    preprocessing.remove_stopwords,
    preprocessing.stem_text,
    preprocessing.strip_punctuation,
    preprocessing.strip_multiple_whitespaces,
)

# processed_docs[i] is the list of cleaned sentences of docs[i].
processed_docs = []

for doc in docs:
    cleaned_sentences = []

    # NOTE(review): gensim.summarization is not available in gensim >= 4.0;
    # this cell presumably targets gensim 3.x -- confirm the pinned version.
    for sentence in gensim.summarization.textcleaner.get_sentences(doc):
        for step in cleaning_steps:
            sentence = step(sentence)

        cleaned_sentences.append(sentence)

    processed_docs.append(cleaned_sentences)

### Specify Keyword

In [6]:
# Keyword to look for in every transcript. It is normalised further down with
# the same preprocessing steps as the documents, and the search step only
# supports phrases that reduce to a single word.
search_phrase = "taxes"

### Run the Same Preprocessing Steps on the Keyword

In [7]:
# Normalise the keyword exactly like the document sentences so that both
# sides are compared in the same form. Innermost call runs first:
# stop words -> stemming -> punctuation -> whitespace.
search_phrase = gensim.parsing.preprocessing.strip_multiple_whitespaces(
    gensim.parsing.preprocessing.strip_punctuation(
        gensim.parsing.preprocessing.stem_text(
            gensim.parsing.preprocessing.remove_stopwords(search_phrase)
        )
    )
)

# Display the normalised keyword as the cell output.
search_phrase

'tax'

### Search for Keywords in each Document

In [8]:
from collections import Counter

def search_keyword(doc, search_phrase):
    """Count the tokens of a document that equal the search phrase.

    Args:
        doc: Iterable of sentence strings that make up one document.
        search_phrase: Single token to look for; compared for exact equality.

    Returns:
        The number of matching tokens, 0 when none match.
    """
    joined_sentences = ' '.join(doc)

    return sum(
        1
        for token in gensim.utils.tokenize(joined_sentences)
        if token == search_phrase
    )

In [9]:
# Only single-word phrases are supported. The phrase does not change while
# scanning, so validate it once up front instead of once per document.
search_phrase_length = len(search_phrase.split())

if search_phrase_length != 1:
    # ValueError is still caught by any existing `except Exception` handler.
    raise ValueError("Unsupported Search Phrase")

# Documents are reported with 1-based indices.
for doc_index, doc in enumerate(processed_docs, start=1):
    phrase_count = search_keyword(doc, search_phrase)

    print("Found {} usages of '{}' in document index {}".format(phrase_count, search_phrase, doc_index))

Found 43 usages of 'tax' in document index 1
Found 32 usages of 'tax' in document index 2
Found 25 usages of 'tax' in document index 3
Found 44 usages of 'tax' in document index 4
