In [7]:
import re
from collections import Counter

# STEP 1: Load the corpus
# NOTE(review): absolute, machine-specific path; adjust it when running this
# notebook on another machine.
corpus_path = r"C:\voice_search_project_2\01_data\corpus.txt"

# Read the file once, strip surrounding whitespace from every line, and keep
# only the lines that still contain text.
with open(corpus_path, "r", encoding="utf-8") as f:
    stripped_lines = (raw_line.strip() for raw_line in f)
    corpus_lines = [text for text in stripped_lines if text]

print(f"Loaded {len(corpus_lines)} lines from corpus.")
# Guard the preview: an empty (or all-blank) corpus file would otherwise make
# corpus_lines[0] raise IndexError.
if corpus_lines:
    print("Sample line:", corpus_lines[0])
else:
    print("Sample line: <corpus is empty>")


Loaded 183 lines from corpus.
Sample line: We show for the first time that learning powerful representations from speech


In [8]:
# STEP 2: Stopword vocabulary, kept inline so the notebook does not need NLTK.
# The words are lower-case and whitespace-separated; splitting the block of
# text yields the same mutable set as an explicit set literal would.
stop_words = set(
    """
    i me my myself we our ours ourselves you
    your yours yourself yourselves he him his himself
    she her hers herself it its itself they them
    their theirs themselves what which who whom this
    that these those am is are was were be been
    being have has had having do does did doing
    a an the and but if or because as until
    while of at by for with about against between
    into through during before after above below to
    from up down in out on off over under again
    further then once here there when where why how
    all any both each few more most other some
    such no nor not only own same so than too
    very s t can will just don should now
    """.split()
)


In [9]:
# STEP 3: Define search function using regex
def search_corpus(transcription, corpus_lines, top_n=5, stopwords=None):
    """Rank corpus lines by how many distinct query keywords they contain.

    The transcription is lower-cased and split into word tokens; tokens found
    in the stopword collection are dropped and the rest become the keywords.
    A line's score is the number of distinct keywords that occur in it as
    whole words (case-insensitive). Lines with a score of zero are omitted.

    Args:
        transcription: Query text, e.g. the output of a speech recognizer.
        corpus_lines: Iterable of strings to search.
        top_n: Maximum number of lines to return. ``None`` returns every
            matching line; zero or a negative value returns an empty list.
        stopwords: Optional collection of lower-case words to ignore. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        A list of at most ``top_n`` matching lines, highest score first.
        Lines with equal scores keep their order from ``corpus_lines``.
    """
    if stopwords is None:
        # Fall back to the notebook-wide vocabulary defined in an earlier cell.
        stopwords = stop_words

    # A negative limit would otherwise slice from the end of the ranking
    # (matches[:-1]), which is never what a "top N" caller wants.
    if top_n is not None and top_n <= 0:
        return []

    # Extract keywords (remove stopwords). dict.fromkeys drops repeated words
    # so that a word said twice in the query is not counted twice per line.
    tokens = re.findall(r'\b\w+\b', transcription.lower())
    keywords = list(dict.fromkeys(tok for tok in tokens if tok not in stopwords))

    if not keywords:
        return []

    # Compile one whole-word pattern per keyword up front instead of
    # rebuilding the pattern for every corpus line.
    patterns = [re.compile(rf"\b{re.escape(kw)}\b") for kw in keywords]

    # Count keyword matches in each line
    matches = []
    for line in corpus_lines:
        line_lower = line.lower()
        count = sum(1 for pattern in patterns if pattern.search(line_lower))
        if count > 0:
            matches.append((line, count))

    # Sort by match count; list.sort is stable, so ties keep corpus order.
    matches.sort(key=lambda x: x[1], reverse=True)
    return [line for line, _ in matches[:top_n]]


In [10]:
# STEP 4: Run a sample query through the search function and list the hits
transcription = "learning from speech representation"
results = search_corpus(transcription, corpus_lines)

# Numbered listing of the returned lines, best match first.
print("\nSearch Results:")
for position, hit in enumerate(results, start=1):
    print(f"{position}. {hit}")



Search Results:
1. We show for the first time that learning powerful representations from speech
2. listening to adults around them - a process that requires learning good representations of speech.
3. Our results show that jointly learning discrete speech units with contextualized representations
4. to speech recognition. The training accuracy of identifying the correct latent audio representation
5. better results (see Appendix F for this experiment and other ablations on various hyperparameters). We presented wav2vec 2.0, a framework for self-supervised learning of speech representations which
