# Transcript Word Counts
By: Jonathan Lo<br>
Date: 10/3/23<br>
Goal is to retrieve all transcripts and find the words with the highest counts. This is after removing unnecessary words.

## Overhead

In [1]:
# Imports
import os
import re
import pandas as pd
from glob import glob
from collections import Counter

# Stop Words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords

In [2]:
# Load ignored words
with open("ignore.txt", 'r') as fh:
    LOCAL_STOP_WORDS = fh.readlines()[0]
    LOCAL_STOP_WORDS = LOCAL_STOP_WORDS.split(',')

try:
    NLTK_STOP_WORDS = set(stopwords.words('english'))
except LookupError as e:
    import nltk
    nltk.download('stopwords')
    NLTK_STOP_WORDS = set(stopwords.words('english'))

NLTK_STOP_WORDS = [word.replace("'", '') for word in NLTK_STOP_WORDS]
ENGLISH_STOP_WORDS = [word.replace("'", '') for word in ENGLISH_STOP_WORDS]

## Analysis

In [3]:
# Fetch all transcripts
corpus = []
for iv_dir in glob("../../*/*.vtt"):
    try:
        with open(os.path.join(iv_dir), 'r') as file:
            corpus.append(file.read())
    except Exception as e:
        print(f".vtt file not found in dir {iv_dir}")


In [4]:
# Clean timestamsp and meta information
master_transcript = ""
for transcript in corpus:
    clean_transcript = re.sub(r'\d{2}:\d{2}:\d{2}.\d{3} --> \d{2}:\d{2}:\d{2}.\d{3}\n', '', transcript)
    clean_transcript = re.sub(r'\n\d+\n', '\n', clean_transcript)
    clean_transcript = clean_transcript\
                        .replace('\ufeffWEBVTT', '')\
                        .replace('\n', '')\
                        .replace('<v ->', '')\
                        .replace('</v>', '')\
                        .replace("'", '')\
                        .replace(',', ' ')\
                        .replace('.', ' ')\
                        .lower()
    master_transcript += " " + clean_transcript

In [5]:
# Clean ignored words from master transcript
filtered_words = list(filter(lambda w: not w in ENGLISH_STOP_WORDS, master_transcript.split()))
filtered_words = list(filter(lambda w: not w in NLTK_STOP_WORDS, filtered_words))
filtered_words = list(filter(lambda w: not w in LOCAL_STOP_WORDS, filtered_words))


In [6]:
# Gather top word counts
word_counts = Counter(filtered_words)
df = pd.DataFrame(word_counts.most_common(75), columns=['Word', 'Count'])
df

Unnamed: 0,Word,Count
0,data,606
1,file,546
2,code,529
3,expanse,473
4,running,444
...,...,...
70,machine,92
71,hardware,92
72,version,91
73,model,90


In [7]:
# Save data
df.to_csv("transcript_word_counts.csv", index=False)