# Transcript Analysis

In [18]:
import os
import re
import sys
import tqdm
import pickle
import datetime
import pandas as pd
from collections import defaultdict

def find_transcripts(author_name):
    """Return the paths of every entry in the first transcript folder matching an author.

    Scans the immediate children of ``_Transcripts`` (relative to the current
    working directory) and selects the first sub-directory whose name
    contains ``author_name``.

    Args:
        author_name: Substring to look for in the sub-directory names.

    Returns:
        A list with one path per entry of the matching sub-directory (entries
        are not filtered by extension), or an empty list when no
        sub-directory matches.
    """
    transcript_dir_path = '_Transcripts'
    # Named `entry` rather than `dir` so the builtin is not shadowed.
    for entry in os.listdir(transcript_dir_path):
        entry_path = os.path.join(transcript_dir_path, entry)
        # Only directories whose name contains the author are candidates.
        if not (os.path.isdir(entry_path) and author_name in entry):
            continue
        print(f'Found {author_name} in {transcript_dir_path}/{entry}.')
        transcript_paths = [
            os.path.join(entry_path, file_name)
            for file_name in os.listdir(entry_path)
        ]
        print(f' Found {len(transcript_paths)} transcripts.')
        return transcript_paths
    print(' Transcript Path not Found author:', os.path.join(os.getcwd(), author_name))
    # Return an empty list instead of implicitly returning None, so callers
    # can iterate over the result without a special case.
    return []
		
def read_transcript(transcript_name, transcript_dict):
    """Parse one markdown transcript file into ``transcript_dict``.

    Each line is classified by its prefix:

    * ``#``  - title line; the text after the ``#`` becomes the key under
      which everything that follows is stored.
    * ``**`` - metadata line; the date is read from between ``[`` and ``]``
      (format ``December 6, 2023``) and the url from between ``(`` and ``)``.
    * anything else - body line; its first three characters are dropped, the
      rest is stripped and appended to the ``'transcript'`` list.

    Args:
        transcript_name: Path of the transcript file to read.
        transcript_dict: Mapping of title -> per-transcript mapping. Entries
            are indexed without being created first, so the mapping has to
            create them on demand (e.g. a ``defaultdict`` whose values are
            ``defaultdict(list)``).

    Returns:
        The same ``transcript_dict`` object, updated in place.

    Raises:
        ValueError: If content appears before any title line, if a metadata
            line lacks the ``[date]`` or ``(url)`` part, or if the date does
            not match the expected format.
    """
    title = None
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(transcript_name, 'r', encoding='utf-8') as f:
        print(f'  Reading: {transcript_name}')
        for line_number, line in enumerate(f, start=1):
            if line.startswith('#'):
                title = line[1:].strip()
                continue
            if title is None:
                # Without a title there is no key to store this line under.
                raise ValueError(
                    f'{transcript_name}:{line_number}: content found before the "#" title line'
                )
            if line.startswith('**'):
                date_match = re.search(r'\[(.*?)\]', line)
                url_match = re.search(r'\((.*?)\)', line)
                if date_match is None or url_match is None:
                    raise ValueError(
                        f'{transcript_name}:{line_number}: expected "[date](url)" in metadata line'
                    )
                # Convert the date (December 6, 2023) into zero-padded parts.
                parsed = datetime.datetime.strptime(date_match.group(1), '%B %d, %Y')
                entry = transcript_dict[title]
                entry['year'] = parsed.strftime('%Y')
                entry['month'] = parsed.strftime('%m')
                entry['day'] = parsed.strftime('%d')
                entry['url'] = url_match.group(1)
            else:
                # Drop the three-character line prefix and surrounding whitespace.
                transcript_dict[title]['transcript'].append(line[3:].strip())
    return transcript_dict

def dict_to_pandas(transcript_dict):
    """Build a DataFrame with one row per transcript, newest first.

    Args:
        transcript_dict: Mapping of title -> mapping that is indexed with the
            keys ``'year'``, ``'month'``, ``'day'``, ``'url'`` and
            ``'transcript'``.

    Returns:
        A DataFrame with the columns ``title, year, month, day, url,
        transcript``, sorted by year, month and day in descending order and
        carrying a fresh 0..n-1 index. An empty mapping yields an empty frame
        with the same columns.
    """
    columns = ['title', 'year', 'month', 'day', 'url', 'transcript']
    # Collect every row first and build the frame once: growing a DataFrame
    # with pd.concat inside the loop re-copies all earlier rows on each
    # iteration and concatenates onto an empty frame.
    rows = [
        {
            'title': title,
            'year': content['year'],
            'month': content['month'],
            'day': content['day'],
            'url': content['url'],
            'transcript': content['transcript'],
        }
        for title, content in transcript_dict.items()
    ]
    transcript_df = pd.DataFrame(rows, columns=columns)
    # Sort by year, month, day and reset the index.
    return transcript_df.sort_values(by=['year', 'month', 'day'], ascending=False, ignore_index=True)

def pickle_transcript_df(transcript_df, author_name):
    """Serialise ``transcript_df`` to ``_Dataframes/<author_name>_transcript_df.pickle``.

    The ``_Dataframes`` directory (relative to the current working
    directory) is created when it does not exist yet. Progress is reported
    on stdout.

    Args:
        transcript_df: Object to pickle.
        author_name: Used as the prefix of the pickle file name.
    """
    print(f'Pickling {author_name} transcript_df.')
    transcript_df_name = f'{author_name}_transcript_df.pickle'
    dataframe_dir_path = '_Dataframes'
    target_path = os.path.join(dataframe_dir_path, transcript_df_name)
    # Make sure the output directory is there before opening the file.
    directory_missing = not os.path.exists(dataframe_dir_path)
    if directory_missing:
        os.makedirs(dataframe_dir_path)
    with open(target_path, mode='wb') as handle:
        pickle.dump(transcript_df, handle)
        print(f' Pickled {transcript_df_name} in {dataframe_dir_path}/.')

def main(author_name='New York Times'):
    """Load, tabulate and pickle every markdown transcript of one author.

    Args:
        author_name: Substring identifying the author's transcript folder;
            also used as the prefix of the pickle file name. Defaults to the
            value that used to be hard-coded here.

    Returns:
        The DataFrame produced by ``dict_to_pandas`` for the transcripts
        that were read.
    """
    # `or []` guards against a falsy result (None or an empty list) when no
    # transcript folder matched, so the loop below is simply skipped.
    transcripts = find_transcripts(author_name) or []
    transcript_dict = defaultdict(lambda: defaultdict(list))
    for transcript in transcripts:
        # Test the extension only: a substring check on the whole path would
        # also accept names such as 'notes.md.bak', or every file inside a
        # folder whose name happens to contain '.md'.
        if transcript.endswith('.md'):
            read_transcript(transcript, transcript_dict)
    transcript_df = dict_to_pandas(transcript_dict)
    pickle_transcript_df(transcript_df, author_name)
    return transcript_df


transcript_df = main()

Found New York Times in _Transcripts/New York Times Podcasts.
 Found 8 transcripts.
  Reading: _Transcripts/New York Times Podcasts/20240320_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240319_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240326_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240318_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240321_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240324_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240322_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240325_New York Times Podcasts.md
Pickling New York Times transcript_df.
 Pickled New York Times_transcript_df.pickle in _Dataframes/.


In [19]:
# find all the transcripts that contain the word (i.e. 'vaccine') including upper and lower case
def filter_df_by_word(df, word):
    """Return the rows of ``df`` whose transcript contains ``word``, ignoring case.

    A row matches when ``word`` (lower-cased) is a substring of at least one
    lower-cased line in that row's ``'transcript'`` list. The number of
    matches is reported on stdout.

    Args:
        df: DataFrame with a ``'transcript'`` column holding lists of strings.
        word: Text to search for; compared case-insensitively.

    Returns:
        A new DataFrame with the matching rows of ``df``, keeping their
        original index labels and their original (not lower-cased) text.
    """
    word = word.lower()
    # Select rows by position with a plain boolean array. Going through the
    # index labels of a filtered copy would repeat rows whenever `df` has
    # duplicate index labels, and needs a lower-cased copy of the frame.
    mask = df['transcript'].apply(
        lambda lines: any(word in line.lower() for line in lines)
    ).to_numpy(dtype=bool)
    filtered_df = df.loc[mask]
    print(f' Found {len(filtered_df)}/{len(df)} transcripts containing the word "{word}".')
    return filtered_df

# Transcripts mentioning the substring 'Michael B' (matched case-insensitively).
word = 'Michael B'
michael_df = filter_df_by_word(transcript_df, word)
# Same filter for the substring 'Sabrina T'; note that `word` is rebound here.
word = 'Sabrina T'
sabrina_df = filter_df_by_word(transcript_df, word)

 Found 4/8 transcripts containing the word "michael b".
 Found 4/8 transcripts containing the word "sabrina t".
