# Transcript Analysis

In [66]:
import os
import re
import sys
import tqdm
import pickle
import datetime
import pandas as pd
from collections import defaultdict

def find_transcripts(author_name, transcript_dir_path='_Transcripts'):
	"""Locate the transcript files belonging to an author.

	Scans ``transcript_dir_path`` for the first sub-directory (in sorted
	order) whose name contains ``author_name`` and lists every file in it.

	Args:
		author_name: Substring to look for in the sub-directory names.
		transcript_dir_path: Root directory holding one sub-directory per
			author. Defaults to ``'_Transcripts'`` relative to the current
			working directory.

	Returns:
		A list of paths (root/sub-directory/file), sorted by file name in
		descending order. An empty list is returned when the root directory
		does not exist or no sub-directory matches, so callers can always
		iterate over the result.
	"""
	# A missing root directory is treated like "no match" instead of raising.
	# Entries are sorted so the chosen directory does not depend on the
	# arbitrary order in which os.listdir reports them.
	entries = sorted(os.listdir(transcript_dir_path)) if os.path.isdir(transcript_dir_path) else []
	for entry in entries:
		candidate = os.path.join(transcript_dir_path, entry)
		# only directories whose name contains author_name qualify
		if os.path.isdir(candidate) and author_name in entry:
			print(f'Found {author_name} in {transcript_dir_path}/{entry}.')
			transcript_paths = [
				os.path.join(candidate, file_name)
				for file_name in sorted(os.listdir(candidate), reverse=True)
			]
			print(f' Found {len(transcript_paths)} transcripts.')
			return transcript_paths
	print(' Transcript Path not Found author:', os.path.join(os.getcwd(), author_name))
	return []
		
def read_transcript(transcript_name, transcript_dict):
	"""Parse one transcript file into ``transcript_dict``.

	The file is read line by line:

	* a line starting with ``#`` sets the current title;
	* a line starting with ``**`` is the metadata line, holding the date
	  between ``[`` and ``]`` (e.g. ``December 6, 2023``) and the URL
	  between ``(`` and ``)``;
	* every other line is transcript text; its first three characters are
	  dropped and the rest is stripped before being appended.

	Args:
		transcript_name: Path of the transcript file to read.
		transcript_dict: Mapping ``title -> entry`` that is updated in place.
			Each entry must create missing keys on access (as the nested
			``defaultdict(list)`` built by the caller does).

	Returns:
		The same ``transcript_dict`` object, with ``year``, ``month``,
		``day``, ``weekday``, ``url`` and ``transcript`` filled in for the
		title found in the file.

	Raises:
		ValueError: If content appears before any title line, if the metadata
			line lacks the bracketed date or the parenthesised URL, or if the
			date is not in ``'%B %d, %Y'`` format.
	"""
	title = None
	# Decode explicitly as UTF-8: the titles contain typographic quotes, which
	# a platform-dependent default encoding may fail to decode or garble.
	with open(transcript_name, 'r', encoding='utf-8') as f:
		print(f'  Reading: {transcript_name}')
		for line in f:
			# the title line starts with '#'
			if line.startswith('#'):
				title = line[1:].strip()
				continue
			if title is None:
				raise ValueError(f'{transcript_name}: content found before the "#" title line')
			entry = transcript_dict[title]
			if line.startswith('**'):
				date_match = re.search(r'\[(.*?)\]', line)
				url_match = re.search(r'\((.*?)\)', line)
				if date_match is None or url_match is None:
					raise ValueError(f'{transcript_name}: malformed metadata line {line.strip()!r}')
				# parse the date once (December 6, 2023) and derive every field from it
				parsed = datetime.datetime.strptime(date_match.group(1), '%B %d, %Y')
				entry['year'] = parsed.strftime('%Y')
				entry['month'] = parsed.strftime('%m')
				entry['day'] = parsed.strftime('%d')
				entry['weekday'] = parsed.strftime('%A')
				entry['url'] = url_match.group(1)
			else:
				# drop the three leading formatting characters, then surrounding whitespace
				entry['transcript'].append(str(line[3:].strip()))
		return transcript_dict

def dict_to_pandas(transcript_dict):
	"""Convert the parsed transcripts into a DataFrame, newest first.

	Args:
		transcript_dict: Mapping ``title -> entry`` where each entry provides
			``year``, ``month``, ``day``, ``weekday``, ``url`` and
			``transcript``.

	Returns:
		A DataFrame with the columns ``title, year, month, day, weekday, url,
		transcript`` (one row per title), sorted by year, month and day in
		descending order with a fresh 0..n-1 index. An empty mapping yields an
		empty frame that still has all columns.
	"""
	columns = ['title', 'year', 'month', 'day', 'weekday', 'url', 'transcript']
	# Collect all rows first and build the frame once: concatenating inside the
	# loop copies the whole frame on every iteration and triggers the pandas
	# deprecation warning for concatenation with empty frames.
	rows = [
		{
			'title': title,
			'year': content['year'],
			'month': content['month'],
			'day': content['day'],
			'weekday': content['weekday'],
			'url': content['url'],
			'transcript': content['transcript'],
		}
		for title, content in transcript_dict.items()
	]
	transcript_df = pd.DataFrame(rows, columns=columns)
	# sort by year, month, day and reset the index
	return transcript_df.sort_values(by=['year', 'month', 'day'], ascending=False, ignore_index=True)

def pickle_transcript_df(transcript_df, author_name):
	"""Serialize ``transcript_df`` to ``_Dataframes/<author_name>_transcript_df.pickle``.

	The ``_Dataframes`` directory is created relative to the current working
	directory when it does not exist yet. Progress is reported on stdout.

	Args:
		transcript_df: Object to pickle (the transcript DataFrame).
		author_name: Used as the prefix of the pickle file name.
	"""
	print(f'Pickling {author_name} transcript_df.')
	out_dir = '_Dataframes'
	file_name = f'{author_name}_transcript_df.pickle'
	# create the output directory on first use
	if not os.path.exists(out_dir):
		os.makedirs(out_dir)
	with open(os.path.join(out_dir, file_name), 'wb') as out_file:
		pickle.dump(transcript_df, out_file)
		print(f' Pickled {file_name} in {out_dir}/.')

def main(author_name='New York Times'):
	"""Build, pickle and return the transcript DataFrame for one author.

	Args:
		author_name: Substring identifying the author's transcript directory.
			Defaults to ``'New York Times'``.

	Returns:
		The DataFrame produced by ``dict_to_pandas`` from every ``.md``
		transcript found for the author (empty when none were found).
	"""
	# Guard against a lookup that found nothing so the loop below never
	# iterates over None.
	transcripts = find_transcripts(author_name) or []
	transcript_dict = defaultdict(lambda: defaultdict(list))
	for transcript in transcripts:
		# Check the extension only: a substring test on the whole path would
		# also accept names such as 'notes.md.bak' or any file inside a
		# directory whose name contains '.md'.
		if transcript.endswith('.md'):
			read_transcript(transcript, transcript_dict)
	transcript_df = dict_to_pandas(transcript_dict)
	pickle_transcript_df(transcript_df, author_name)
	return transcript_df
	
# Run the pipeline once and keep the resulting DataFrame for the cells below.
transcript_df = main()

Found New York Times in _Transcripts/New York Times Podcasts.
 Found 22 transcripts.
  Reading: _Transcripts/New York Times Podcasts/20240326_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240325_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240324_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240322_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240321_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240320_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240319_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240318_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240317_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240315_New York Times Podcasts.md
  Reading: _Transcripts/New York Times Podcasts/20240314_New York Times Pod

In [67]:
transcript_df

Unnamed: 0,title,year,month,day,weekday,url,transcript
0,The United States vs. the iPhone,2024,3,26,Tuesday,https://www.youtube.com/watch?v=T8ptU0-dBMU,"[From the New York Times, I'm Sabrina Tavernis..."
1,A Terrorist Attack in Russia,2024,3,25,Monday,https://www.youtube.com/watch?v=2prryDGW2GE,"[From the New York Times, I'm Sabrina Tavernes..."
2,The Sunday Read: ‘My Goldendoodle Spent a Week...,2024,3,24,Sunday,https://www.youtube.com/watch?v=3o1XQePIMS4,"[Hey, I'm Sam Apple. I'm a contributor to the ..."
3,Chuck Schumer on His Campaign to Oust Israel’s...,2024,3,22,Friday,https://www.youtube.com/watch?v=at3wpDsTpIA,"[From the New York Times, I'm Michael Bobarro...."
4,The Caitlin Clark Phenomenon,2024,3,21,Thursday,https://www.youtube.com/watch?v=ucqbkS_UWDk,"[From the New York Times, I'm Sabrina Tavernis..."
5,The Bombshell Case That Will Transform the Hou...,2024,3,20,Wednesday,https://www.youtube.com/watch?v=ZUg6HVO0o9w,"[From the New York Times, I'm Michael Bobarro...."
6,Trump’s Plan to Take Away Biden’s Biggest Adva...,2024,3,19,Tuesday,https://www.youtube.com/watch?v=vPjyfsC5aPc,"[From New York Times, I'm Michael Bobauro. Thi..."
7,Your Car May Be Spying on You,2024,3,18,Monday,https://www.youtube.com/watch?v=6_avb_9q0S8,"[From the New York Times, I'm Sabrina Tavernis..."
8,"The Sunday Read: ‘Sure, It Won an Oscar. But I...",2024,3,17,Sunday,https://www.youtube.com/watch?v=UP5EIJsuYj8,[My name is Joshua Hunt and I'm a contributor ...
9,A Journey Through Putin’s Russia,2024,3,15,Friday,https://www.youtube.com/watch?v=mrneKV-wt0A,"[From the New York Times, I'm Sabrina Tavernis..."


In [68]:
def filter_df_by_word(df, word):
  """Return the rows whose transcript contains ``word``, ignoring case.

  Args:
    df: DataFrame with a ``transcript`` column holding a list of text lines
      per row.
    word: Text to search for (e.g. 'vaccine'); matched as a case-insensitive
      substring of any transcript line.

  Returns:
    The matching rows of ``df`` with their original index labels and
    untouched (not lowercased) transcripts. ``df`` itself is not modified.
  """
  word = word.lower()
  # Select rows positionally with a boolean mask. Going through index labels
  # (df.loc[labels]) would return extra rows when the index has duplicates,
  # and lowercasing a full copy of the frame is unnecessary.
  mask = df['transcript'].apply(
    lambda lines: any(word in line.lower() for line in lines)
  ).astype(bool).to_numpy()
  filtered_df = df.loc[mask]
  print(f' Found {len(filtered_df)}/{len(df)} transcripts containing the word "{word}".')
  return filtered_df

# Michael Barbaro episodes -- only the prefix 'Michael Bo' is matched because
# the transcripts spell the surname several ways (Bobarro, Bobauro, Bolvaro)
word = 'Michael Bo'
michael_df = filter_df_by_word(transcript_df, word)
michael_ids = michael_df.index

# Sabrina Tavernise episodes -- prefix match for the same reason
word = 'Sabrina Tav'
sabrina_df = filter_df_by_word(transcript_df, word)
sabrina_ids = sabrina_df.index

# Remaining episodes: rows that appear in neither of the two host selections
other_df = transcript_df.loc[
  ~(transcript_df.index.isin(michael_ids) | transcript_df.index.isin(sabrina_ids))
]

 Found 8/22 transcripts containing the word "michael bo".
 Found 9/22 transcripts containing the word "sabrina tav".


In [69]:
michael_df

Unnamed: 0,title,year,month,day,weekday,url,transcript
3,Chuck Schumer on His Campaign to Oust Israel’s...,2024,3,22,Friday,https://www.youtube.com/watch?v=at3wpDsTpIA,"[From the New York Times, I'm Michael Bobarro...."
5,The Bombshell Case That Will Transform the Hou...,2024,3,20,Wednesday,https://www.youtube.com/watch?v=ZUg6HVO0o9w,"[From the New York Times, I'm Michael Bobarro...."
6,Trump’s Plan to Take Away Biden’s Biggest Adva...,2024,3,19,Tuesday,https://www.youtube.com/watch?v=vPjyfsC5aPc,"[From New York Times, I'm Michael Bobauro. Thi..."
10,It Sucks to Be 33,2024,3,14,Thursday,https://www.youtube.com/watch?v=f0VU9D0pFjQ,"[From New York Times, I'm Michael Bobarro. Thi..."
12,Oregon Decriminalized Drugs. Voters Now Regret...,2024,3,12,Tuesday,https://www.youtube.com/watch?v=_jkr5J8quMA,"[From the New York Times, I'm Michael Bolvaro...."
13,The Billionaires’ Secret Plan to Solve Califor...,2024,3,11,Monday,https://www.youtube.com/watch?v=cfaxqsrbkAA,"[From New York Times, I'm Michael Bobarro. Thi..."
15,The State of the Union,2024,3,8,Friday,https://www.youtube.com/watch?v=CethFEJyN_c,"[From New York Times, I'm Michael Bobauro., Th..."
16,The Miseducation of Google’s A.I.,2024,3,7,Thursday,https://www.youtube.com/watch?v=8xLe-sxJBkA,"[From New York Times, I'm Michael Bobarro. Thi..."


In [70]:
sabrina_df

Unnamed: 0,title,year,month,day,weekday,url,transcript
0,The United States vs. the iPhone,2024,3,26,Tuesday,https://www.youtube.com/watch?v=T8ptU0-dBMU,"[From the New York Times, I'm Sabrina Tavernis..."
1,A Terrorist Attack in Russia,2024,3,25,Monday,https://www.youtube.com/watch?v=2prryDGW2GE,"[From the New York Times, I'm Sabrina Tavernes..."
4,The Caitlin Clark Phenomenon,2024,3,21,Thursday,https://www.youtube.com/watch?v=ucqbkS_UWDk,"[From the New York Times, I'm Sabrina Tavernis..."
7,Your Car May Be Spying on You,2024,3,18,Monday,https://www.youtube.com/watch?v=6_avb_9q0S8,"[From the New York Times, I'm Sabrina Tavernis..."
9,A Journey Through Putin’s Russia,2024,3,15,Friday,https://www.youtube.com/watch?v=mrneKV-wt0A,"[From the New York Times, I'm Sabrina Tavernis..."
11,The Alarming Findings Inside a Mass Shooter’s ...,2024,3,13,Wednesday,https://www.youtube.com/watch?v=fHLHB-di0wM,"[From the New York Times, I'm Sabrina Tavernis..."
17,The Unhappy Voters Who Could Swing the Election,2024,3,6,Wednesday,https://www.youtube.com/watch?v=FyFgxrkK7bI,"[From the New York Times, I'm Sabrina Tavernis..."
18,A Deadly Aid Delivery and Growing Threat of Fa...,2024,3,5,Tuesday,https://www.youtube.com/watch?v=6LGmzycioCk,"[From the New York Times, I'm Sabrina Tavernis..."
21,"Biden, Trump and a Split Screen at the Texas B...",2024,3,1,Friday,https://www.youtube.com/watch?v=UnO2dD9MFZE,"[From the New York Times, I'm Sabrina Tavernis..."


In [71]:
other_df

Unnamed: 0,title,year,month,day,weekday,url,transcript
2,The Sunday Read: ‘My Goldendoodle Spent a Week...,2024,3,24,Sunday,https://www.youtube.com/watch?v=3o1XQePIMS4,"[Hey, I'm Sam Apple. I'm a contributor to the ..."
8,"The Sunday Read: ‘Sure, It Won an Oscar. But I...",2024,3,17,Sunday,https://www.youtube.com/watch?v=UP5EIJsuYj8,[My name is Joshua Hunt and I'm a contributor ...
14,The Sunday Read: ‘Can Humans Endure the Psycho...,2024,3,10,Sunday,https://www.youtube.com/watch?v=ojKhU68-L2Y,[Imagine volunteering to live on Mars for 378 ...
19,"An F.B.I. Informant, a Bombshell Claim, and an...",2024,3,4,Monday,https://www.youtube.com/watch?v=KFh7ORWhOP4,"[Let's see if we can do this., Do you want to ..."
20,The Sunday Read: ‘How Tom Sandoval Became the ...,2024,3,3,Sunday,https://www.youtube.com/watch?v=H2CqBrRAXms,"[Even if you don't watch reality TV, you've pr..."
