## Method 1:

Extracting and labelling transcripts from YouTube

In [1]:
from googleapiclient.discovery import build

# NOTE(review): absolute, machine-specific path; consider making it configurable.
key_file_path = '/Users/samsonbakos/keys/YoutubeData/key.txt'

# Read the YouTube Data API key from the local key file.
with open(key_file_path, 'r') as file:
    api_key = file.read().strip()

playlist_id = 'PLwn6OmTp6iV2bmo-bjqymSWIxZVHm-p_i'

# Defined up front so later cells can still iterate over `videos`
# (as an empty list) when the key is missing.
videos = []

# file.read().strip() always yields a str (never None), so an unusable key
# shows up as an empty string; test for emptiness instead of `is None`.
if not api_key:
    print(f"API key not found. Please put your YouTube Data API key in {key_file_path}.")
else:
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Walk the playlist page by page (50 items per request) until the
    # response no longer carries a nextPageToken.
    nextPageToken = None
    while True:
        pl_request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
            pageToken=nextPageToken
        )

        pl_response = pl_request.execute()

        # Collect the video ID of every item on this page.
        videos.extend(item['contentDetails']['videoId'] for item in pl_response['items'])

        nextPageToken = pl_response.get('nextPageToken')

        if not nextPageToken:
            break

    print(f"Total videos in playlist: {len(videos)}")


Total videos in playlist: 108


In [2]:
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch the transcript of every playlist video on a best-effort basis.
# `transcripts` holds (video_id, transcript) pairs for the videos that worked;
# `failed_video_ids` holds (video_id, exception class name) pairs for the rest,
# so failures are visible instead of being silently discarded.
transcripts = []
failed_video_ids = []
for video_id in videos:
    try:
        # Fetching the transcript for each video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Keep going: one failing video must not stop the rest of the playlist.
        failed_video_ids.append((video_id, type(e).__name__))
        continue
    transcripts.append((video_id, transcript))

print(
    f"Fetched {len(transcripts)} of {len(videos)} transcripts; "
    f"{len(failed_video_ids)} failed or unavailable."
)

In [3]:
# Check the container type produced by the transcript-fetching loop.
type(transcripts)

list

In [4]:
# Each entry was appended as a (video_id, transcript) pair.
type(transcripts[0])

tuple

In [5]:
# First element of the pair: the video ID.
transcripts[0][0]

'Q3vzQXDIRgE'

In [6]:
# Second element of the pair: the transcript returned by get_transcript.
type(transcripts[0][1])

list

In [7]:
# Peek at a single transcript snippet to see which fields it carries.
transcripts[0][1][0]

{'text': "what's the last thing you remember so",
 'start': 0.06,
 'duration': 4.77}

## Testing on a Single Video

In [43]:
# video_id, snippets = transcripts[1]  

# prompt_text = """
# Below are snippets from a conversation involving multiple people, including Snoop Dogg. 
# Snoop Dogg has a distinctive style, often marked by his unique slang and references. 
# Your task is to read each snippet and decide if it sounds like something Snoop Dogg would say. 
# Label snippets you believe are spoken by Snoop Dogg with 'Snoop Dogg:' and all others with 'Other:'. 
# Remember, not all snippets may be from Snoop Dogg, 
# Example:
# Snippet: "Hey, what's up?"
# Label: Other: "Hey, what's up?"
# Snippet: "Fo shizzle, just chillin'."
# Label: Snoop Dogg: "Fo shizzle, just chillin'."

# Remember, this is a conversation, so about half of the labels should be Snoop Dogg. Snoop Dogg is definitely one of the speakers. 
# Even if you're not certain the speaker is Snoop Dogg based on the language, do your best to label the transcript as just two different speakers having a conversation.
# Do NOT label every snippet as other, or every snippet as snoop dogg. Half of the labels should be Snoop Dogg, and Half Other
# Now, label the following snippets:


# """

# for index, snippet in enumerate(snippets, start=1):
#     prompt_text += f"{index}. \"{snippet['text']}\"\n"

# prompt_text += "\nPlease identify and label the snippets spoken by Snoop Dogg or by the Other speaker."


In [None]:
# from openai import OpenAI

# api_key_path = '/Users/samsonbakos/keys/OpenAI/key.txt'

# with open(api_key_path, 'r') as file:
#     api_key = file.read().strip()

# client = OpenAI(api_key=api_key)


# response = client.chat.completions.create(
#     model="gpt-3.5-turbo",
#     messages=[
#         {"role": "system", "content": prompt_text}
#     ]
# )

# try:
#     for message in response.choices:
#         print(message.message.content)
# except AttributeError:
#     print("There was an error processing the response. Please check the response format.")

## Looping Over all Videos

In [45]:
from openai import OpenAI

# NOTE(review): absolute, machine-specific path; consider making it configurable.
api_key_path = '/Users/samsonbakos/keys/OpenAI/key.txt'
# NOTE(review): this rebinds `api_key`, which the playlist cell used for the
# YouTube key; the name is kept so existing cells behave as before.
with open(api_key_path, 'r') as file:
    api_key = file.read().strip()

client = OpenAI(api_key=api_key)

# Loop-invariant instruction text, built once instead of on every iteration.
# The leading indentation inside the literal is part of the prompt and is kept
# as-is so the request content is unchanged.
prompt_header = """
    Below are snippets from a conversation involving multiple people, including Snoop Dogg. 
    Snoop Dogg has a distinctive style, often marked by his unique slang and references. 
    Your task is to read each snippet and decide if it sounds like something Snoop Dogg would say. 
    Label snippets you believe are spoken by Snoop Dogg with 'Snoop Dogg:' and all others with 'Other:'. 
    Remember, not all snippets may be from Snoop Dogg.
    
    Remember, this is a conversation, so about half of the labels should be Snoop Dogg. Snoop Dogg is definitely one of the speakers. 
    Even if you're not certain the speaker is Snoop Dogg based on the language, do your best to label the transcript as just two different speakers having a conversation.
    Do NOT label every snippet as other, or every snippet as Snoop Dogg. Half of the labels should be Snoop Dogg, and Half Other.
    
    Now, label the following snippets:
    """
prompt_footer = "\nPlease identify and label the snippets spoken by Snoop Dogg or by the Other speaker."

# One (video_id, labelled_text_or_None) entry per transcript, in input order.
processed_videos = []

for video_id, snippets in transcripts:
    # Number the snippets 1..N, one quoted snippet per line.
    numbered_snippets = "".join(
        f"{index}. \"{snippet['text']}\"\n"
        for index, snippet in enumerate(snippets, start=1)
    )
    prompt_text = prompt_header + numbered_snippets + prompt_footer

    # The request itself is inside the try block: a single failed call
    # must not abort the loop and discard the results collected so far.
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": prompt_text}
            ]
        )
        response_text = response.choices[0].message.content
    except Exception as error:
        print(f"There was an error processing the response for video {video_id}: {error!r}")
        # Keep a placeholder so every transcript has a matching entry.
        processed_videos.append((video_id, None))
        continue

    processed_videos.append((video_id, response_text))


In [None]:
# Count the (video_id, response) entries produced by the labelling loop.
len(processed_videos)

In [49]:
import json

# Turn each (video_id, labelled text) pair into a dict record with
# explicit field names.
processed_data = [
    {"video_id": vid, "labeled_snippets": labels}
    for vid, labels in processed_videos
]


('VLiQm9Z6wFw',
 'Other: "I heard you know how to play the guitar"\nOther: "well I knew how to play the guitar new"\nOther: "and no what\'s the difference well I when"\nOther: "I was young I did it no it\'s been years"\nOther: "and years and years but I I learned how"\nOther: "to play again for the show go grab one"\nOther: "of my guitars out of my studio no I\'ll"\nSnoop Dogg: "kill the first person to move"\nOther: "[Music]"\nOther: "[Music]"\nOther: "ladies and gentlemen boys and girls you"\nSnoop Dogg: "inside the ggn news network I\'m your"\nSnoop Dogg: "host with the most Finding Nemo AKA Nemo"\nOther: "hoes and today oh my God the"\nOther: "incomparable the Beautiful the awesome"\nOther: "the scary"\nOther: "yes yes have a look it\'s Kathy Bates"\nOther: "Kathy Bates what\'s happening I\'m so"\nOther: "happy to be here with you Snoopy no it\'s"\nOther: "my pleasure I mean I\'m such a fan I\'ve"\nOther: "been a fan of yours since the first time"\nSnoop Dogg: "I seen you on screen

Overall, the data quality is very poor. This approach probably won't work.