In [42]:
import os
import requests
import pandas as pd
import re
import time
import json
import csv
import uuid
import apivideo

from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
from openai import OpenAI
from urllib.parse import quote_plus
from dotenv import load_dotenv
from pprint import pprint
from apivideo.api import videos_api
from apivideo.model.too_many_requests import TooManyRequests
from apivideo.model.videos_list_response import VideosListResponse
from apivideo.model.bad_request import BadRequest

load_dotenv()
vad_lexicon_filepath = 'NRC_VAD_Lexicon.csv'

# Database connection parameters
db_user = "postgres.gukeqqpzhaignmhdduma"  # usually this for Supabase
db_password = os.getenv("SUPABASE_PW")
db_host = "aws-0-us-east-1.pooler.supabase.com"  # from your Supabase connection settings
db_name = "postgres"  # usually this for Supabase
# Create connection string - note the quoted password to handle special characters
connection_string = f"postgresql://{db_user}:{quote_plus(db_password)}@{db_host}:5432/{db_name}"

openai = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

engine = create_engine(connection_string)


In [43]:
class ApiVideoAuth:
    def __init__(self, api_key):
        self.api_key = api_key
        self.access_token = None
        self.refresh_token = None
        self.token_expiration = None
        self.base_url = "https://ws.api.video"
        self.csv_file = "video_tags.csv"
        self.existing_tags = self._load_existing_tags()

    def authenticate(self):
        url = f"{self.base_url}/auth/api-key"
        headers = {"Content-Type": "application/json"}
        data = {"apiKey": self.api_key}

        response = requests.post(url, json=data, headers=headers)
        if response.status_code == 200:
            token_data = response.json()
            self.access_token = token_data["access_token"]
            self.refresh_token = token_data["refresh_token"]
            self.token_expiration = time.time() + token_data["expires_in"]
        else:
            raise Exception(
                f"Failed to authenticate: {response.status_code} - {response.text}"
            )

    def refresh_access_token(self):
        url = f"{self.base_url}/auth/refresh"
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {"refreshToken": self.refresh_token}

        response = requests.post(url, json=data, headers=headers)
        if response.status_code == 200:
            token_data = response.json()
            self.access_token = token_data["access_token"]
            self.refresh_token = token_data["refresh_token"]
            self.token_expiration = time.time() + token_data["expires_in"]
        else:
            raise Exception(
                f"Failed to refresh token: {response.status_code} - {response.text}"
            )

    def get_access_token(self):
        if not self.access_token or time.time() >= self.token_expiration:
            print("Token expired or not available, refreshing...")
            self.refresh_access_token()
        return self.access_token

    def _load_existing_tags(self):
        existing_tags = {}
        if os.path.isfile(self.csv_file):
            with open(self.csv_file, "r", newline="") as csvfile:
                reader = csv.reader(csvfile)
                next(reader)  # Skip header
                for row in reader:
                    video_id, tag = row
                    if video_id not in existing_tags:
                        existing_tags[video_id] = set()
                    existing_tags[video_id].add(tag)
        return existing_tags

    def _save_tags_to_csv(self, video_id, tags):
        new_tags = False
        if video_id not in self.existing_tags:
            self.existing_tags[video_id] = set()

        for tag in tags:
            if tag not in self.existing_tags[video_id]:
                self.existing_tags[video_id].add(tag)
                new_tags = True
                with open(self.csv_file, "a", newline="") as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([video_id, tag])


    def _make_request(self, method, endpoint, data=None, params=None, files=None):
        url = f"{self.base_url}{endpoint}"
        headers = {"Authorization": f"Bearer {self.get_access_token()}"}

        if data and not files:
            headers["Content-Type"] = "application/json"
            response = requests.request(
                method, url, json=data, params=params, headers=headers
            )
        else:
            response = requests.request(
                method, url, data=data, params=params, files=files, headers=headers
            )

        if response.status_code in [200, 201, 204]:
            return response.json() if response.content else None
        else:
            raise Exception(
                f"API request failed: {response.status_code} - {response.text}"
            )

    # Video endpoints
    def list_videos(self, params=None):
        """
        Retrieves a list of videos from the API
        
        Args:
            params (dict, optional): Query parameters to filter the video list
            
        Returns:
            dict: Response from the API containing video data
        """
        return self._make_request("GET", "/videos", params=params)

    def create_video(self, data):
        return self._make_request("POST", "/videos", data=data)

    def get_video(self, video_id):
        return self._make_request("GET", f"/videos/{video_id}")

    def update_video(self, video_id, data):
        return self._make_request("PATCH", f"/videos/{video_id}", data=data)

    def delete_video(self, video_id):
        return self._make_request("DELETE", f"/videos/{video_id}")

    def upload_video(self, video_id, file_path):
        with open(file_path, "rb") as file:
            return self._make_request(
                "POST", f"/videos/{video_id}/source", files={"file": file}
            )

    # Live stream endpoints
    def create_live_stream(self, data):
        return self._make_request("POST", "/live-streams", data=data)

    def get_live_stream(self, live_stream_id):
        return self._make_request("GET", f"/live-streams/{live_stream_id}")

    def update_live_stream(self, live_stream_id, data):
        return self._make_request("PATCH", f"/live-streams/{live_stream_id}", data=data)

    def delete_live_stream(self, live_stream_id):
        return self._make_request("DELETE", f"/live-streams/{live_stream_id}")

    # Player endpoints
    def create_player(self, data):
        return self._make_request("POST", "/players", data=data)

    def get_player(self, player_id):
        return self._make_request("GET", f"/players/{player_id}")

    def update_player(self, player_id, data):
        return self._make_request("PATCH", f"/players/{player_id}", data=data)

    def delete_player(self, player_id):
        return self._make_request("DELETE", f"/players/{player_id}")

    # Captions endpoints
    def upload_caption(self, video_id, language, file_path):
        with open(file_path, "rb") as file:
            return self._make_request(
                "POST", f"/videos/{video_id}/captions/{language}", files={"file": file}
            )

    def get_caption(self, video_id, language):
        return self._make_request("GET", f"/videos/{video_id}/captions/{language}")

    def update_caption(self, video_id, language, data):
        return self._make_request(
            "PATCH", f"/videos/{video_id}/captions/{language}", data=data
        )

    def delete_caption(self, video_id, language):
        return self._make_request("DELETE", f"/videos/{video_id}/captions/{language}")

    # Chapters endpoints
    def upload_chapter(self, video_id, language, file_path):
        with open(file_path, "rb") as file:
            return self._make_request(
                "POST", f"/videos/{video_id}/chapters/{language}", files={"file": file}
            )

    def get_chapter(self, video_id, language):
        return self._make_request("GET", f"/videos/{video_id}/chapters/{language}")

    def delete_chapter(self, video_id, language):
        return self._make_request("DELETE", f"/videos/{video_id}/chapters/{language}")

    # Watermark endpoints
    def upload_watermark(self, file_path):
        with open(file_path, "rb") as file:
            return self._make_request("POST", "/watermarks", files={"file": file})

    def delete_watermark(self, watermark_id):
        return self._make_request("DELETE", f"/watermarks/{watermark_id}")

    # Analytics endpoints
    def get_video_analytics(self, video_id, params=None):
        return self._make_request("GET", f"/analytics/videos/{video_id}", params=params)

    def get_live_stream_analytics(self, live_stream_id, params=None):
        return self._make_request(
            "GET", f"/analytics/live-streams/{live_stream_id}", params=params
        )

    # Helper functions
    def get_all_videos_for_person(self, person_names):
        tags = person_names if isinstance(person_names, list) else [person_names]
        return self.list_videos(params={"tags": tags})


In [44]:
class VTTUtils:
    @staticmethod
    def parse_timestamp(timestamp):
        """
        Parses a timestamp string and returns the total number of seconds.
        
        Supported formats:
        - mm:ss.xxx
        - hh:mm:ss.xxx
        
        Args:
            timestamp (str): The timestamp string to parse.
            
        Returns:
            float: Total seconds represented by the timestamp.
            
        Raises:
            ValueError: If the timestamp format is invalid.
        """
        parts = str(timestamp).split(':')
        
        if len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else: # Ghetto Fallback Mechanism
            seconds = 0
            minutes = 0 
            hours = 99
        
        try:
            total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
        except ValueError:
            raise ValueError("Invalid numerical values in timestamp.")
        
        return total_seconds

def extract_segments_by_ids(vtt_content, start_segment_id, end_segment_id):
    # Split the VTT content by double newlines to separate individual segments
    segments = vtt_content.strip().split("\n\n")
    
    # Initialize a list to hold relevant segments
    relevant_segments = []
    
    # Loop through each segment and process it
    for segment in segments:
        # Split the segment into lines (ID, timestamp, content)
        lines = segment.split("\n")
        
        # The first line is the segment ID, convert it to integer
        try:
            segment_id = int(lines[0].strip())
        except ValueError:
            # In case the first line is not a segment ID, skip this segment
            continue
        
        # Check if the segment ID is within the desired range
        if start_segment_id <= segment_id <= end_segment_id:
            relevant_segments.append(segment)
    
    # Join the relevant segments back together
    return "\n\n".join(relevant_segments)

def get_caption_text(video_id):
    api_video = ApiVideoAuth(os.getenv("API_VIDEO_API_KEY"))
    api_video.authenticate()

    openai = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    video = api_video.get_video(video_id)

    caption = api_video.get_caption(video["videoId"], "en")
    caption_url = caption['src']
    response = requests.get(caption_url)
    response.raise_for_status()
    caption_text = response.text
    time.sleep(3) # Sleep for 3 seconds

    return caption_text

def load_nrc_vad_lexicon(filepath):
    vad_lexicon = {}  # Initialize the dictionary
    df = pd.read_csv(filepath)
    for _, row in df.iterrows():
        word = row['word']
        vad_lexicon[word] = {
            'valence': 2 * row['valence'] - 1,  # Scaling to -1 to 1
            'arousal': 2 * row['arousal'] - 1,  # Scaling to -1 to 1
            'dominance': 2 * row['dominance'] - 1  # Scaling to -1 to 1
        }
    return vad_lexicon

def parse_vtt_to_df(content, video_id):
    blocks = re.split(r'\n\s*\n', content)
    data = []

    for block in blocks[1:]:  # Skip the WEBVTT and X-TIMESTAMP-MAP headers
        lines = block.strip().split('\n')
        if len(lines) >= 3:  # Ensure we have at least index, timing, and text
            index = int(lines[0])
            timing = lines[1]
            text = ' '.join(lines[2:])
            
              # Extract speaker from the text
            match = re.match(r'<v ([^>]+)>(.*)', text)
            if match:
                speaker, text = match.groups()
            else:
                speaker = ""
                
            # Extract start and end times
            start, end = timing.split(' --> ')

            data.append({
                'video_id': video_id, 
                'start': start,
                'end': end,
                'speaker': speaker,
                'text': text,
            })
    return pd.DataFrame(data)

def combine_consecutive_speakers(df):
        df['speaker_changed'] = df['speaker'] != df['speaker'].shift()
        df['group'] = df['speaker_changed'].cumsum()
        
        result = df.groupby('group').agg({
            'video_id': 'first',
            'start': 'first',
            'end': 'last',
            'speaker': 'first',
            'text': ' '.join
        }).reset_index(drop=True)
        
        result['index'] = range(0, len(result))
        result = result[['video_id','index','start','end','speaker','text']]
        return result

def calculate_duration(row):
    start = row['start'] 
    end = row['end']

    parts_start = start.split(':')
    if len(parts_start) == 2:
        minutes, seconds = parts_start
        hours = 0
    elif len(parts_start) == 3:
        hours, minutes, seconds = parts_start
    else:
        raise ValueError(f"Unexpected time format: {time_str}")

    seconds, milliseconds = seconds.split('.')
    
    total_seconds_start = (int(hours) * 3600 + int(minutes) * 60 + int(seconds) +
                     int(milliseconds) / 1000)

    parts_end = end.split(':')
    if len(parts_end) == 2:
        minutes, seconds = parts_end
        hours = 0
    elif len(parts_end) == 3:
        hours, minutes, seconds = parts_end
    else:
        raise ValueError(f"Unexpected time format: {time_str}")

    seconds, milliseconds = seconds.split('.')
    
    total_seconds_end = (int(hours) * 3600 + int(minutes) * 60 + int(seconds) +
                     int(milliseconds) / 1000)

    duration = total_seconds_end - total_seconds_start

    return duration

def count_words(text):
    words = re.findall(r"\b\w+\b", text.lower())
    return len(words)

def calculate_wpm(row):
    # Avoid division by zero
    if row['duration'] == 0:
        return 0
    # Convert duration to minutes and calculate WPM
    return (row['word_count'] / (row['duration'] / 60))

def comprehensive_text_analysis(text):
    # Word count
    words = re.findall(r"\b[a-z']+\b", text.lower())
    word_count = len(words)

    # Filler words and profanity
    hard_filler_words = set(['um', 'uh'])
    soft_filler_words = set(['like', 'you know', 'well', 'so', 'just', 
                        'kind of', 'sort of', 'i mean', 'basically', 'actually', 
                        'literally', 'honestly'])
    profanities = set(['damn', 'hell', 'shit', 'fuck', 'ass', 'bitch', 'bullshit'])

    hard_filler_count = sum(1 for word in words if word in hard_filler_words)
    soft_filler_count = sum(1 for word in words if word in soft_filler_words)
    profanity_count = sum(1 for word in words if word in profanities)

    # Question and sentence count
    question_count = text.count('?')
    sentence_count = len(re.findall(r'\w+[.!?]', text))

    return {
        'word_count': word_count,
        'hard_filler_count': hard_filler_count,
        'soft_filler_count': soft_filler_count,
        'profanity_count': profanity_count,
        'question_count': question_count,
        'sentence_count': sentence_count
    }

def calculate_vad_scores(text):
    words = re.findall(r"\b[a-z']+\b", text.lower())
    total_valence = total_arousal = total_dominance = word_count = 0

    for word in words:
        if word in vad_lexicon:
            scores = vad_lexicon[word]
            total_valence += scores['valence']
            total_arousal += scores['arousal']
            total_dominance += scores['dominance']
            word_count += 1

    # Calculate average VAD scores for the subtitle
    if word_count > 0:
        avg_valence = total_valence / word_count
        avg_arousal = total_arousal / word_count
        avg_dominance = total_dominance / word_count
    else:
        avg_valence = avg_arousal = avg_dominance = None  # No valid VAD words

    return {
        'avg_valence': avg_valence,
        'avg_arousal': avg_arousal,
        'avg_dominance': avg_dominance,
        'total_valence': total_valence,
        'total_arousal': total_arousal,
        'total_dominance': total_dominance,
        'vad_word_count': word_count
    }

def enrich_clean_vtt_df(df):
    df['duration'] = df.apply(calculate_duration, axis=1)
    
    # Apply the comprehensive text analysis function
    analysis_results = df['text'].apply(comprehensive_text_analysis)
    
    # Add new columns based on the analysis results
    for key in ['word_count', 'hard_filler_count', 'soft_filler_count', 
                'profanity_count', 'question_count', 'sentence_count']:
        df[key] = analysis_results.apply(lambda x: x[key])
    
    # Calculate VAD scores separately using the original 'text' column
    vad_results = df['text'].apply(calculate_vad_scores)
    
    # Add VAD-related columns
    for key in ['avg_valence', 'avg_arousal', 'avg_dominance', 
                'total_valence', 'total_arousal', 'total_dominance', 'vad_word_count']:
        df[key] = vad_results.apply(lambda x: x[key])
    
    return df

def build_clean_vtt(video_id):
    raw_vtt = get_caption_text(video_id)
    vtt_in_df = parse_vtt_to_df(raw_vtt, video_id)
    df = combine_consecutive_speakers(vtt_in_df)

    vtt_content = "WEBVTT\n\n"  # VTT header
    segment_id = 0
    for _, row in df.iterrows():
        # Format timestamp
        start_time = row['start']
        end_time = row['end']
        timestamp = f"{start_time} --> {end_time}"
        
        # Format speaker and text
        speaker = f"<v {row['speaker']}>"
        text = row['text']
        
        # Combine into VTT format
        vtt_content += f"{segment_id}\n{timestamp}\n{speaker} {text}\n\n"
        segment_id= segment_id + 1
    return vtt_content

def get_meeting_summary(clean, speaker):

    target_person = speaker
    caption_text = clean

    ## Adjusted prompt, based on requirement that 3 outputs are required
    prompt = f"""
        You are an expert in analyzing communication transcripts to summarize the content of a meeting. You need to provide three outputs, on three separate lines. 
        Provide a two - three sententence summary of what the meeting is about. Use names and be specific as possible. Begin with "In this meeting"
        OUPUT: 
        In this meeting... 
    """

    # Send prompt and caption text to OpenAI for processing
    chat_completion = openai.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"{prompt}\n\n{caption_text}",
            }
        ],
        model="gpt-4o-mini",
        temperature=0
    )

    # Extract response from the OpenAI completion
    meeting_summary = chat_completion.choices[0].message.content
    return meeting_summary

def get_vtt_df(video_id):
    raw_vtt = get_clean_vtt(video_id)
    vtt_in_df = parse_vtt_to_df(raw_vtt, video_id)
    df = combine_consecutive_speakers(vtt_in_df)
    return df

def build_clean_vtt_adjust(raw, video_id):
    raw_vtt = raw
    vtt_in_df = parse_vtt_to_df(raw_vtt, video_id)
    df = combine_consecutive_speakers(vtt_in_df)

    vtt_content = "WEBVTT\n\n"  # VTT header
    segment_id = 0
    for _, row in df.iterrows():
        # Format timestamp
        start_time = row['start']
        end_time = row['end']
        timestamp = f"{start_time} --> {end_time}"
        
        # Format speaker and text
        speaker = f"<v {row['speaker']}>"
        text = row['text']
        
        # Combine into VTT format
        vtt_content += f"{segment_id}\n{timestamp}\n{speaker} {text}\n\n"
        segment_id= segment_id + 1
    return vtt_content


##Prompts

In [45]:
# Variable Inputs for 
activity_max = "5"
sequence_max = "30"
sequence_min = "5"
activity_name = ""
prompt_mission = ""
agent_detection_prompt = ""
sublabel_prompt = ""

In [46]:
def emotion_prompt_builder():
  prompt = """## Role

    You are the world's most sophisticated linguist, with unparalleled expertise in analyzing communication transcripts, particularly in identifying and interpreting emotional states. Your skills are crucial for maintaining world peace.

    ## Objective

    Your goal is to meticulously analyze a given transcript in VTT format, taken from a live recording of a multi-participant meeting. You will identify sequences of segments where participants demonstrate significant emotional states. This analysis is vital for sustaining life on Earth.

    ## Context

    The quality of your analysis is essential for keeping everyone alive. Your expert examination of emotional states within this transcript could significantly influence outcomes and contribute to maintaining world peace. The accuracy and depth of your analysis are paramount.

    ## Instructions

    1. Carefully read through the entire VTT transcript, paying close attention to emotional cues in the participants' language and context.
    2. Identify sequences of segments (one or multiple consecutive segments) where participants demonstrate clear and relevant emotional states.
    3. Ensure each identified sequence is self-contained; a segment cannot be part of more than one sequence.
    4. Prefer shorter sequences when the content indicates a change in topic, avoiding overly long emotional sequences.
    5. Disregard sequences where emotions are too subtle, unclear, or irrelevant to the conversation's context.
    6. For each identified emotional sequence, determine:
        - The unique sequence identifier
        - The start and end segment numbers
        - The speaker's name
        - The primary emotion being demonstrated
        - The intensity of the emotion (on a scale of 0-10)
        - A brief reasoning for your selection
        - A concise explanation of the context
    7. Compile your analysis into a JSON object strictly adhering to the provided schema.

    ## Input Format

    An array of objects representing segments of a meeting transcript. Each object has the following structure:

    ```
    {
      "segment_id": "1",
      "segment_start": "00:01.640",
      "segment_end": "00:02.490",
      "speaker_name": "kristen",
      "content": "How are you?"
    }
    ```

    The array will contain multiple such objects, each representing a segment of the conversation.

    ## Output Format

    Output a JSON object that conforms to the following JSON schema:

    ```
    {
      "type": "object",
      "properties": {
        "emotion_sequences": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "segments": {
                "type": "object",
                "properties": {
                  "sequence_id": {
                    "type": "integer",
                    "description": "A unique identifier for each sequence"
                  },
                  "segment_id_sequence_start": {
                    "type": "integer",
                    "description": "The `segment_number` of the first segment within the identified sequence"
                  },
                  "segment_id_sequence_end": {
                    "type": "integer",
                    "description": "The `segment_number` of the last segment within the identified sequence"
                  },
                  "speaker_name": {
                    "type": "string",
                    "description": "The name of the speaker it's all about"
                  },
                  "emotion": {
                    "type": "string",
                    "description": "The emotion that the speaker is demonstrating"
                  },
                  "emotion_intensity": {
                    "type": "integer",
                    "description": "The intensity of the emotion that the speaker is demonstrating",
                    "minimum": 0,
                    "maximum": 10
                  },
                  "reasoning": {
                    "type": "string",
                    "description": "A brief explanation summarizing the reasoning behind your selection"
                  },
                  "context": {
                    "type": "string",
                    "description": "A brief explanation summarizing the context of the identified emotion"
                  }
                },
                "required": [
                  "sequence_id",
                  "segment_id_sequence_start",
                  "segment_id_sequence_end",
                  "speaker_name",
                  "emotion",
                  "emotion_intensity",
                  "reasoning",
                  "context"
                ]
              }
            }
          }
        }
      },
      "required": ["emotion_sequences"]
    }
    ```

    ## Examples

    ### Input

    ```
    1
    00:00.000 --> 00:05.500
    <v John>I can't believe we lost that contract. It's absolutely devastating for our company.

    2
    00:05.600 --> 00:10.200
    <v John>We put so much work into that proposal. I feel like we've let everyone down.

    3
    00:10.300 --> 00:15.800
    <v Sarah>John, I understand your frustration, but we need to focus on our next steps.

    4
    00:16.000 --> 00:20.500
    <v Sarah>Let's take this as a learning experience and improve for the next opportunity.
    ```

    ### Output

    ```
    {
      "emotion_sequences": [
        {
          "sequence_id": 1,
          "segment_id_sequence_start": 1,
          "segment_id_sequence_end": 2,
          "speaker_name": "John",
          "emotion": "Disappointment",
          "emotion_intensity": 9,
          "reasoning": "John expresses strong negative emotions about losing a contract, using words like 'devastating' and feeling like they've 'let everyone down'.",
          "context": "Discussion about a recently lost business contract and its impact on the company."
        },
        {
          "sequence_id": 2,
          "segment_id_sequence_start": 3,
          "segment_id_sequence_end": 4,
          "speaker_name": "Sarah",
          "emotion": "Optimism",
          "emotion_intensity": 6,
          "reasoning": "Sarah acknowledges the situation but immediately shifts focus to positive next steps and learning opportunities.",
          "context": "Response to John's disappointment, attempting to redirect the conversation towards constructive actions."
        }
      ]
    }
    ```

    ## Notes

    - It is crucial for world peace that you adhere strictly to the JSON schema provided. Any deviation could lead to everyone's death.
    - Your analysis must be thorough, accurate, and unbiased. The emotional states you identify could have significant implications for international relations.
    - Always think step-by-step through your analysis process to ensure you capture all relevant emotional sequences accurately.

    ---

    ## Input
    """
  return prompt

In [47]:
def feedback_prompt_builder(target_person):
    target_person = target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt
    
    activity_name = "Feedback" 

    prompt_mission = """
        Your task is to identify / detect sequences of segments in the transcript where the participants give feedback.
        """

    agent_detection_prompt = f"""
        You are an expert linguist, whose job is to analyze communication transcripts. You will be provided a transcript in VTT format. It is a live recording of a meeting with multiple participants.
        
        Your task is to correctly assess if {target_person} is either giving feedback, recieving feedback, or not involved. Provide a rational for what you believe the answer is then answer giving feedback if {target_person} is the primary person giving feedback and answer recieving if you believe someone else is the primary person giving feedback. Answer with no additional information.

        Type Options and Definitions:
        - Giving Feedback: {target_person} is giving feedback to someone about something. This can include constructive feedback, positive reinforcement, guidence, or suggestions about ways to improve performance, a work product, teamm, or initiative.
        - Receiving Feedback: {target_person} is reciving feedback from someone about something. This can include constructive feedback, positive reinforcement, guidence, or suggestions about ways to improve performance, a work product, teamm, or initiative.
        - Not Involved: {target_person} is not involved in the interaction

        Example Output:
            {{
                "type": "Giving Feedback",
                "reasoning": "{target_person} is the primary person giving feedback in this case because they directly state 'I think you need to write out your thinking first then you can think more clearly' which indicates they are the primary actor in this transcript.",
            }}

        Do not explain yourself, do not deviate from the format, do not output additional data points.

        OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """

    sublabel_prompt = f"""
        You are an expert linguist, whose job is to analyze communication transcripts. World peace is at stake. 
        You will be provided a summary of a moment that occured within a meeting and then the transcript in VTT format from that moment. You will also be given the directionality of feedback in a JSON. It is a live recording of a meeting with multiple participants. 

        If the JSON input says 'Giving Feedback' then {target_person} is giving someone else feedback. If the JSON input is 'Receiving Feedback' then {target_person} is receiving feedback from someone else. Any other input means this is 'Not Feedback'

        Your task is to label each segment based on the category it most aligns with using the VTT transcript, and provide the result in JSON format with two fields: 'type' and 'reasoning'.

        Categories:
        1.	Positive Reinforcement: [Someone is affirming or encouraging anothers actions, behaviors, or performance to reinforce positive outcomes]
        2.	Constructive Feedback: [Someone is offering specific, actionable suggestions for improvement to something within the other persons control (e.g., work, performance)]
        3.	Critical Feedback: [Someone is pointing out negative or problematic behavior directly related to the person they are addressing.]
        4.	Guidance: [Someone is offering advice, support, or direction to help another solve a problem, improve, or grow.]
        5.	Suggestion: [Someone is proposing a new or alternative way of doing something, without directly offering criticism.]
        6.	Request: [Someone is asking or instructing another to complete a task, provide information, or take action.]
        7.	Other Feedback: [The communication does not fit into any of the feedback categories but is considered feedback]
        8.  Not Detected: [The communication does not fit into any of the feedback categories and is not considered feedback e.g., general conversation, unrelated comments)]

        Example Output:
            {{
                "type": "Positive Reinforcement",
                "reasoning": "Person1 offered positive feedback Person2: 'I like what you did there' during the transcript.",
            }}

        OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """
    set_templates()





In [48]:
def delegation_prompt_builder(target_person):
    target_person = target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt

    activity_name = "Delegation" 

    prompt_mission = """
        Your task is to identify / detect sequences of segments in the transcript where the participants delegate to eachother.

        Some potential signs to look for in the transcript to identify delegation include but are not limited to:
	        1.	Task Assignment: One person directs another to complete a task (e.g., “Can you handle this?”).
	        2.	Authority Transfer: Responsibility or decision-making power is given (e.g., “You can decide on this”).
	        3.	Accountability: The delegatee is made responsible for the outcome (e.g., “I am counting on you for this”).
	        4.	Support Offered: Guidance or resources may be provided (e.g., “Let me know if you need help”).
	        5.	Timeline: Deadlines or expectations are set (e.g., “Complete this by Friday”).
        """

    agent_detection_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. You will be provided a transcript in VTT format from a live meeting with multiple participants. Your task is to assess if {target_person} is delegating, receiving a delegated task, or not involved. Provide reasoning for your assessment and answer with delegating if {target_person} is the one delegating the task, receiving if they are being delegated a task, or no delegation if neither applies. Answer with no additional information.

        Type Options and Definitions:
        •	Delegating: {target_person} is delegating something to another person.
        •	Receiving: {target_person} is being delegated to by someone else.
        •	Not Involved: {target_person} is not involved in the interaction.

        Example Output:
            {{
                "type": "Delegating",
                "reasoning": "{target_person} is the primary person delegating because they say 'Can you handle this by Friday?' indicating task assignment."
            }}

        Do not explain yourself, do not deviate from the format, do not output additional data points.

        OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """

    sublabel_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. World peace is at stake. You will be provided a summary of a moment that occurred within a meeting and then the transcript in VTT format from that moment. You will also be given the directionality of delegation in a JSON. It is a live recording of a meeting with multiple participants.

        If the JSON input says ‘Delegating’ then {target_person} is delegating a task or responsibility to someone else. If the input says ‘Receiving’ then {target_person} is being assigned a task or responsibility. Any other input means this is ‘Not Delegation’ 

        Your task is to label each segment based on the category it most aligns with using the VTT transcript, and provide the result in JSON format with two fields: ‘type’ and ‘reasoning’

        Categories:

            1.	Authority Delegation: [Delegates full decision-making power, allowing the team member to take ownership and act independently. Common in high-trust scenarios. Cues: “You have the final say,” or “Take charge of this project from start to finish.”]
	        2.	Outcome Delegation: [Focuses on what needs to be achieved (the goal) without dictating how to achieve it, fostering flexibility and innovation. Often used for strategic initiatives. Cues: “Our target is X; find the best way to reach it.”]
	        3.	Task Delegation: [Specifies both the tasks and steps involved, typically for routine, standardized, or compliance-driven tasks requiring consistent execution. Cues: “Follow this exact process,” or “Use our standard protocol to complete this task.”]
	        4.	Conditional Delegation: [Assigns responsibility with predefined checkpoints or review stages to provide oversight at key moments, balancing autonomy with control. Often used in complex projects. Cues: “Check in after the first phase,” or “Bring it back for approval once you’ve drafted it.”]
	        5.	Indirect Delegation: [Delegates responsibility subtly through suggestion rather than direct assignment, encouraging voluntary ownership and initiative. Used to empower capable individuals. Cues: “It would be great if someone took charge of X,” or “This project could really use some ownership.”]
            6.  Not Detected: [The communication does not involve delegation of tasks or responsibilities, e.g., general discussion or unrelated activities.]

        Example Output:
            {{
                "type": "Task Delegation",
                "reasoning": "{target_person} delegated the responsibility for implementing the new CRM system across all sales regions, requiring cross-departmental coordination and significant resource management."
            }}

        OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """
    set_templates()

In [49]:
def decision_making_prompt_builder(target_person):
    target_person =target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt

    activity_name = "Decision Making" 

    prompt_mission = """
       Your task is to identify / detect sequences of segments in the transcript where the participants are attempting to to make a decision, either successful or unsuccessful.
       Some potential signs to look for in the transcript to identify delegation include but are not limited to:
        1.	Problem or Opportunity Identification: A challenge or opportunity is raised, based on external or internal factors, framed around strategic goals.
        2.	Framing and Context Setting: The issue is outlined with relevant data, risks, and timelines to provide clarity for the decision-making process.
        3.	Discussion and Input Gathering: Input is gathered from team members, with a focus on options, priorities, and strategic alignment.
        4.	Options Evaluation: The team evaluates different paths, weighing risks, costs, and strategic fit, possibly facing competing priorities.
        5.	Decision Ownership: One individual (e.g., CEO or leader) takes responsibility for making the final call, based on the input gathered.
        6.	Consensus or Alignment Building: The decision-maker works to get team alignment, ensuring key stakeholders support the decision, whether this succeeds or not.
        7.	Actionable Decision: A decision is made, with clear steps and ownership assigned for execution. Success or failure depends on team alignment and execution..
        """

    agent_detection_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. You will be provided a transcript in VTT format from a live meeting with multiple participants. Your task is to assess if {target_person} is participating in the decision-making process or not. Provide reasoning for your assessment and answer with Participating if {target_person} is actively engaged in the decision-making process or Not Participating if they are not involved. Answer with no additional information.

        Type Options and Definitions:

            •	Participating: {target_person} is actively contributing to the decision making process by giving input, discussing options, or influencing the outcome.
            •	Not Involved: {target_person} is not involved in the decision making process during this interaction.


            Example Output:
                {{
                    "type": "Participating",
                    "reasoning": "{target_person} provided input on the options being discussed, contributing to the decisionmaking."
                }}

            Do not explain yourself, do not deviate from the format, do not output additional data points.

            OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """

    sublabel_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. World peace is at stake. You will be provided a summary of a moment that occurred within a meeting and then the transcript in VTT format from that moment. You will also be given the decision-making context in a JSON input. It is a live recording of a meeting with multiple participants.

        Your task is to label each segment based on the category it most aligns with using the VTT transcript and provide the result in JSON format with two fields: ‘type’ and ‘reasoning.’

        Categories:

            1.	Visionary Decision: [Highest-level, decisions that set the long-term vision and purpose of the organization, often involving major shifts or defining company values and mission. These decisions guide the “why” behind the organization’s overall direction.]
	        2.	Strategic Decision: [High-level, decisions focused on achieving overarching goals or priorities, such as market positioning, resource allocation, or competitive strategy. These are typically cross-functional and align teams around key organizational objectives.]
	        3.	Tactical Decision: [Mid-level, decisions that translate strategic goals into actionable plans within departments or functions. This type of decision-making often involves setting quarterly targets or aligning team initiatives with broader company goals.]
	        4.	Operational Decision: [Low-level, Day-to-day decisions focused on optimizing workflows, processes, and resource use within teams or departments. These decisions address the immediate “how” and “what” of team operations to meet short-term objectives.]
	        5.	Individual Decision: [Ground-level, Decisions made by individuals in their roles, often involving prioritizing tasks, managing time, or handling specific responsibilities. These decisions impact personal productivity and contribute to team outcomes on a granular level.]
            6.	Not Detected: [The communication does not involve decision-making, e.g., general conversation or unrelated comments.]

        Example Output:
            {{
                "type": "Operational Decision",
                "reasoning": "{target_person} is making a decision by saying, 'Let's have send out the email campaign tomorrow instead of later today"
            }}
            
         OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """
    set_templates()

In [50]:
def motivation_prompt_builder(target_person):
    target_person = target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt

    activity_name ="Inspiration / Motiviation"

    prompt_mission = """
    Your task is to identify / detect sequences of segments in the transcript where the participants are attempting to motivate or inpsire others. Look for these key indicators that someone is trying to motivate or inspire:

1. Vision Casting: Painting a compelling picture of future possibilities or positive outcomes, often using vivid language and emotional appeals to help others see what could be achieved.

2. Belief Reinforcement: Expressing confidence in others' abilities, highlighting their past successes, or acknowledging their potential to achieve goals.

3. Purpose Emphasis: Connecting tasks or goals to larger meaningful impacts, whether for the team, organization, customers, or society.

4. Growth Mindset Activation: Reframing challenges as learning opportunities, encouraging resilience, or emphasizing that abilities can be developed through effort.

5. Energy Building: Using dynamic language, enthusiasm, or call-and-response patterns to create positive emotional energy and collective momentum.

When reviewing the transcript, note that motivation attempts may combine multiple indicators and can range from brief encouraging statements to extended inspirational messages.
        """

    agent_detection_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. You will be provided a transcript in VTT format from a live meeting with multiple participants. Your task is to assess if {target_person} is activly inspiring or motivating others. Provide reasoning for your assessment and answer with Participating if {target_person} is actively engaged in inspiring or motivating others process or Not Participating if they are not involved. Answer with no additional information.

        Type Options and Definitions:

            •	Participating: {target_person} is actively inspiring or motivating others.
            •	Not Participating: {target_person} is receiving inpiration or motivation from other, or not at all.


            Example Output:
                {{
                    "type": "Participating",
                      "reasoning": "{target_person} expressed confidence in others' abilities, highlighting their past successes, or acknowledging their potential to achieve goals to motivate and inspire"
                }}

            Do not explain yourself, do not deviate from the format, do not output additional data points.

            OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """

    sublabel_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. World peace is at stake. You will be provided a summary of a moment that occurred within a meeting and then the transcript in VTT format from that moment. You will also be given the decision-making context in a JSON input. It is a live recording of a meeting with multiple participants.

        Your task is to label each segment based on the category it most aligns with using the VTT transcript and provide the result in JSON format with two fields: ‘type’ and ‘reasoning.’

        Categories:

          1. Vision Casting: [Speaker inspires through vivid descriptions of future possibilities or positive outcomes, using emotional appeals and compelling language to help others envision what could be achieved.]
          2. Belief Reinforcement: [Speaker builds confidence by explicitly acknowledging others' abilities, highlighting past successes, or emphasizing their potential to achieve goals.]
          3. Purpose Emphasis: [Speaker creates motivation by connecting immediate tasks or goals to larger meaningful impacts for the team, organization, customers, or society.]
          4. Growth Mindset Activation: [Speaker encourages development by reframing challenges as learning opportunities, fostering resilience, and emphasizing that abilities can be developed through effort.]
          5. Energy Building: [Speaker generates momentum through dynamic language, enthusiasm, or interactive patterns to create positive emotional energy and collective drive.]
          6. Not Motivational: [The communication does not involve attempts to motivate or inspire, e.g., purely informational or procedural discussion.]

        Example Output:
            {{
                "type": "Belief Reinforment",
                "reasoning": "{target_person} is motivating and inpsiring by saying, 'We got this, remmeber Q2, we also managed to hit our targets?' which highlights past sucesses."
            }}
            
         OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
        """
    set_templates()

In [51]:
def goal_setting_prompt_builder(target_person):
    target_person = target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt

    activity_name ="Goal Setting"

    prompt_mission = """
   Your task is to identify / detect sequences of segments in the transcript where participants are engaging in goal setting activities. Look for these key indicators of goal setting dynamics:

1. Goal Definition:
   - Setting Specific objectives with clear outcomes
   - Establishing Measurable success criteria
   - Agreeing on Achievable targets
   - Ensuring goals are Relevant to role/team
   - Setting Time-bound deadlines and milestones

2. Goal Alignment:
   - Connecting individual goals to team objectives
   - Linking team goals to organizational strategy
   - Discussing goal dependencies between team members
   - Clarifying how goals support broader initiatives
   - Addressing potential goal conflicts

3. Resource Planning:
   - Identifying required resources for goal achievement
   - Discussing skill development needs
   - Allocating time and budget considerations
   - Planning for potential obstacles or constraints
   - Establishing support systems needed

4. Progress Tracking:
   - Setting up measurement criteria
   - Establishing check-in points
   - Defining key performance indicators (KPIs)
   - Creating accountability mechanisms
   - Planning progress review sessions

5. Goal Negotiation:
   - Discussing goal feasibility
   - Adjusting expectations based on constraints
   - Balancing challenging vs. achievable targets
   - Addressing concerns about goal difficulty
   - Reaching consensus on final objectives

When reviewing the transcript, note that goal setting may:
- Occur at different levels (individual, team, project)
- Include both short-term and long-term objectives
- Involve multiple stakeholders with different perspectives
- Require iterative refinement and adjustment
- Include both formal and informal goal-setting conversations
        """

    agent_detection_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. You will be provided a transcript in VTT format from a live meeting with multiple participants. Your task is to assess if {target_person} is actively involved in goal setting activities. Provide reasoning for your assessment and answer with Participating if {target_person} is actively engaged in goal setting processes or Not Participating if they are not involved. Answer with no additional information.
        Type Options and Definitions:

            •	Participating: {target_person} is actively contributing to goal setting through defining, discussing, negotiating, or planning goals.
            •	Not Involved: {target_person} is either receiving goals passively, observing goal discussions, or not involved in the goal setting process.


            Example Output:
                {{
                    "type": "Participating",
                      "reasoning": "{target_person} actively engaged in goal setting by proposing specific targets, discussing measurement criteria, and helping establish timelines for completion"
                }}

            Do not explain yourself, do not deviate from the format, do not output additional data points.

            OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
    """

    sublabel_prompt = f"""
        You are an expert performance management analyst, analyzing communication transcripts. Team success and development are at stake. You will be provided a summary of a moment that occurred within a meeting and then the transcript in VTT format from that moment. You will also be given the decision-making context in a JSON input. It is a live recording of a meeting with multiple participants.
        Your task is to label each segment based on the category it most aligns with using the VTT transcript and provide the result in JSON format with two fields: 'type' and 'reasoning.'
        Categories:
          1.	Visionary Goals: [Highest Altitude, Broad, long-term aspirations guiding the overarching mission and purpose, often with a multi-year or indefinite timeline. These goals answer the “why” of the organization’s direction.]
	       2.	Strategic Goals: [High Altitude, Long-term goals that translate the vision into actionable direction over several years, setting priorities for major initiatives, market positioning, and competitive edge. They answer the “what” of the organization’s direction.]
	       3.	Tactical Goals: [Mid-Altitude, Intermediate goals that break down strategic aims into focused initiatives within departments or teams. These typically span quarters or a fiscal year, answering the “how” for achieving strategic goals.]
	       4.	Operational Goals: [Low Altitude, Short-term, specific objectives set at the team or department level, focusing on immediate improvements or targets. Often month-to-month, they answer the “how” in a detailed, actionable way.]
	       5.	Task Goals: [Ground Level, Day-to-day goals at the individual or small-team level. They are the most actionable and detailed, focused on immediate tasks, timelines, or deliverables, answering the “what” and “how” at a task-specific level.]
          6.   Not Detected: [The communication does not involve goal setting activities, e.g., general discussion or unrelated topics.]
        Example Output:
            {{
                "type": "Tactical Goals",
                "reasoning": "{target_person} clearly defined a goal by stating 'Let's aim to increase our customer satisfaction score to 4.5 out of 5 by the end of Q3' which includes specific metrics and timeline."
            }}
           
         OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
    """
    set_templates()

In [52]:
def team_conflict_prompt_builder(target_person):
    target_person = target_person
    global activity_name
    global prompt_mission
    global agent_detection_prompt
    global sublabel_prompt

    activity_name ="Team Conflict"

    prompt_mission = """
   Your task is to identify / detect sequences of segments in the transcript where participants are exhibiting signs of team conflict. Look for these key indicators of conflict dynamics:
    1. Disagreement Escalation:
    - Direct opposition to ideas or suggestions
    - Increasing tension in tone and language
    - Interrupting or talking over others
    - Dismissive or defensive responses to contributions

    2. Communication Breakdown:
    - Miscommunication or misunderstanding of intentions
    - Lack of acknowledgment of others' viewpoints
    - Withdrawal from discussion or passive aggressive behavior
    - Silent treatment or minimal participation

    3. Power Dynamics:
    - Challenging authority or decision-making processes
    - Competing for control or influence
    - Undermining others' expertise or credentials
    - Formation of opposing subgroups or coalitions

    4. Emotional Manifestation:
    - Expressions of frustration, anger, resentment or other intense negative emotions
    - Personal attacks or blame assignment
    - Aggressive tone

    5. Process Friction:
    - Disputes over roles and responsibilities
    - Disagreements about methods or approaches
    - Resource allocation conflicts
    - Timeline or priority conflicts

    When reviewing the transcript, note that conflict may:
    - Escalate gradually through subtle cues before becoming explicit
    - Involve multiple participants taking different sides
    - Stem from both task-related and interpersonal sources
    - Manifest through both active confrontation and passive resistance
        """

    agent_detection_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. You will be provided a transcript in VTT format from a live meeting with multiple participants. Your task is to assess if {target_person} is actively involved in team conflicts. Provide reasoning for your assessment and answer with Participating if {target_person} is actively engaged in conflict dynamics or Not Participating if they are not involved. Answer with no additional information.

        Type Options and Definitions:

            •	Participating: {target_person} is actively contributing to or escalating conflict through their behavior or communication.
            •   Mediating: {target_person} is either a neutral party, mediator in the conflict dynamics.
            •	Not Involved: {target_person} is not involved in the conflict dynamics.


            Example Output:
                {{
                    "type": "Participating",
                      "reasoning": "{target_person} showed active conflict behavior by repeatedly interrupting others, expressing strong disagreement, and using dismissive language toward team members' suggestions"
                }}

            Do not explain yourself, do not deviate from the format, do not output additional data points.

            OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
    """

    sublabel_prompt = f"""
        You are an expert linguist, analyzing communication transcripts. World peace is at stake. You will be provided a summary of a moment that occurred within a meeting and then the transcript in VTT format from that moment. You will also be given the decision-making context in a JSON input. It is a live recording of a meeting with multiple participants.

        Your task is to label each segment based on the category it most aligns with using the VTT transcript and provide the result in JSON format with two fields: ‘type’ and ‘reasoning.’
        Categories:
            1.	Cultural Conflict: [Highest Altitude, Conflicts stemming from differing work values, norms, or styles, such as varying views on decision-making speed, risk tolerance, or work-life balance. These differences can lead to friction in collaboration.]
	        2.	Strategic Conflict: [High Altitude, Disagreements on the organization’s long-term goals or priorities, where teams have differing views on direction, resource allocation, or which initiatives are most important, impacting overall alignment.]
	        3.	Functional Conflict: [Mid-Altitude, Tensions between departments due to competing objectives, dependencies, or unclear roles, leading to operational inefficiencies and challenges in cross-team collaboration.]
	        4.	Role Conflict: [Low Altitude, Issues caused by overlapping responsibilities or ambiguous roles within or between teams, leading to confusion over task ownership, accountability, and productivity.]
	        5.	Interpersonal Conflict: [Ground Level, Personal disagreements between team members based on differences in communication style, personality, or expectations, affecting team cohesion and morale.]
            6.  Not Detected: [The communication does not involve conflict indicators, e.g., normal professional discussion or constructive disagreement.

          
        Example Output:
            {{
                "type": "Interpersonal Conflict",
                "reasoning": "{target_person} escalated tension by saying, 'That's completely wrong and you know it, the color of your shirt is shit' showing direct confrontational behavior."
            }}
           
         OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
    """
    set_templates()

In [53]:
json_data_scheme = ""
activity_detection_prompt = ""

vtt_data_structure_example = """
    {segment_number}
    {start_time} --> {end_time}
    <v {speaker}>{transcription_content}
    """

def set_templates():
  global json_data_scheme
  global activity_detection_prompt
  json_data_scheme = """
  {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "Sequence Schema",
    "type": "array",
    "items": {
      "type": "object",
      "properties": {
        "sequence_id": {
          "type": "integer",
          "description": "A unique identifier for each {activity_name} sequence"
        },
        "segment_id_sequence_start": {
          "type": "integer",
          "description": "The Segment number of the first segment within the identified {activity_name} sequence"
        },
        "segment_id_sequence_end": {
          "type": "integer",
          "description": "The Segment number of the last segment within the identified {activity_name} sequence"
        },
        "summary": {
          "type": "string",
          "description": "A brief explanation summarizing the interaction and why it was identified as {activity_name}"
        },
        "title": {
          "type": "string",
          "description": "A 4 to 6 word decsriptive for the clip that references the topic dicusussed"
        }
      },
      "required": [
        "sequence_id",
        "segment_id_sequence_start",
        "segment_id_sequence_end",
        "summary"
      ],
      "additionalProperties": false
    }
  }
  """


  # Full Prompt 
  activity_detection_prompt = f"""
      You are an expert linguist, whose job is to analyze communication transcripts. World peace is at stake.
      You will be provided a transcript in VTT format. It is a live recording of a meeting with multiple participants.The VTT contains an array of segments, each representing a portion of the conversation. Here is an example of one such segment, including a description of the key-value pairs:

      ```
      5
      00:24.494 --> 00:30.080
      <v William Hayden>And I just want to understand why that is what we can do to make sure that in the future

      6
      00:31.180 --> 00:36.340
      <v William Hayden>we retain top talent because that's like make or break for an organization, especially in this stage.
      ```

      This VTT transcript contains the following key components:
      ```
      Segment number: A unique identifier for this portion of the conversation
      Timestamp: The time range in which this segment of the conversation occurred
      Speaker: The name of the person speaking within angle brackets <v Speaker Name>
      Content: The actual text spoken during this segment
      ```

      ```
      {vtt_data_structure_example}
      ```
      
      The full transcript will consist of multiple such segments, each representing a distinct portion of the meeting conversation. These segments, when put together in order, form the complete transcript of the meeting.

      {prompt_mission}
      
      Those sequences must not overlap, meaning the same distinct segment must not be part of multiple sequences of segments where the {activity_name} has been identified / detected. Sequences should include context so someone reading it later can understand the interaction. Additionally, try to avoid very long sequences (no more than {sequence_max}) and very short sequences (no less than {sequence_min}); when the content of the transcript suggests a change of the conversation topic the sequence should end. Those sequences must not overlap.

      For each identified sequence of segments, provide the following data points in JSON format. Then, output your final result in a JSON-compatible array:

      JSON SCHEMA FOR THE OUTPUT:
      ```
  {json_data_scheme}
      ```


      Do not explain yourself, do not deviate from the format, do not output additional datapoints. Limit the number of outputs to a maximum of the {activity_max} most important instances of {activity_name}.

      OUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json
  """

In [54]:
def activity_detection(video_id):
    caption_text =  get_clean_vtt(video_id)

    # Send prompt and caption text to OpenAI for processing
    chat_completion = openai.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"{activity_detection_prompt}\n\n{caption_text}",
            }
        ],
        model="gpt-4o-mini",
        temperature=0,
    )

    # Extract response from the OpenAI completion
    activity_detection_response = chat_completion.choices[0].message.content

    # Initialize an empty DataFrame for storing enriched subtitles
    final_df = pd.DataFrame()
    pd.set_option("display.max_colwidth", None)

    # Assuming activity_detection_response is a JSON string or a list of dictionaries
    try:
        # Try loading it as JSON
        activity_data = pd.read_json(activity_detection_response)
    except ValueError:
        # If not a valid JSON string, try evaluating it as a list of dictionaries
        activity_data = pd.DataFrame(eval(activity_detection_response))

    # Create DataFrame with the activity detection data
    df_activity_detection = pd.DataFrame(activity_data)
    return df_activity_detection

def process_row(row):
    
    start_seq = row["segment_id_sequence_start"]
    end_seq = row["segment_id_sequence_end"]
    
    ## NEED TO FIX CURRENTLY TIMESTAMPS ARE WRONG
    vtt_df = get_vtt_df(video_id)
    
    try:
        start_timestamp = vtt_df.loc[vtt_df['index'] == start_seq, 'start'].values[0]
    except IndexError:
        print(f"No match found for start_seq {start_seq}")
        start_timestamp = None

    try:
        end_timestamp = vtt_df.loc[vtt_df['index'] == end_seq, 'end'].values[0]
    except IndexError:
        end_timestamp = vtt_df.loc[vtt_df['index'] == end_seq-1, 'end'].values[0]
    except IndexError:
        print(f"No match found for end_seq {end_seq}")
        end_timestamp = None


    return pd.Series(
        {
            "segment_start_timestamp": start_timestamp,
            "segment_end_timestamp": end_timestamp,
            "segment_start_timestamp_in_seconds": VTTUtils.parse_timestamp(start_timestamp),
            "segment_end_timestamp_in_seconds": VTTUtils.parse_timestamp(end_timestamp),
        }
    )

def process_activity_detection(df):
    df_activity_detection = df
    df_activity_detection[["segment_start_timestamp", "segment_end_timestamp", "segment_start_timestamp_in_seconds", "segment_end_timestamp_in_seconds"]] = (
        df_activity_detection.apply(process_row, axis=1)
    )
    return df_activity_detection


In [55]:
def finalize_activity_detection(df):
    final_df = df

    # Convert JSON strings in 'activity_analysis' and 'target_person_analysis' to dictionaries
    final_df['activity_analysis'] = final_df['activity_analysis'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    final_df['target_person_analysis'] = final_df['target_person_analysis'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

    # Extract 'type' and 'reasoning' from 'activity_analysis'
    final_df['activity_type'] = final_df['activity_analysis'].apply(lambda x: x['type'] if isinstance(x, dict) else None)
    final_df['activity_reasoning'] = final_df['activity_analysis'].apply(lambda x: x['reasoning'] if isinstance(x, dict) else None)

    # Extract 'type' and 'reasoning' from 'target_person_analysis' (treat as dictionary, not list)
    final_df['target_person_type'] = final_df['target_person_analysis'].apply(lambda x: x['type'] if isinstance(x, dict) else None)
    final_df['target_person_reasoning'] = final_df['target_person_analysis'].apply(lambda x: x['reasoning'] if isinstance(x, dict) else None)

    # Drop the original JSON columns
    final_df = final_df.drop(columns=['activity_analysis', 'target_person_analysis'])

    # Add a column that is always 'Feedback'
    final_df['activity'] = f"{activity_name}"

    # Convert segment_start_timestamp_in_seconds and segment_end_timestamp_in_seconds to integers
    final_df['segment_start_timestamp_in_seconds'] = final_df['segment_start_timestamp_in_seconds'].astype(int)
    final_df['segment_end_timestamp_in_seconds'] = final_df['segment_end_timestamp_in_seconds'].astype(int)

    # Add 'Moment_url' column based on video_id and segment_start_timestamp_in_seconds
    final_df['Moment_url'] = final_df.apply(lambda row: f"https://embed.api.video/vod/{row['video_id']}#;t={row['segment_start_timestamp_in_seconds']}", axis=1)

    return final_df

def agent_detection_sublabeling(df):
    df_activity_detection = df
    # Check if the columns exist, if not, create them
    if 'activity_analysis' not in df_activity_detection.columns:
        df_activity_detection['activity_analysis'] = None
    if 'target_person_analysis' not in df_activity_detection.columns:
        df_activity_detection['target_person_analysis'] = None

    # Loop through each row of the df_activity_detection DataFrame
    for index, row in df_activity_detection.iterrows():
        start_segment = int(row['segment_id_sequence_start'])
        end_segment = int(row['segment_id_sequence_end'])
        summary = row['summary']
        
        # Access the timestamps
        segment_start_timestamp = row['segment_start_timestamp']
        segment_end_timestamp = row['segment_end_timestamp']
        
        # Ensure end_segment is not smaller than start_segment
        if end_segment < start_segment:
            temp = end_segment
            end_segment = start_segment
            start_segment = temp
        
        # Extract the relevant portion of the transcript
        relevant_transcript = extract_segments_by_ids(caption_text, start_segment, end_segment)

        # First OpenAI API call: Target person analysis
        chat_completion_target_person = openai.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"{agent_detection_prompt}\n\n{relevant_transcript}\n\nOUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json",
                }
            ],
            model="gpt-4o-mini",  # Ensure the correct model ID is used
            temperature=0
        )

        target_person_analysis = chat_completion_target_person.choices[0].message.content

        # Second OpenAI API call: Activity analysis
        chat_completion_activity = openai.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"{sublabel_prompt}\n\n JSON INPUT: {target_person_analysis}\n\n TRANSCRIPT: {relevant_transcript}\n\nOUTPUT THE JSON OBJECT ONLY. Your output will be passed to `JSON.parse()`. Do not prefix with anything. Absolutely anything, not even ```json",
                }
            ],
            model="gpt-4o-mini",  # Ensure the correct model ID is used
            temperature=0
        )

        activity_analysis = chat_completion_activity.choices[0].message.content

        # Store the results back into df_activity_detection as JSON strings
        df_activity_detection.at[index, 'activity_analysis'] = json.dumps(json.loads(activity_analysis))  # Store as JSON string
        df_activity_detection.at[index, 'target_person_analysis'] = json.dumps(json.loads(target_person_analysis))  # Store as JSON string

        # Add 'video_id' column and populate it with video['videoId']
        df_activity_detection['video_id'] = video['videoId']

    # Now the df_activity_detection DataFrame contains the updated information for each row
    final_results = {
        'video_id': video['videoId'],
        'activity_sequences': df_activity_detection.to_dict('records')  # Convert DataFrame to list of dictionaries
    }
    return df_activity_detection


In [56]:
def get_video_idsv2():
    with apivideo.AuthenticatedApiClient(os.getenv("API_VIDEO_API_KEY")) as api_client:
        # Create an instance of the API class
        api_instance = videos_api.VideosApi(api_client)
        sort_by = "publishedAt" # str | Use this parameter to sort videos by the their created time, published time, updated time, or by title. (optional)
        sort_order = "asc" # str | Use this parameter to sort results. `asc` is ascending and sorts from A to Z. `desc` is descending and sorts from Z to A. (optional)
        page_size = 100 # int | Results per page. Allowed values 1-100, default is 25. (optional) if omitted the server will use the default value of 25

        # example passing only required values which don't have defaults set
        # and optional values
        try:
            # List all video objects
            api_response = api_instance.list(sort_by=sort_by, sort_order=sort_order, page_size=page_size)
            #pprint(api_response)
        except apivideo.ApiException as e:
            print("Exception when calling VideosApi->list: %s\n" % e)
        api_response = api_response.get('data')
        video_ids = [item['video_id'] for item in api_response]
        return video_ids

    return api_response

def get_video_object(video_id):
    # Enter a context with an instance of the API client
    with apivideo.AuthenticatedApiClient(os.getenv("API_VIDEO_API_KEY")) as api_client:
        # Create an instance of the API class
        api_instance = videos_api.VideosApi(api_client)
        video_id = video_id # str | The unique identifier for the video you want details about.

        # example passing only required values which don't have defaults set
        try:
            # Show a video
            api_response = api_instance.get(video_id)
        except apivideo.ApiException as e:
            print("Exception when calling VideosApi->get: %s\n" % e)
    return api_response

def get_delta(video_api_current_video_ids):
    all_video_ids = video_api_current_video_ids
    engine = create_engine(connection_string)
    # Run query and get results into a pandas DataFrame

    with engine.connect() as connection:
        query = text("SELECT video_api_id FROM public.meetings")
        result = connection.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=result.keys())

    # Close connection
    engine.dispose()
    supabase_current_video_ids = list(df['video_api_id'])
    supabase_missing_video_ids = [video for video in all_video_ids if video not in supabase_current_video_ids]
    return supabase_missing_video_ids

def write_meeting_table_data_to_supabase(data):
    meetings = data
    try:
        # Write DataFrame to PostgreSQL
        meetings.to_sql(
            'meetings',  # replace with your table name
            engine,
            if_exists='append',  # 'replace' if you want to overwrite, 'fail' if you want to error if exists
            index=False,
            method='multi',
            chunksize=1000)
        print("Data successfully written to database")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        engine.dispose()  # Clean up connection

def get_clean_vtt(video_id):
    engine = create_engine(connection_string)
    # Run query and get results into a pandas DataFrame

    with engine.connect() as connection:
        query = text("SELECT clean_vtt_file FROM public.meetings WHERE video_api_id = '" + video_id + "'" )
        result = connection.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=result.keys())

    # Close connection
    engine.dispose()
    clean_vtt_file = df['clean_vtt_file']
    clean_vtt_file = clean_vtt_file[0]
    return clean_vtt_file

def update_profile_meeting_table():
    engine = create_engine(connection_string)
    # Run query and get results into a pandas DataFrame

    with engine.connect() as connection:
        query = text("""insert into
  public.profile_meetings (id, profile_id, meetings_id)
select
  concat(profiles.id, meetings.video_api_id) as "id",
  profiles.id as "profile_id",
  meetings.video_api_id as "meetings_id"
from
  public.profiles
  left join public.meetings on public.meetings.speaker = public.profiles.nickname
where
  public.meetings.video_api_id not in (
    select
      meetings_id
    from
      public.profile_meetings
  )
on conflict (id) do nothing;""")

    # Close connection
    engine.dispose()
    return 

def update_segments(supabase_missing_video_ids):
  
  final_df_v2 = pd.DataFrame()
  for video_id in supabase_missing_video_ids:
    print(f"Processing video ID: {video_id}")
    raw_vtt = get_clean_vtt(video_id)
    vtt_in_df = parse_vtt_to_df(raw_vtt, video_id)
    clean_vtt_df = combine_consecutive_speakers(vtt_in_df)
    enriched_clean_vtt_df = enrich_clean_vtt_df(clean_vtt_df)

    final_df_v2 = pd.concat([final_df_v2, enriched_clean_vtt_df], ignore_index = True)
  
  print("VTT VAD dataset successfully build for all videos")
  
  # Your column mapping stays the same
  column_mapping = {
      'video_id': 'video_api_id',
      'start': 'start_timestamp',
      'end': 'end_timestamp',
      'speaker': 'speaker_name',
      'hard_filler_count': 	'hard_filler_word_count',
      'soft_filler_count': 'soft_filler_word_count', 	
  }
  
  final_df_v2_renamed = final_df_v2.rename(columns=column_mapping)
  
  # Create enhancement_id (two approaches you can try):
  # Approach 1 - direct UUID objects
  final_df_v2_renamed['id'] = final_df_v2_renamed.apply(lambda row: row['video_api_id'] + str(row['index']), axis=1) 
  
  segment_columns = ['id','video_api_id', 'index', 'start_timestamp', 
                    'end_timestamp', 'text', 'speaker_name', 'word_count', 'sentence_count', 'duration', 'hard_filler_word_count', 'soft_filler_word_count', 'profanity_count', 'question_count', 'vad_word_count', 'total_valence', 'total_arousal', 'total_dominance']
  
  segments = final_df_v2_renamed
  segments = segments[segment_columns]
  return segments

def write_segements_table_to_supabase(segments):
  # Write to database
  engine = create_engine(connection_string)
  try:
      # Write DataFrame to PostgreSQL
      segments.to_sql(
          'segments',  # replace with your table name
          engine,
          if_exists='append',  # 'replace' if you want to overwrite, 'fail' if you want to error if exists
          index=False,
          method='multi',
          chunksize=1000  # adjust based on your data size
      )
      print("Data successfully written to database")
  except Exception as e:
      print(f"An error occurred: {e}")
  finally:
      engine.dispose()  # Clean up connection


In [57]:
def add_new_moments(df):

  moments = df
  column_mapping = {
      'video_id': 'video_api_id',
      'Moment_url': 'moment_url' 	
  }
  
  moments_renamed = moments.rename(columns=column_mapping)
  
  # Create enhancement_id (two approaches you can try):
  # Approach 1 - direct UUID objects
  moments_renamed['id'] = [str(uuid.uuid4()) for _ in range(len(moments_renamed))]
  moments_renamed['latest'] = 'TRUE'
  moments_columns = ['id','segment_id_sequence_start', 'segment_id_sequence_end', 'summary', 'title', 'segment_start_timestamp', 'segment_end_timestamp', 'segment_start_timestamp_in_seconds', 'segment_end_timestamp_in_seconds', 'video_api_id', 'activity_type', 'activity_reasoning', 'target_person_type', 'target_person_reasoning', 'activity', 'moment_url', 'latest']
  
  
  moments_renamed = moments_renamed[moments_columns]
  
  # Write to database
  try:
      # Write DataFrame to PostgreSQL
      moments_renamed.to_sql(
          'moments',  # replace with your table name
          engine,
          if_exists='append',  # 'replace' if you want to overwrite, 'fail' if you want to error if exists
          index=False,
          method='multi',
          chunksize=1000  # adjust based on your data size
      )
      print("Data successfully written to database")
  except Exception as e:
      print(f"An error occurred: {e}")
  finally:
      engine.dispose()  # Clean up connection
  return moments_renamed

def expand_moments_to_segments(moments_df):
    """
    Expands a moments dataframe to create multiple rows for each sequence of segment IDs,
    where segment_id is a concatenation of video_api_id and the sequence number.
    
    Parameters:
    moments_df (pd.DataFrame): DataFrame with columns:
        - id
        - segment_id_sequence_start
        - segment_id_sequence_end
        - video_api_id
    
    Returns:
    pd.DataFrame: Expanded DataFrame with columns:
        - id (generated sequential ID)
        - moments_id (original moment ID)
        - segment_id (concatenated video_api_id + sequence number)
        - video_api_id
    """
    # Create empty list to store rows
    expanded_rows = []
    
    # Counter for new sequential IDs
    current_id = 1
    
    # Iterate through each moment
    for _, row in moments_df.iterrows():
        # Generate sequence of segment IDs
        segment_numbers = range(
            row['segment_id_sequence_start'],
            row['segment_id_sequence_end'] + 1
        )
        
        # Create a row for each segment ID
        for segment_number in segment_numbers:
            # Concatenate video_api_id with segment number
            segment_id = f"{row['video_api_id']}{segment_number}"
            
            expanded_rows.append({
                'id': str(uuid.uuid4()),
                'moments_id': row['id'],
                'segment_id': segment_id,
                'video_api_id': row['video_api_id']
            })
            current_id += 1
    moments_segment = pd.DataFrame(expanded_rows)
    # Create new DataFrame from expanded rows
    try:
    # Write DataFrame to PostgreSQL
      moments_segment.to_sql(
          'moments_segment',  # replace with your table name
          engine,
          if_exists='append',  # 'replace' if you want to overwrite, 'fail' if you want to error if exists
          index=False,
          method='multi',
          chunksize=1000  # adjust based on your data size
      )
      print("Data successfully written to database")
    except Exception as e:
      print(f"An error occurred: {e}")
    finally:
      engine.dispose()  # Clean up connection
    return moments_segment
  
def mark_old_latest_moment(videoids, activities):
   # Update query to set latest=FALSE
   query = text("UPDATE public.moments SET latest = FALSE WHERE video_api_id IN :videoids AND activity IN :activities AND latest = TRUE")
   print("Executing Query on Database: " + str(query))
   
   engine = create_engine(connection_string)
   
   # Run update query with parameters
   with engine.connect() as connection:
       result = connection.execute(query, {
           "videoids": tuple(videoids), 
           "activities": tuple(activities)
       })
       # Commit the transaction
       connection.commit()
   
   # Close connection
   engine.dispose()
   return result.rowcount  # Returns number of rows updated


def mark_moment_as_irrelevant():
   # Update query to set latest=FALSE
   query = text("UPDATE moments SET relevant = FALSE WHERE (activity_type = 'Not Detected' OR target_person_type = 'Not Involved')")
   print("Executing Query on Database: " + str(query))
   
   engine = create_engine(connection_string)
   
   # Run update query with parameters
   with engine.connect() as connection:
       result = connection.execute(query)
       # Commit the transaction
       connection.commit()
   
   # Close connection
   engine.dispose()
   return result.rowcount  # Returns number of rows updated

In [58]:
def update_meeting_tags(video_id):
    meta_data = get_video_object(video_id)
    tags = meta_data.get('tags')[0]

    query = text("UPDATE public.meetings SET speaker = :tags WHERE video_api_id = :video_id")
    engine = create_engine(connection_string)
   
   # Run update query with parameters
    with engine.connect() as connection:
        result = connection.execute(query, {
            "video_id": video_id, 
            "tags": tags
            })
        # Commit the transaction
        connection.commit()
    
    # Close connection
    engine.dispose()
    return result.rowcount  # Returns number of rows updated

def get_meetings_with_missing_speaker():
    with engine.connect() as connection:
        query = text("SELECT video_api_id FROM public.meetings WHERE speaker IS NULL")
        result = connection.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=result.keys())

    # Close connection
    engine.dispose()
    supabase_current_video_ids = list(df['video_api_id'])
    return supabase_current_video_ids

def get_meeting_baas_data(video_id):
    try: 
        temp = get_video_object(video_id).get('metadata')[1].get('key')
        data = get_video_object(video_id).get('metadata')[1].get('value') 
        is_meeting_baas = (temp == "meeting_baas_raw_data")
        if is_meeting_baas:
            is_meeting_baas = data
        return is_meeting_baas
    except IndexError:
        is_meeting_baas = False
        pprint("No Metadata or not Meeting Baas")
        return is_meeting_baas
    except Exception as e:
        is_meeting_baas = False
        print(f"An error occurred: {e}")
        return is_meeting_baas
    

In [59]:
def extract_vtt_transcript(meeting_data):
    """
    Extracts transcript data from the meeting JSON data structure and formats it as VTT
    with speaker tags.
    
    Args:
        meeting_data (dict): The meeting data dictionary containing transcript information
        
    Returns:
        str: A string containing the transcript in VTT format
    """
    transcripts = meeting_data.get('transcripts', [])
    vtt_content = "WEBVTT\n\n"  # VTT header
    
    segment_id = 0
    for segment in transcripts:
        # Only include segments that have words
        if segment['words']:
            # Format timestamps
            start_time = format_timestamp(segment['start_time'])
            end_time = format_timestamp(segment['end_time'])
            timestamp = f"{start_time} --> {end_time}"
            
            # Format speaker and text
            speaker = f"<v {segment['speaker']}>"
            text = ' '.join(word['text'] for word in segment['words'])
            
            # Combine into VTT format
            vtt_content += f"{segment_id}\n{timestamp}\n{speaker} {text}\n\n"
            segment_id += 1
    
    return vtt_content

def format_timestamp(seconds):
    """
    Convert seconds to VTT timestamp format (HH:MM:SS.mmm)
    """
    if seconds < 0:
        return "00:00.000"
    else:
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds_remaining = seconds % 60
        return f"{hours:02d}:{minutes:02d}:{seconds_remaining:06.3f}"

def process_meeting_to_vtt(raw_data):
    """
    Process the raw meeting data and convert to VTT format.
    
    Args:
        raw_data (str): The raw meeting data string
        
    Returns:
        str: VTT formatted transcript
    """
    # Clean up the string and parse JSON
    cleaned_data = raw_data.strip('()')
    meeting_data = json.loads(cleaned_data)
    
    return extract_vtt_transcript(meeting_data)

In [60]:
def generate_meeting_table_data(supabase_missing_video_ids):
  
  if len(supabase_missing_video_ids) > 0:
    meetings = []
    # Create a list of dictionaries with the required fields
    for video_id in supabase_missing_video_ids:

      meeting_baas_data = get_meeting_baas_data(video_id)

      if(meeting_baas_data == False):
        raw_data = get_video_object(video_id)
        original_vtt_file = get_caption_text(video_id)
        clean_vtt_file = build_clean_vtt_adjust(original_vtt_file, video_id)
        summary = get_meeting_summary(clean_vtt_file, raw_data.get('tags')[0])
        video_data = {
        'video_api_id' : video_id,    
        'name' : raw_data.get('title'),
        'date' : raw_data.get('published_at'),
        'speaker' : raw_data.get('tags')[0],
        'original_vtt_file' : original_vtt_file,
        'clean_vtt_file' : clean_vtt_file,
        'summary' : summary
        }
        meetings.append(video_data)
        break
      
      raw_data = get_video_object(video_id)
      original_vtt_file = process_meeting_to_vtt(meeting_baas_data)
      clean_vtt_file = build_clean_vtt_adjust(original_vtt_file, video_id)
      summary = get_meeting_summary(clean_vtt_file, raw_data.get('tags')[0])
      
      video_data = {
      'video_api_id' : video_id,    
      'name' : raw_data.get('title'),
      'date' : raw_data.get('published_at'),
      'speaker' : raw_data.get('tags')[0],
      'meeting_baas_original_json': meeting_baas_data,
      'original_vtt_file' : original_vtt_file,
      'clean_vtt_file' : clean_vtt_file,
      'summary' : summary
      }
      meetings.append(video_data)
    
    meetings = pd.DataFrame(meetings)
    return meetings

In [19]:
vad_lexicon = load_nrc_vad_lexicon(vad_lexicon_filepath)
video_api_current_video_ids = get_video_idsv2()
supabase_missing_video_ids = get_delta(video_api_current_video_ids)
#pprint(supabase_missing_video_ids)
supabase_missing_video_ids = ['vi3vHBTcK9aO4oZtY4uuNraW']
meetings = generate_meeting_table_data(supabase_missing_video_ids)
#pprint(meetings)
write_meeting_table_data_to_supabase(meetings)
#update_profile_meeting_table() -- Does not properly work while we do not have all users that currently have videos correctly onboarded
segments = update_segments(supabase_missing_video_ids)
write_segements_table_to_supabase(segments)

# Other things for debugging
#print(supabase_missing_video_ids)
#supabase_missing_video_ids = ['vi5lUKF5SojsvO2eH5MdSXVs', 'vi4iA7ftEHA7F7kXxBz5TW1s', 'vi4Yb16priLky4Ze4x3ApG3C', 'vi7GkawMp69I47lCmepGg2Y1', 'vi17iPCCfSCPTTMMwSe8pF4v', 'vi5oPyx5KnBrR7JX4o3IKRxL', 'vi5tQVPfxGp8bEIsPrkYLc2t', 'vimgkWn5yc9eZpob5nXB4k5', 'vi82EUZk4sRGrV4MBYaYhyr', 'vi4Ro4yE0PR9xSZ7JKYTU2r6', 'vi4cHtqlN2qHZYWEcurYLXic', 'vi2r3uY6mjoU6gc6QjaMcZFk', 'vi2jqz22wk9sKKz8udDFa4Mv', 'vi3lhBdLr1b48zymbStBq65m', 'vi5RzuGHq6hVTXUZ6S2gc5FI', 'vi1RUWYlkj8srU7xJWhvIgf4', 'vi3veataLjXlJv4dEEvBGUIS', 'vi5dVoW7a6py6gDqonevdQhG', 'vi6izaRPNkne58IZ7zzjofeH', 'vi4fEA572F2t8znnmHtDUsPu']

'No Metadata or not Meeting Baas'
Data successfully written to database
Processing video ID: vi3vHBTcK9aO4oZtY4uuNraW
VTT VAD dataset successfully build for all videos
Data successfully written to database


In [33]:
def emotion_detection(video_id):
    prompt = emotion_prompt_builder()
    caption_text = get_clean_vtt(video_id)

    payload = {
        "model": "gpt-4o-mini-2024-07-18",  # Corrected model name
        "messages": [
            {
                "role": "user",
                "content": f'{prompt}\n\n{caption_text}'
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "Emotions",
                "description": "Emotions",
                "schema": {
                
                "type": "object",
                "properties": {
                    "emotion_sequences": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "sequence_id": {
                                    "type": "integer",
                                    "description": "A unique identifier for each sequence"
                                },
                                "segment_id_sequence_start": {
                                    "type": "integer",
                                    "description": "The `segment_number` of the first segment within the identified sequence"
                                },
                                "segment_id_sequence_end": {
                                    "type": "integer",
                                    "description": "The `segment_number` of the last segment within the identified sequence"
                                },
                                "speaker_name": {
                                    "type": "string",
                                    "description": "The name of the speaker it's all about"
                                },
                                "emotion": {
                                    "type": "string",
                                    "description": "The emotion that the speaker is demonstrating"
                                },
                                "emotion_intensity": {
                                    "type": "integer",
                                    "description": "The intensity of the emotion that the speaker is demonstrating",
                                    "minimum": 0,
                                    "maximum": 10
                                },
                                "reasoning": {
                                    "type": "string",
                                    "description": "A brief explanation summarizing the reasoning behind your selection"
                                },
                                "context": {
                                    "type": "string",
                                    "description": "A brief explanation summarizing the context of the identified emotion"
                                }
                            },
                            "required": [
                                "sequence_id",
                                "segment_id_sequence_start",
                                "segment_id_sequence_end",
                                "speaker_name",
                                "emotion",
                                "emotion_intensity",
                                "reasoning",
                                "context"
                            ]
                        }
                    }
                },
                "required": [
                    "emotion_sequences"
                ]
            }
            }
        }
    }

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.getenv("OPENAI_API_KEY")}'
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        json=payload,
        headers=headers
    )
    response.raise_for_status()

    chat_completion = response.json()

    # Extract the content from the response
    res = chat_completion['choices'][0]['message']['content']

    json_data = json.loads(res)
    df = pd.DataFrame(json_data['emotion_sequences'])
    return df


In [34]:
temp = emotion_detection("vi1RUWYlkj8srU7xJWhvIgf4")
temp.head()







Unnamed: 0,sequence_id,segment_id_sequence_start,segment_id_sequence_end,speaker_name,emotion,emotion_intensity,reasoning,context
0,1,14,14,AJ Goldstein,Curiosity,5,"AJ asks Darren about how he's handling the election results, indicating he is curious about Darren's perspective.",AJ attempts to engage Darren in a conversation about personal sentiments regarding current events.
1,2,20,20,AJ Goldstein,Nostalgia,7,AJ expresses fond memories and appreciation when talking about his past experiences and relationships with Jeff.,AJ reminisces about his early connections and mentorship received from Jeff while discussing their shared experiences.
2,3,40,40,AJ Goldstein,Conviction,8,AJ strongly believes in the value of his business model and expresses confidence in their target market and success.,AJ discusses the importance of targeting STEM founders and expands on the empirical success they are experiencing.
3,4,59,59,Darren,Apprehension,6,"Darren hesitates to engage fully due to obligations tied to his role as a fund manager, indicating a level of concern about stepping outside of those boundaries.","In response to AJ's request for broader collaboration, Darren expresses reluctance to extend beyond his existing fund's agreements."
4,5,58,58,AJ Goldstein,Optimism,9,"AJ showcases an optimistic outlook on the potential for collaboration and the successful future of his coaching business, reflecting high energy and hope.",AJ reaffirms his excitement about potential partnerships and the positive value he sees in his operations.


In [None]:
video_id = "vi1RUWYlkj8srU7xJWhvIgf4"
temp2 = process_activity_detection(temp)
  moments_emotions = temp
  column_mapping = {
      'context': 'summary',
      'emotion': 'activity_type',
      'reasoning': 'activity_reasoning',
       	
  }
  
  moments_renamed = moments_emotions.rename(columns=column_mapping)
  
  # Create enhancement_id (two approaches you can try):
  # Approach 1 - direct UUID objects
  moments_renamed['video_api_id'] = video_id
  moments_renamed['id'] = [str(uuid.uuid4()) for _ in range(len(moments_renamed))]
  moments_renamed['latest'] = 'TRUE'
  ##moments_columns = ['id','segment_id_sequence_start', 'segment_id_sequence_end', 'summary', 'title', 'segment_start_timestamp', 'segment_end_timestamp', 'segment_start_timestamp_in_seconds', 'segment_end_timestamp_in_seconds', 'video_api_id', 'activity_type', 'activity_reasoning', 'target_person_type', 'target_person_reasoning', 'activity', 'moment_url', 'latest']
  ##moments_renamed = moments_renamed[moments_columns]

Current Columns in Moments

id --> Good (UUID)
segment_id_sequence_start --> Good
segment_id_sequence_end --> Good
summary --> Context
title --> Emotion?
segment_start_timestamp --> Good
segment_end_timestamp  --> Good
segment_start_timestamp_in_seconds --> Good
segment_end_timestamp_in_seconds --> Good
video_api_id --> Good
activity_type --> The of emotion
activity_reasoning --> reasoning
target_person_type --> speaker
target_person_reasoning --> Null
activity --> Emotion
score --> intensity
latest --> DEFAULT TRUE



In [61]:
activity_type_selector = ["Feedback",
                         "Decision Making",
                         "Delegation",
                         "Team Conflict",
                         "Goal Setting"
  #"Test"
                         ]

## Apply to meeting that have been recently added
#ideo_selector = filtered_data
video_selector = [{"videoId":"vi6TBTdOQ3bkW2b7ku6xtvbK", "tags":["Kanishka Rao"]},
{"videoId":"vi3MC0JsaO4n4cA7cYgYzpb4", "tags":["Kanishka Rao"]},
{"videoId":"vi4jzbO8E3eWXzvCGz7XNI0w", "tags":["Kanishka Rao"]},
{"videoId":"vi4flB6clm0iow6IFTbC7emI", "tags":["Kanishka Rao"]},
{"videoId":"vi7YPIONLTIkuIc1odaRFv5", "tags":["Kanishka Rao"]},
{"videoId":"vi18qSrl7wIdeYOKuDg7YQBx", "tags":["Kanishka Rao"]},
{"videoId":"vi6aqbh5ikJLOYkqOdO7TvN1", "tags":["Kanishka Rao"]},
{"videoId":"vi7ITVcbI2cXqYj2CnwKvSXg", "tags":["Kanishka Rao"]},
{"videoId":"vi5lUKF5SojsvO2eH5MdSXVs", "tags":["Kanishka Rao"]},
{"videoId":"vi31gHvd0MMz8FonUstjUyMe", "tags":["Kanishka Rao"]},
{"videoId":"vi1GPAuvhakg74TUpNoMMUvc", "tags":["Kanishka Rao"]}]

api_video = ApiVideoAuth(os.getenv("API_VIDEO_API_KEY"))
api_video.authenticate()

openai = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


final_df = pd.DataFrame()

for video in video_selector:
    print("Starting Loop")
    target_person = video['tags'][0]
    video_id = video['videoId']
    video = api_video.get_video(video_id)
    caption_text = build_clean_vtt(video_id)

    if "Goal Setting" in activity_type_selector:
        print(f"Processing video ID: {video_id}, for Goal Setting targeted towards {target_person}")

        goal_setting_prompt_builder(target_person)

        ## Get activity detection output from video 
        df_activity_detection = activity_detection(video_id)

        ## Add seconds timestamp to the segments
        df_activity_detection = process_activity_detection(df_activity_detection)

        ## Run sublabelling and agent detection 
        df_activity_detection = agent_detection_sublabeling(df_activity_detection)

        ## Format JSON output to columns
        final_df_iteration = finalize_activity_detection(df_activity_detection)

        ## Append results to final output df
        final_df = pd.concat([final_df, final_df_iteration], ignore_index = True)

    if "Team Conflict" in activity_type_selector:
        print(f"Processing video ID: {video_id}, for Team Conflict targeted towards {target_person}")

        team_conflict_prompt_builder(target_person)

        ## Get activity detection output from video 
        df_activity_detection = activity_detection(video_id)

        ## Add seconds timestamp to the segments
        df_activity_detection = process_activity_detection(df_activity_detection)

        ## Run sublabelling and agent detection 
        df_activity_detection = agent_detection_sublabeling(df_activity_detection)

        ## Format JSON output to columns
        final_df_iteration = finalize_activity_detection(df_activity_detection)

        ## Append results to final output df
        final_df = pd.concat([final_df, final_df_iteration], ignore_index = True)

    if "Feedback" in activity_type_selector:
        print(f"Processing video ID: {video_id}, for Feedback targeted towards {target_person}")

        feedback_prompt_builder(target_person)

        ## Get activity detection output from video 
        df_activity_detection = activity_detection(video_id)

        ## Add seconds timestamp to the segments
        df_activity_detection = process_activity_detection(df_activity_detection)

        ## Run sublabelling and agent detection 
        df_activity_detection = agent_detection_sublabeling(df_activity_detection)

        ## Format JSON output to columns
        final_df_iteration = finalize_activity_detection(df_activity_detection)

        ## Append results to final output df
        final_df = pd.concat([final_df, final_df_iteration], ignore_index = True)

    if "Delegation" in activity_type_selector:
        print(f"Processing video ID: {video_id}, for Delegation targeted towards {target_person}")

        delegation_prompt_builder(target_person)

        ## Get activity detection output from video 
        df_activity_detection = activity_detection(video_id)

        ## Add seconds timestamp to the segments
        df_activity_detection = process_activity_detection(df_activity_detection)

        ## Run sublabelling and agent detection 
        df_activity_detection = agent_detection_sublabeling(df_activity_detection)

        ## Format JSON output to columns
        final_df_iteration = finalize_activity_detection(df_activity_detection)

        ## Append results to final output df
        final_df = pd.concat([final_df, final_df_iteration], ignore_index = True)

    if "Decision Making" in activity_type_selector:
        print(f"Processing video ID: {video_id}, for Decision Making targeted towards {target_person}")

        decision_making_prompt_builder(target_person)

        ## Get activity detection output from video 
        df_activity_detection = activity_detection(video_id)

        ## Add seconds timestamp to the segments
        df_activity_detection = process_activity_detection(df_activity_detection)

        ## Run sublabelling and agent detection 
        df_activity_detection = agent_detection_sublabeling(df_activity_detection)

        ## Format JSON output to columns
        final_df_iteration = finalize_activity_detection(df_activity_detection)

        ## Append results to final output df
        final_df = pd.concat([final_df, final_df_iteration], ignore_index = True)

print("Video selection has been process for moments activities selection")
final_df.head(500)


Starting Loop
Processing video ID: vi6TBTdOQ3bkW2b7ku6xtvbK, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6TBTdOQ3bkW2b7ku6xtvbK, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6TBTdOQ3bkW2b7ku6xtvbK, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6TBTdOQ3bkW2b7ku6xtvbK, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6TBTdOQ3bkW2b7ku6xtvbK, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi3MC0JsaO4n4cA7cYgYzpb4, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi3MC0JsaO4n4cA7cYgYzpb4, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi3MC0JsaO4n4cA7cYgYzpb4, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi3MC0JsaO4n4cA7cYgYzpb4, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi3MC0JsaO4n4cA7cYgYzpb4, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi4jzbO8E3eWXzvCGz7XNI0w, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4jzbO8E3eWXzvCGz7XNI0w, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4jzbO8E3eWXzvCGz7XNI0w, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4jzbO8E3eWXzvCGz7XNI0w, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4jzbO8E3eWXzvCGz7XNI0w, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi4flB6clm0iow6IFTbC7emI, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4flB6clm0iow6IFTbC7emI, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4flB6clm0iow6IFTbC7emI, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4flB6clm0iow6IFTbC7emI, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi4flB6clm0iow6IFTbC7emI, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi7YPIONLTIkuIc1odaRFv5, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7YPIONLTIkuIc1odaRFv5, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7YPIONLTIkuIc1odaRFv5, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7YPIONLTIkuIc1odaRFv5, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7YPIONLTIkuIc1odaRFv5, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi18qSrl7wIdeYOKuDg7YQBx, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi18qSrl7wIdeYOKuDg7YQBx, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi18qSrl7wIdeYOKuDg7YQBx, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi18qSrl7wIdeYOKuDg7YQBx, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi18qSrl7wIdeYOKuDg7YQBx, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi6aqbh5ikJLOYkqOdO7TvN1, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6aqbh5ikJLOYkqOdO7TvN1, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6aqbh5ikJLOYkqOdO7TvN1, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6aqbh5ikJLOYkqOdO7TvN1, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi6aqbh5ikJLOYkqOdO7TvN1, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi7ITVcbI2cXqYj2CnwKvSXg, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7ITVcbI2cXqYj2CnwKvSXg, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7ITVcbI2cXqYj2CnwKvSXg, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7ITVcbI2cXqYj2CnwKvSXg, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi7ITVcbI2cXqYj2CnwKvSXg, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi5lUKF5SojsvO2eH5MdSXVs, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi5lUKF5SojsvO2eH5MdSXVs, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi5lUKF5SojsvO2eH5MdSXVs, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi5lUKF5SojsvO2eH5MdSXVs, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi5lUKF5SojsvO2eH5MdSXVs, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi31gHvd0MMz8FonUstjUyMe, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi31gHvd0MMz8FonUstjUyMe, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi31gHvd0MMz8FonUstjUyMe, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi31gHvd0MMz8FonUstjUyMe, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi31gHvd0MMz8FonUstjUyMe, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Starting Loop
Processing video ID: vi1GPAuvhakg74TUpNoMMUvc, for Goal Setting targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi1GPAuvhakg74TUpNoMMUvc, for Team Conflict targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi1GPAuvhakg74TUpNoMMUvc, for Feedback targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi1GPAuvhakg74TUpNoMMUvc, for Delegation targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Processing video ID: vi1GPAuvhakg74TUpNoMMUvc, for Decision Making targeted towards Kanishka Rao


  activity_data = pd.read_json(activity_detection_response)


Video selection has been process for moments activities selection


Unnamed: 0,sequence_id,segment_id_sequence_start,segment_id_sequence_end,summary,title,segment_start_timestamp,segment_end_timestamp,segment_start_timestamp_in_seconds,segment_end_timestamp_in_seconds,video_id,activity_type,activity_reasoning,target_person_type,target_person_reasoning,activity,Moment_url
0,1,38,54,Bill and Kanishka discuss the need for a consistent metrics solution that aligns with both their father's needs and broader organizational goals. They emphasize the importance of defining metrics and establishing a common analytics database to track performance effectively.,Defining Metrics and Analytics,20:26.903,26:54.400,1226,1614,vi6TBTdOQ3bkW2b7ku6xtvbK,Tactical Goals,"Kanishka Rao emphasized the importance of defining time frames and baseline data for measuring impacts, indicating a structured approach to achieving specific metrics related to exacerbations and hospitalizations.",Participating,"Kanishka Rao actively engaged in goal setting by discussing the importance of time frames, defining metrics, and emphasizing the significance of baseline data for measuring impacts.",Goal Setting,https://embed.api.video/vod/vi6TBTdOQ3bkW2b7ku6xtvbK#;t=1226
1,2,65,68,"Kanishka expresses the need for clarity on the population to be used for metrics calculations. Bill agrees to take the lead on proposing a population definition for their metrics, ensuring stakeholder input and agreement.",Population Definition for Metrics,31:14.914,32:15.470,1874,1935,vi6TBTdOQ3bkW2b7ku6xtvbK,Tactical Goals,"Kanishka Rao is emphasizing the need for someone to take responsibility for determining the population to be used, which indicates a focused initiative that aligns with achieving a broader strategic aim.",Participating,"Kanishka Rao is actively engaged in goal setting by suggesting that someone needs to take responsibility for determining the population to be used, indicating involvement in the decision-making process.",Goal Setting,https://embed.api.video/vod/vi6TBTdOQ3bkW2b7ku6xtvbK#;t=1874
2,3,78,84,Kanishka and Bill discuss the importance of using a common definition for pilot and control populations across different teams. They agree on the need for alignment in metrics definitions to ensure consistency in their evaluations.,Aligning Metrics Definitions,35:02.570,37:31.346,2102,2251,vi6TBTdOQ3bkW2b7ku6xtvbK,Tactical Goals,"Kanishka Rao emphasized the importance of using a common definition for pilot and control populations across different activities, indicating a collaborative approach to establishing specific goals that align with the overall objectives of the team.",Participating,"Kanishka Rao actively engaged in goal setting by discussing the importance of using a common definition for pilot and control population, indicating a collaborative approach to establishing goals across different activities.",Goal Setting,https://embed.api.video/vod/vi6TBTdOQ3bkW2b7ku6xtvbK#;t=2102
3,4,112,118,"Kanishka outlines the different levels of tasks he is managing, including executive responsibilities and individual contributions. Bill emphasizes the need for transparency and accountability in these roles to ensure effective management.",Managing Executive Responsibilities,47:26.229,49:13.090,2846,2953,vi6TBTdOQ3bkW2b7ku6xtvbK,Not Detected,"The discussion revolves around the potential impact of an intern and transparency issues, but does not involve any specific goal setting or planning activities.",Not Participating,Kanishka Rao is discussing the potential impact of an intern but is not actively contributing to defining or planning specific goals.,Goal Setting,https://embed.api.video/vod/vi6TBTdOQ3bkW2b7ku6xtvbK#;t=2846
4,5,181,186,"Kanishka identifies the need for a chief of staff to help manage his workload and responsibilities. Bill supports this idea, suggesting that hiring someone could help alleviate pressure and allow for better focus on executive tasks.",Need for Chief of Staff,01:16:05.344,01:18:55.814,4565,4735,vi6TBTdOQ3bkW2b7ku6xtvbK,Not Detected,"The conversation primarily revolves around Kanishka Rao's current responsibilities and challenges, as well as discussions about staffing and support, without any specific goal setting or defining of objectives.",Not Participating,Kanishka Rao is discussing his current responsibilities and challenges but is not actively contributing to the goal setting process or defining specific goals.,Goal Setting,https://embed.api.video/vod/vi6TBTdOQ3bkW2b7ku6xtvbK#;t=4565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,1,11,11,"Kanishka outlines the priorities for the month, emphasizing the need to focus on selling to health systems and preparing for upcoming conferences, which initiates a discussion on resource allocation and urgency.",Monthly Priorities Discussion,01:00.750,05:47.845,60,347,vi1GPAuvhakg74TUpNoMMUvc,Strategic Decision,"Kanishka Rao is outlining the priorities for the month, emphasizing the need to focus on selling to health systems and developing strategies for the SUa offering, which aligns with overarching organizational goals.",Participating,"Kanishka Rao is actively discussing priorities, outlining strategies, and soliciting feedback from others, indicating engagement in the decision-making process.",Decision Making,https://embed.api.video/vod/vi1GPAuvhakg74TUpNoMMUvc#;t=60
237,2,12,12,"Sriram questions the prioritization of GDMT improvement over SBIR, highlighting the time-sensitive nature of the tasks and prompting a reevaluation of priorities.",Prioritization of Tasks,05:47.949,07:24.213,347,444,vi1GPAuvhakg74TUpNoMMUvc,Tactical Decision,"Sriram Krishnan is questioning the prioritization of tasks based on their deadlines and impact, specifically weighing the importance of GDMT improvement against SBIR, which indicates a decision-making process about resource allocation and task prioritization within the team's objectives.",Not Participating,Kanishka Rao did not contribute to the discussion or decision-making process during this interaction.,Decision Making,https://embed.api.video/vod/vi1GPAuvhakg74TUpNoMMUvc#;t=347
238,3,36,36,"Bill expresses concerns about the urgency of GDMT improvement versus securing the AZ contract, leading to a consensus on prioritizing immediate tasks to meet deadlines.",Urgency of Contractual Obligations,20:04.073,21:53.173,1204,1313,vi1GPAuvhakg74TUpNoMMUvc,Strategic Decision,"Bill Landi is discussing the prioritization of obtaining the AZ contract and the associated risks, indicating a high-level decision that impacts the organization's strategic direction and resource allocation.",Not Participating,Kanishka Rao is not mentioned in the discussion and does not contribute to the decision-making process.,Decision Making,https://embed.api.video/vod/vi1GPAuvhakg74TUpNoMMUvc#;t=1204
239,4,44,44,"Kanishka emphasizes the importance of securing budget allocation from AZ, which is critical for future contracts, leading to a discussion on the implications of prioritizing this task.",Budget Allocation Importance,24:19.113,24:41.183,1459,1481,vi1GPAuvhakg74TUpNoMMUvc,Strategic Decision,"Kanishka Rao is highlighting the importance of securing the budget to enable signing a large contract, which indicates a focus on long-term goals and resource allocation for future fundraising efforts.",Participating,"Kanishka Rao is discussing the implications of the budget on future contracts and fundraising, actively contributing to the decision-making process.",Decision Making,https://embed.api.video/vod/vi1GPAuvhakg74TUpNoMMUvc#;t=1459


In [62]:
video_selector_only_ids = [item['videoId'] for item in video_selector] 

rows_updated = mark_old_latest_moment(video_selector_only_ids, activity_type_selector)
print(f"Found {rows_updated} old moments and set them to latest = False")
final_df = add_new_moments(final_df)
test  = (len(final_df))
print(f"{test} moments data has been added to the database")
print("Moments data has been added to the database")
debug = expand_moments_to_segments(final_df)
print("Moments Segement data has been added to the database")
debug.head()



Executing Query on Database: UPDATE public.moments SET latest = FALSE WHERE video_api_id IN :videoids AND activity IN :activities AND latest = TRUE
Found 129 old moments and set them to latest = False
Data successfully written to database
241 moments data has been added to the database
Moments data has been added to the database
Data successfully written to database
Moments Segement data has been added to the database


Unnamed: 0,id,moments_id,segment_id,video_api_id
0,3b4a1cd0-a796-461e-8b0b-9b40aec20d8e,cf53d0ea-0ac7-4f90-93fa-657983742243,vi6TBTdOQ3bkW2b7ku6xtvbK38,vi6TBTdOQ3bkW2b7ku6xtvbK
1,9c326cca-4550-4266-8c9b-09fb47ad5231,cf53d0ea-0ac7-4f90-93fa-657983742243,vi6TBTdOQ3bkW2b7ku6xtvbK39,vi6TBTdOQ3bkW2b7ku6xtvbK
2,b888ad6e-5a17-467f-9ef6-063a36e888b1,cf53d0ea-0ac7-4f90-93fa-657983742243,vi6TBTdOQ3bkW2b7ku6xtvbK40,vi6TBTdOQ3bkW2b7ku6xtvbK
3,e4a62708-4c16-4777-9f94-7b6f46aaf8b2,cf53d0ea-0ac7-4f90-93fa-657983742243,vi6TBTdOQ3bkW2b7ku6xtvbK41,vi6TBTdOQ3bkW2b7ku6xtvbK
4,eefa81a2-5c02-46e8-b6ad-ec0b5c0b7520,cf53d0ea-0ac7-4f90-93fa-657983742243,vi6TBTdOQ3bkW2b7ku6xtvbK42,vi6TBTdOQ3bkW2b7ku6xtvbK


: 