In [1]:
!pip install openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m40.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch-

In [2]:
# !pip install transformers
# !pip install sentencepiece
!pip install pyannote-audio
!pip install pydub

Collecting pyannote-audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote-audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote-audio)
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting omegaconf<3.0,>=2.1 (from pyannote-audio)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyannote.core>=5.0.0 (from pyannote-audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote-audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote-audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote-audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os
import torch
import whisper
import numpy as np
import pandas as pd
import json
import librosa
from pyannote.audio import Pipeline
from pyannote.core import Segment
import datetime
import gc
from typing import Dict, List, Tuple, Optional

class AudioAnnotator:
    """Transcribe an audio file with Whisper and tag each segment with a speaker role.

    The pipeline is: Whisper transcription -> pyannote speaker diarization (optional)
    -> role assignment by talk time (longest talker = "AGENT", second = "CLIENT")
    -> per-segment alignment -> annotation/transcript files on disk.

    Attributes set by ``__init__``:
        device: "cuda" or "cpu" (or whatever the caller passed).
        whisper_model: the loaded Whisper model.
        diarization_pipeline: the pyannote pipeline, or None when unavailable.
        diarization_available: True only when the pyannote pipeline loaded.
    """

    def __init__(
        self,
        whisper_model_size: str = "medium",
        huggingface_token: str = "hf_***********************************",
        device: str = None
    ):
        """Load the Whisper model and, when a token is given, the diarization pipeline.

        Args:
            whisper_model_size: Whisper checkpoint name passed to ``whisper.load_model``.
            huggingface_token: Hugging Face access token used to download the pyannote
                pipeline. A falsy value disables diarization.
                NOTE(review): the default is a redacted placeholder; pass a real token
                from the environment rather than editing it into source.
            device: Torch device string; defaults to "cuda" when available, else "cpu".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        print(f"Loading Whisper model ({whisper_model_size})...")
        self.whisper_model = whisper.load_model(whisper_model_size, device=self.device)
        print("Whisper model loaded successfully")

        # Always define the attribute so a failed load leaves the object consistent.
        self.diarization_pipeline = None
        self.diarization_available = False

        if huggingface_token:
            try:
                print("Loading pyannote.audio speaker diarization pipeline...")
                self.diarization_pipeline = Pipeline.from_pretrained(
                    "pyannote/speaker-diarization-3.1",
                    use_auth_token=huggingface_token
                )
                self.diarization_pipeline.to(torch.device(self.device))
                self.diarization_available = True
                print("Speaker diarization pipeline loaded successfully")
            except Exception as e:
                # Best effort: transcription still works without diarization.
                print(f"Error loading speaker diarization: {e}")
                self.diarization_available = False
        else:
            print("No HuggingFace token provided. Speaker diarization will not be available.")

    def transcribe_audio(self, audio_path: str, language: str = None) -> Dict:
        """Run Whisper on ``audio_path`` and return its result dict.

        Args:
            audio_path: Path of the audio file to transcribe.
            language: Language hint forwarded to Whisper; None lets Whisper detect it.

        Returns:
            The dict returned by ``whisper_model.transcribe`` (read here via its
            "segments" key), produced with word-level timestamps enabled.
        """
        print(f"Transcribing {audio_path}...")
        transcription = self.whisper_model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True,
            verbose=False
        )
        print(f"Transcription complete: {len(transcription['segments'])} segments found")
        return transcription

    def perform_diarization(self, audio_path: str) -> Optional[Dict]:
        """Run speaker diarization and flatten the result into plain dicts.

        Returns:
            None when diarization is unavailable; otherwise
            ``{"speakers": [...], "segments": [{"speaker", "start", "end"}, ...]}``.
            ``segments`` may be empty when no speech turn is found.
        """
        if not self.diarization_available:
            print("Speaker diarization is not available")
            return None

        print(f"Performing speaker diarization on {audio_path}...")
        diarization = self.diarization_pipeline(audio_path)
        speaker_segments = [
            {"speaker": speaker, "start": turn.start, "end": turn.end}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]
        # Sorted so the speaker list is reproducible between runs (set order is not).
        speakers = sorted({seg["speaker"] for seg in speaker_segments})
        print(f"Diarization complete: found {len(speakers)} speakers")
        return {
            "speakers": speakers,
            "segments": speaker_segments
        }

    def clear_gpu_cache(self):
        """Release Python garbage and then the CUDA cache held by PyTorch."""
        # Collect first so tensors that just became unreachable are actually freed
        # before the caching allocator is asked to return memory.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def classify_speakers(self, diarization_result: Dict) -> Dict:
        """Map diarization speaker ids to roles by total speaking time.

        The speaker with the most talk time becomes "AGENT" and the runner-up
        "CLIENT"; any further speakers are left unmapped.

        Returns:
            ``{speaker_id: role}``; empty when there is no diarization result or it
            contains no segments (e.g. silent audio).
        """
        if not diarization_result or not diarization_result.get("segments"):
            # A result with zero segments is still a truthy dict; without this guard
            # the single-speaker branch below would index an empty list.
            return {}

        speaker_durations = {}
        for seg in diarization_result["segments"]:
            duration = seg["end"] - seg["start"]
            speaker_durations[seg["speaker"]] = speaker_durations.get(seg["speaker"], 0) + duration

        sorted_speakers = sorted(speaker_durations.items(), key=lambda item: item[1], reverse=True)
        if len(sorted_speakers) < 2:
            print("Only one speaker detected")
            return {sorted_speakers[0][0]: "AGENT"}

        speaker_mapping = {
            sorted_speakers[0][0]: "AGENT",
            sorted_speakers[1][0]: "CLIENT"
        }
        print(f"Speaker classification: {speaker_mapping}")
        return speaker_mapping

    def align_transcription_with_speakers(self, transcription: Dict, diarization_result: Optional[Dict], speaker_mapping: Optional[Dict]) -> List[Dict]:
        """Assign each transcription segment the speaker that overlaps it the longest.

        Args:
            transcription: Whisper result; its "segments" list is read.
            diarization_result: Output of ``perform_diarization`` or None.
            speaker_mapping: ``{speaker_id: role}`` from ``classify_speakers``; None is
                treated as an empty mapping.

        Returns:
            One dict per transcription segment with keys "start", "end", "text",
            "speaker", "speaker_type" and "words". Segments with no overlapping
            speaker turn (or no diarization at all) get "UNKNOWN" for both labels.
        """
        speaker_mapping = speaker_mapping or {}
        # With no diarization there is nothing to overlap with, so every segment
        # naturally falls into the UNKNOWN branch below.
        speaker_turns = diarization_result["segments"] if diarization_result else []

        aligned_segments = []
        for segment in transcription["segments"]:
            segment_start = segment["start"]
            segment_end = segment["end"]

            overlap_by_speaker = {}
            for turn in speaker_turns:
                overlap = min(segment_end, turn["end"]) - max(segment_start, turn["start"])
                if overlap > 0:
                    overlap_by_speaker[turn["speaker"]] = overlap_by_speaker.get(turn["speaker"], 0) + overlap

            if overlap_by_speaker:
                most_likely_speaker = max(overlap_by_speaker, key=overlap_by_speaker.get)
                speaker_type = speaker_mapping.get(most_likely_speaker, "UNKNOWN")
            else:
                most_likely_speaker = "UNKNOWN"
                speaker_type = "UNKNOWN"

            aligned_segments.append({
                "start": segment_start,
                "end": segment_end,
                "text": segment["text"],
                "speaker": most_likely_speaker,
                "speaker_type": speaker_type,
                "words": segment.get("words", [])
            })

        return aligned_segments

    def create_annotations(self, aligned_segments: List[Dict]) -> Dict:
        """Build a character-offset annotation document from aligned segments.

        Returns:
            ``{"text", "labels", "segments"}`` where "text" is the segment texts
            joined by single spaces and each label's "start"/"end" are character
            offsets into that text, so ``text[start:end] == label["text"]``.
        """
        full_text = " ".join(s["text"] for s in aligned_segments)
        annotations = []
        position = 0

        for segment in aligned_segments:
            text = segment["text"]

            annotations.append({
                "start": position,
                "end": position + len(text),
                "text": text,
                "category": "SPEAKER_TYPE",
                "value": segment["speaker_type"],
                "metadata": {
                    "start_time": segment["start"],
                    "end_time": segment["end"],
                    "speaker_id": segment["speaker"]
                }
            })
            # +1 accounts for the single-space separator used to build full_text.
            position += len(text) + 1

        return {
            "text": full_text,
            "labels": annotations,
            "segments": aligned_segments
        }

    def create_simplified_transcript(self, annotation: Dict, output_path: str):
        """Write a ``role: text`` transcript (roles lower-cased) to ``output_path``."""
        # Explicit UTF-8: transcripts may contain non-ASCII text and the platform
        # default encoding is not guaranteed to be able to encode it.
        with open(output_path, "w", encoding="utf-8") as f:
            for segment in annotation["segments"]:
                speaker_type = segment["speaker_type"].lower()
                label = speaker_type if speaker_type in ("agent", "client") else "unknown"
                f.write(f"{label}: {segment['text']}\n\n")

    def process_audio(self, audio_path: str, output_dir: str = None, language: str = None):
        """Run the full pipeline on one file and write all artefacts to disk.

        Writes ``<base>_annotation.json``, ``<base>_transcription.json``,
        ``<base>_detailed_transcript.txt`` and ``<base>_simplified_transcript.txt``
        into ``output_dir`` (default: the directory containing ``audio_path``).

        Returns:
            Path of the written annotation JSON file.
        """
        output_dir = output_dir or os.path.dirname(os.path.abspath(audio_path))
        os.makedirs(output_dir, exist_ok=True)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]

        transcription = self.transcribe_audio(audio_path, language)
        diarization_result = self.perform_diarization(audio_path)
        speaker_mapping = self.classify_speakers(diarization_result) if diarization_result else {}

        aligned_segments = self.align_transcription_with_speakers(transcription, diarization_result, speaker_mapping)
        annotation = self.create_annotations(aligned_segments)

        annotation_path = os.path.join(output_dir, f"{base_name}_annotation.json")
        with open(annotation_path, "w", encoding="utf-8") as f:
            json.dump(annotation, f, indent=2)

        with open(os.path.join(output_dir, f"{base_name}_transcription.json"), "w", encoding="utf-8") as f:
            json.dump(transcription, f, indent=2)

        detailed_path = os.path.join(output_dir, f"{base_name}_detailed_transcript.txt")
        self.create_detailed_transcript(annotation, detailed_path)

        simple_path = os.path.join(output_dir, f"{base_name}_simplified_transcript.txt")
        self.create_simplified_transcript(annotation, simple_path)
        print(f"Simplified transcript saved to: {simple_path}")

        print("Processing complete!")
        return annotation_path

    def create_detailed_transcript(self, annotation, output_path):
        """Write a timestamped ``[start - end] ROLE: text`` transcript to ``output_path``.

        Timestamps are truncated to whole seconds and rendered as H:MM:SS.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("TRANSCRIPT\n")
            f.write("==========\n\n")
            for segment in annotation["segments"]:
                start = str(datetime.timedelta(seconds=int(segment["start"])))
                end = str(datetime.timedelta(seconds=int(segment["end"])))
                speaker = segment["speaker_type"]
                text = segment["text"]
                f.write(f"[{start} - {end}] {speaker}: {text}\n\n")

    def get_full_transcript_text(self, annotation: Dict) -> str:
        """Return all segment texts of ``annotation`` joined by single spaces."""
        return " ".join(segment["text"] for segment in annotation["segments"])


In [4]:
import warnings
# Silences every Python warning for the rest of the kernel session (no category or
# module filter is given). NOTE(review): this also hides deprecation warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [5]:
!apt-get install nodejs npm
!npm install express cors axios ngrok @huggingface/transformers

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  gyp javascript-common libc-ares2 libjs-events libjs-highlight.js
  libjs-inherits libjs-is-typedarray libjs-psl libjs-source-map
  libjs-sprintf-js libjs-typedarray-to-buffer libnode-dev libnode72
  libnotify-bin libnotify4 libuv1-dev node-abab node-abbrev node-agent-base
  node-ansi-regex node-ansi-styles node-ansistyles node-aproba node-archy
  node-are-we-there-yet node-argparse node-arrify node-asap node-asynckit
  node-balanced-match node-brace-expansion node-builtins node-cacache
  node-chalk node-chownr node-clean-yaml-object node-cli-table node-clone
  node-color-convert node-color-name node-colors node-columnify
  node-combined-stream node-commander node-console-control-strings
  node-copy-concurrently node-core-util-is node-coveralls node-cssom
  node-cssstyle node-debug node-decompress-response node-defaults
  node-delayed-st

In [6]:
!pip install flask_cors
!pip install flask flask-cors pyngrok transformers --quiet

Collecting flask_cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Installing collected packages: flask_cors
Successfully installed flask_cors-5.0.1


In [None]:
# Stores the ngrok auth token in the local ngrok configuration file.
# NOTE(review): `AuthToken` looks like a placeholder — supply the real token at run
# time and do not commit it with the notebook.
!ngrok config add-authtoken AuthToken

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from flask import Flask, request, jsonify
from transformers import pipeline
from pyngrok import ngrok
import os
import json

app = Flask(__name__)

# Load once at server start so every request reuses the same models.
print("Loading models...")
annotator = AudioAnnotator(
    whisper_model_size="large",
    # Read the Hugging Face token from the environment instead of embedding it in the
    # notebook. When HF_TOKEN is unset this is None and AudioAnnotator runs without
    # speaker diarization.
    huggingface_token=os.environ.get("HF_TOKEN"),
)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment"
)
print("Models loaded successfully.")

@app.route('/analyze', methods=['POST'])
def analyze():
    """Transcribe an uploaded audio file and rate its sentiment.

    Expects a multipart POST with the file in the ``audio`` field.

    Returns:
        200 with ``{"rating", "labels", "dialogue"}`` where ``rating`` is the mean of
        the per-segment star values (rounded to 2 decimals, or null when none could
        be parsed), ``labels`` are the annotation labels and ``dialogue`` is one
        ``"ROLE: text"`` string per segment;
        400 when no usable file was uploaded; 500 with the error text otherwise.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files['audio']
        # The filename is chosen by the client: keep only its last path component so
        # a name such as "../../etc/x" cannot be written outside /content.
        safe_name = os.path.basename((audio_file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return jsonify({"error": "Invalid audio filename"}), 400
        audio_path = os.path.join('/content', safe_name)
        audio_file.save(audio_path)

        star_ratings = []
        dialogue_by_speaker = []
        try:
            # Transcribe & diarize
            annotation_path = annotator.process_audio(audio_path)

            with open(annotation_path, "r") as f:
                annotation = json.load(f)

            for label in annotation["labels"]:
                text = label["text"].strip()
                speaker_type = label["value"]
                print(f"Analyzing text: {text} from {speaker_type}")

                sentiment = sentiment_pipeline(text)[0]
                print(f"Sentiment result: {sentiment}")

                # Every segment belongs in the dialogue, even if its rating label
                # cannot be parsed below.
                dialogue_by_speaker.append(f"{speaker_type}: {text}")

                try:
                    # Labels are expected to start with the star count, e.g. "4 stars".
                    star_ratings.append(int(sentiment["label"].split()[0]))
                except (ValueError, IndexError):
                    continue
        finally:
            # Free GPU memory whether or not the models raised.
            annotator.clear_gpu_cache()

        # Calculate average rating
        avg_rating = round(sum(star_ratings) / len(star_ratings), 2) if star_ratings else None

        return jsonify({
            "rating": avg_rating,
            "labels": annotation["labels"],  # full labels as received
            "dialogue": dialogue_by_speaker
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

# One constant so the tunnel and the Flask server cannot drift onto different ports.
SERVER_PORT = 5000

public_url = ngrok.connect(SERVER_PORT).public_url
print(f"Flask server is live at: {public_url}")

try:
    # Blocks until the cell is interrupted.
    app.run(port=SERVER_PORT)
finally:
    # Close the tunnel on exit; otherwise each re-run of this cell leaves another
    # ngrok tunnel open.
    ngrok.disconnect(public_url)

In [None]:
from flask import Flask, request, jsonify
from transformers import pipeline
from pyngrok import ngrok
import os
import json

app = Flask(__name__)

# Load models once at server start so every request reuses them.
print("Loading models...")

# AudioAnnotator must already be defined by an earlier cell.
annotator = AudioAnnotator(
    whisper_model_size="medium",
    # Read the Hugging Face token from the environment instead of embedding it in the
    # notebook. When HF_TOKEN is unset this is None and AudioAnnotator runs without
    # speaker diarization.
    huggingface_token=os.environ.get("HF_TOKEN"),
)

# Load sentiment model (tabularisai)
sentiment_pipeline = pipeline(
    "text-classification",
    model="tabularisai/multilingual-sentiment-analysis"
)
print("Models loaded successfully.")

# Map the lower-cased sentiment label to a 0–5 star rating; labels missing from this
# table are ignored when the average is computed.
# NOTE(review): the scale is deliberately or accidentally lenient (neutral -> 4), and
# "slightly negative" may not be a label this model emits — confirm against the model card.
label_to_rating = {
    "very negative": 0,
    "negative": 1,
    "slightly negative": 2,
    "neutral": 4,
    "positive": 4.5,
    "very positive": 5
}

@app.route('/analyze', methods=['POST'])
def analyze():
    """Transcribe an uploaded audio file and rate its sentiment.

    Expects a multipart POST with the file in the ``audio`` field.

    Returns:
        200 with ``{"rating", "labels", "dialogue"}`` where ``rating`` is the mean of
        the per-segment values looked up in ``label_to_rating`` (rounded to 2
        decimals, or null when no label matched), ``labels`` are the annotation
        labels and ``dialogue`` is one ``"ROLE: text"`` string per segment;
        400 when no usable file was uploaded; 500 with the error text otherwise.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files['audio']
        # The filename is chosen by the client: keep only its last path component so
        # a name such as "../../etc/x" cannot be written outside /content.
        safe_name = os.path.basename((audio_file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return jsonify({"error": "Invalid audio filename"}), 400
        audio_path = os.path.join('/content', safe_name)
        audio_file.save(audio_path)

        star_ratings = []
        dialogue_by_speaker = []
        try:
            # Transcribe and diarize
            annotation_path = annotator.process_audio(audio_path)

            with open(annotation_path, "r") as f:
                annotation = json.load(f)

            for label in annotation["labels"]:
                text = label["text"].strip()
                speaker_type = label["value"]
                print(f"Analyzing text: {text} from {speaker_type}")

                sentiment = sentiment_pipeline(text)[0]
                sentiment_label = sentiment["label"].lower()
                sentiment_score = sentiment["score"]

                print(f"Sentiment: {sentiment_label} (score: {sentiment_score:.2f})")

                # Labels that are not in the table simply do not count towards the mean.
                stars = label_to_rating.get(sentiment_label)
                if stars is not None:
                    star_ratings.append(stars)

                dialogue_by_speaker.append(f"{speaker_type}: {text}")
        finally:
            # Free GPU memory whether or not the models raised.
            annotator.clear_gpu_cache()

        avg_rating = round(sum(star_ratings) / len(star_ratings), 2) if star_ratings else None

        return jsonify({
            "rating": avg_rating,
            "labels": annotation["labels"],
            "dialogue": dialogue_by_speaker
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Start server
# One constant so the tunnel and the Flask server cannot drift onto different ports.
SERVER_PORT = 5000

public_url = ngrok.connect(SERVER_PORT).public_url
print(f"Flask server is live at: {public_url}")

try:
    # Blocks until the cell is interrupted.
    app.run(port=SERVER_PORT)
finally:
    # Close the tunnel on exit; otherwise each re-run of this cell leaves another
    # ngrok tunnel open.
    ngrok.disconnect(public_url)

Loading models...
Using device: cuda
Loading Whisper model (medium)...


100%|█████████████████████████████████████| 1.42G/1.42G [01:11<00:00, 21.5MiB/s]


Whisper model loaded successfully
Loading pyannote.audio speaker diarization pipeline...


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

Speaker diarization pipeline loaded successfully


config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


Models loaded successfully.
Flask server is live at: https://eb01-34-87-123-103.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [15/Apr/2025 10:50:05] "[33mGET / HTTP/1.1[0m" 404 -


Transcribing /content/audio-1744714286336.mp3...
Detected language: English


100%|██████████| 11544/11544 [00:23<00:00, 498.53frames/s]


Transcription complete: 15 segments found
Performing speaker diarization on /content/audio-1744714286336.mp3...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Diarization complete: found 2 speakers
Speaker classification: {'SPEAKER_00': 'AGENT', 'SPEAKER_01': 'CLIENT'}
Simplified transcript saved to: /content/audio-1744714286336_simplified_transcript.txt
Processing complete!
Analyzing text: Thank you for calling Martha's Flowers Towne SST. Hello I'd like to order flowers from AGENT
Sentiment: very positive (score: 0.50)
Analyzing text: and I think you have what I'm looking for. I'd be happy to take care of your from CLIENT
Sentiment: neutral (score: 0.53)
Analyzing text: order may have your name please. Randall Thomas. Randall Thomas can you spell that from AGENT
Sentiment: neutral (score: 0.44)
Analyzing text: for me? Randall R-A-N-B-A-L-L Thomas T-H-O-M-A-S. Thank you for that information Randall from CLIENT
Sentiment: very positive (score: 0.38)
Analyzing text: may have your home or office number area code first. Area code 409 then 866-5088. from CLIENT
Sentiment: neutral (score: 0.66)
Analyzing text: That's 409-866-5088. Do you have a fa

INFO:werkzeug:127.0.0.1 - - [15/Apr/2025 10:52:16] "POST /analyze HTTP/1.1" 200 -


In [None]:
# from transformers import pipeline
# import json

# if __name__ == "__main__":
#     HUGGINGFACE_TOKEN = "hf_***********************************"

#     annotator = AudioAnnotator(
#         whisper_model_size="base",
#         huggingface_token=HUGGINGFACE_TOKEN
#     )

#     audio_file = "/content/WhatsApp Audio 2025-04-12 at 11.12.22_5237f6ee.mp3"
#     annotation_path = annotator.process_audio(
#         audio_file,
#         output_dir="./annotation_output",
#         language=None
#     )

#     with open(annotation_path, "r") as f:
#         annotation = json.load(f)

#     # Load sentiment pipeline
#     sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

#     # Sentiment per segment (Agent only)
#     print("\nAgent-only Segment-wise Sentiment Analysis:\n")
#     for segment in annotation["segments"]:
#         speaker = segment["speaker_type"]
#         text = segment["text"]

#         # Skip non-agent speakers
#         if speaker.lower() != "agent":
#             continue
#         # translator = pipeline("translation", model="Helsinki-NLP/opus-mt-hi-en")
#         # text = translator(text)[0]["translation_text"]
#         sentiment = sentiment_pipeline(text)
#         print(f"{speaker}: {text}")
#         print(f"→ Sentiment: {sentiment[0]['label']} (Score: {sentiment[0]['score']:.2f})\n")


In [None]:
import torch

# Return cached CUDA memory to the driver. Guarded because ipc_collect() initialises
# CUDA and therefore raises on a runtime without a GPU.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


In [None]:
# Show the GPU model, driver/CUDA version and current GPU memory usage of this runtime.
!nvidia-smi


Tue Apr 15 09:31:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P0             27W /   70W |    6324MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from transformers import pipeline
import json

# Load sentiment analysis model (its labels look like "4 stars").
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
# sentiment_pipeline = pipeline(
#     "text-classification",
#     model="cardiffnlp/twitter-roberta-base-sentiment",
#     tokenizer="cardiffnlp/twitter-roberta-base-sentiment"
#     )
# Load the annotation JSON.
# NOTE(review): `annotation_path` is not defined in this cell; it must already exist
# in the kernel (it is the value returned by AudioAnnotator.process_audio).
with open(annotation_path, "r", encoding="utf-8") as f:
    annotation = json.load(f)

# Collect all stars
star_ratings = []

for segment in annotation["segments"]:
    text = segment["text"]
    sentiment = sentiment_pipeline(text)[0]
    label = sentiment["label"]  # e.g., '4 stars'

    # Extract the leading number; skip a segment whose label does not start with one
    # instead of aborting the whole cell.
    try:
        stars = int(label.split()[0])
    except (ValueError, IndexError):
        continue
    star_ratings.append(stars)

# Calculate average
if star_ratings:
    avg_rating = sum(star_ratings) / len(star_ratings)
    print(f"\nAverage Star Rating: {avg_rating:.2f} ⭐")
else:
    print("\nNo ratings to calculate average.")



Device set to use cuda:0



Average Star Rating: 3.60 ⭐


In [None]:
!apt-get install nodejs npm
!mkdir llama-api && cd llama-api && npm init -y
!npm install express cors axios ngrok @huggingface/transformers

In [None]:
from transformers import pipeline
import json

# Load sentiment analysis model (its labels look like "4 stars").
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Load the annotation JSON.
# NOTE(review): `annotation_path` is not defined in this cell; it must already exist
# in the kernel (it is the value returned by AudioAnnotator.process_audio).
with open(annotation_path, "r", encoding="utf-8") as f:
    annotation = json.load(f)

# Collect ratings: one list over every segment, one over agent segments only.
all_star_ratings = []
agent_star_ratings = []

for segment in annotation["segments"]:
    text = segment["text"]
    speaker = segment["speaker_type"]

    sentiment = sentiment_pipeline(text)[0]
    label = sentiment["label"]  # e.g., '4 stars'

    # Extract the leading number; skip a segment whose label does not start with one
    # instead of aborting the whole cell.
    try:
        stars = int(label.split()[0])
    except (ValueError, IndexError):
        continue
    all_star_ratings.append(stars)

    if speaker.lower() == "agent":
        agent_star_ratings.append(stars)

# Calculate and print averages
if all_star_ratings:
    avg_all = sum(all_star_ratings) / len(all_star_ratings)
    print(f"\n⭐ Overall Average Star Rating: {avg_all:.2f}")
else:
    print("\nNo ratings to calculate average.")

if agent_star_ratings:
    avg_agent = sum(agent_star_ratings) / len(agent_star_ratings)
    print(f"🧑‍💼 Agent Average Star Rating: {avg_agent:.2f}")
else:
    print("\nNo agent segments found for rating.")


Device set to use cuda:0



⭐ Overall Average Star Rating: 3.60
🧑‍💼 Agent Average Star Rating: 3.25


In [None]:
from transformers import pipeline
import json

# Only define the placeholder when no AudioAnnotator exists yet. Defining it
# unconditionally would silently replace the real implementation from the earlier
# cell, after which process_audio() returns a path to a file that was never written.
if "AudioAnnotator" not in globals():
    class AudioAnnotator:
        """Placeholder with the same call signature as the real annotator.

        It performs no transcription; it exists so this cell can be read on its own.
        """

        def __init__(self, whisper_model_size="large", huggingface_token=None):
            # Placeholder: nothing is loaded.
            pass

        def process_audio(self, audio_path, output_dir, language=None):
            """Return the path where an annotation JSON would be located.

            No file is created; ``audio_path`` and ``language`` are ignored.
            """
            return f"{output_dir}/annotation.json"


# Script-style driver: annotate one audio file, then translate and sentiment-score the
# agent's segments and print a per-class count.
if __name__ == "__main__":
    # NOTE(review): credential placeholder embedded in source — prefer reading it from the environment.
    HUGGINGFACE_TOKEN = "hf_***********************************"  # Replace with your token

    # Initialize the annotator
    annotator = AudioAnnotator(
        whisper_model_size="large",
        huggingface_token=HUGGINGFACE_TOKEN
    )

    # Hard-coded input file; process_audio returns the path of the annotation JSON.
    audio_file = "/content/WhatsApp Audio 2025-04-12 at 11.12.22_5237f6ee.mp3"
    annotation_path = annotator.process_audio(
        audio_file,
        output_dir="./annotation_output",
        language=None
    )

    # Load the annotation result
    with open(annotation_path, "r") as f:
        annotation = json.load(f)

    # Initialize pipelines: a Hindi->English translator and a 3-class sentiment classifier
    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-hi-en")
    sentiment_pipeline = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        tokenizer="cardiffnlp/twitter-roberta-base-sentiment"
    )

    # Sentiment summary: per-label counters and the display name for each raw label.
    # A label outside these three keys would raise KeyError in the loop below.
    sentiment_counts = {"LABEL_0": 0, "LABEL_1": 0, "LABEL_2": 0}
    sentiment_map = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }

    print("\n🔍 Agent-only Segment-wise Sentiment Analysis:\n")
    for segment in annotation["segments"]:
        speaker = segment["speaker_type"]
        text = segment["text"]

        # Only segments attributed to the agent are scored.
        if speaker.lower() != "agent":
            continue

        # Every agent segment is passed through the Hindi->English translator
        # unconditionally; there is no language check before translating.
        translated = translator(text)[0]["translation_text"]
        sentiment = sentiment_pipeline(translated)[0]

        sentiment_counts[sentiment["label"]] += 1

        print(f"{speaker}: {translated}")
        print(f"→ Sentiment: {sentiment_map[sentiment['label']]} (Score: {sentiment['score']:.2f})\n")

    # Summary
    print("\n📊 Agent Sentiment Summary:")
    for label, count in sentiment_counts.items():
        print(f"{sentiment_map[label]}: {count}")
