# Voice AI Call Analysis
This notebook processes a customer call recording to extract:
- Transcript
- Speaker diarization
- Call sentiment
- One actionable business insight
- Bonus: identify sales rep vs customer


In [None]:
# Colab: install required packages. Run this cell first.
!apt-get update -qq
!apt-get install -y -qq ffmpeg  # audio handling
# Python packages
!pip install -q yt-dlp openai-whisper resemblyzer transformers torch torchvision torchaudio librosa pydub scikit-learn nltk


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

## Step 1: Install dependencies & download test audio
We use `yt-dlp` to download the audio from YouTube and convert it to 16kHz mono WAV for speech processing.


In [None]:
# (1) Download the test file and convert to 16k mono WAV
YOUTUBE_URL = "https://www.youtube.com/watch?v=4ostqJD3Psc"  # assignment file

# download audio using yt-dlp
!yt-dlp -x --audio-format wav --output "call.%(ext)s" "{YOUTUBE_URL}"

# convert to 16kHz mono and do a basic denoise/normalize pass
!ffmpeg -y -i call.wav -ar 16000 -ac 1 -af "highpass=f=200, lowpass=f=3000, dynaudnorm" call_16k.wav
# optional further denoise (afftdn is ffmpeg's spectral denoiser)
!ffmpeg -y -i call_16k.wav -af afftdn call_clean.wav || true


[youtube] Extracting URL: https://www.youtube.com/watch?v=4ostqJD3Psc
[youtube] 4ostqJD3Psc: Downloading webpage
[youtube] 4ostqJD3Psc: Downloading tv simply player API JSON
[youtube] 4ostqJD3Psc: Downloading tv client config
[youtube] 4ostqJD3Psc: Downloading player b66835e2-main
[youtube] 4ostqJD3Psc: Downloading tv player API JSON
[info] 4ostqJD3Psc: Downloading 1 format(s): 251
[download] Sleeping 2.00 seconds as required by the site...
[download] Destination: call.webm
[K[download] 100% of    1.99MiB in [1;37m00:00:00[0m at [0;32m7.16MiB/s[0m
[ExtractAudio] Destination: call.wav
Deleting original file call.webm (pass -k to keep)
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --

## Step 2: Transcription
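We use the Whisper **tiny** model to generate the transcript. Whisper copes reasonably well with noisy audio, but the tiny checkpoint trades accuracy for speed, so expect some recognition errors (for example "calling me son" in the output below).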


In [None]:

# (2) Speech-to-text with Whisper; the "tiny" checkpoint is chosen for speed.
import whisper

model = whisper.load_model("tiny")   # fastest option; swap in "base" if GPU/time allows
result = model.transcribe("call_clean.wav", language="en")
# The transcription result carries timed segments:
# a list of dicts with 'id', 'seek', 'start', 'end', 'text', ...
segments = result["segments"]

# Preview the first five segments as "start-end: text"
preview_lines = [
    f"{seg['start']:.2f}-{seg['end']:.2f}: {seg['text'].strip()}"
    for seg in segments[:5]
]
for line in preview_lines:
    print(line)


100%|█████████████████████████████████████| 72.1M/72.1M [00:01<00:00, 56.2MiB/s]


0.00-11.78: Thank you for calling me son. My name is Lauren. Can I have your name?
11.78-16.12: Yes, my name is John Smith. Thank you, John. How can I help you?
16.12-20.42: I was just calling about as she how much it would cost to update the map in my car.
20.42-24.06: I'd be happy to help you with that today. Did you receive a mail or from us?
24.06-26.56: I did. Do you need the customer number?


## Step 3: Speaker Diarization
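We compute Resemblyzer voice embeddings over 1.5 s sliding windows, cluster them into two speakers, and give each Whisper segment the dominant label among the windows that overlap it. Labels are assigned per Whisper segment, so a segment that contains speech from both people still receives a single label.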


In [None]:
# (3) Lightweight diarization: Resemblyzer embeddings + AgglomerativeClustering
import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav
from scipy.io import wavfile
from sklearn.cluster import AgglomerativeClustering

sr, audio = wavfile.read("call_clean.wav")
# Bring the samples to float32 in [-1, 1]. Integer PCM is scaled by the
# dtype's maximum value; floating-point data is only cast, because np.iinfo
# raises ValueError for float dtypes (e.g. a float64 WAV).
if np.issubdtype(audio.dtype, np.integer):
    audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
elif audio.dtype != np.float32:
    audio = audio.astype(np.float32)

# sliding windows to compute speaker embeddings
# NOTE(review): the encoder presumably expects 16 kHz input; the conversion
# cell resamples to 16 kHz -- confirm if the audio source ever changes.
encoder = VoiceEncoder()  # pretrained voice encoder
window_s = 1.5  # 1.5s windows
step_s = 0.75   # 50% overlap
windows = []
times = []      # (start_s, end_s) of each window, parallel to `windows`
n_samples = len(audio)
# Window starts run from 0 up to (duration - window_s); the 1e-6 keeps a window
# that ends exactly at the end of the audio from being dropped by rounding.
for start in np.arange(0, max(0, n_samples/sr - window_s + 1e-6), step_s):
    s = int(start * sr)
    e = int(min(n_samples, (start + window_s) * sr))
    windows.append(audio[s:e])
    times.append((start, (e / sr)))

# Clustering into two speakers needs at least two windows; fail early with a
# clear message instead of an obscure error from np.vstack / scikit-learn.
if len(windows) < 2:
    raise ValueError(
        f"Audio too short to diarize: got {len(windows)} analysis window(s) "
        f"of {window_s}s from {n_samples / sr:.2f}s of audio; need at least 2."
    )

# compute embeddings (this is the main cost)
embeds = [encoder.embed_utterance(w) for w in windows]  # one embedding vector per window

# cluster into 2 speakers (assignment expects 2 persons)
X = np.vstack(embeds)
clustering = AgglomerativeClustering(n_clusters=2).fit(X)
labels = clustering.labels_  # cluster id for each window, parallel to `times`

# helper to determine the dominant speaker label for an arbitrary time range
def speaker_for_segment(seg_start, seg_end, window_times=None, window_labels=None):
    """Return the speaker label that covers most of [seg_start, seg_end).

    Every analysis window that overlaps the range votes for its cluster label,
    and each vote is weighted by the overlap duration in seconds. A window that
    only touches the edge of the segment therefore counts for less than one
    lying fully inside it.

    Args:
        seg_start: Start of the range in seconds.
        seg_end: End of the range in seconds.
        window_times: Optional sequence of (start_s, end_s) pairs, one per
            window. Defaults to the notebook-level `times`.
        window_labels: Optional sequence of non-negative integer labels, one
            per window. Defaults to the notebook-level `labels`.

    Returns:
        The winning label as an int, or None when no window overlaps the range.
    """
    if window_times is None:
        window_times = times
    if window_labels is None:
        window_labels = labels
    window_labels = np.asarray(window_labels)

    idxs = []
    overlaps = []
    for i, (a, b) in enumerate(window_times):
        if b <= seg_start or a >= seg_end:
            continue  # window lies entirely before or after the range
        idxs.append(i)
        overlaps.append(min(b, seg_end) - max(a, seg_start))
    if not idxs:
        return None

    votes = window_labels[idxs]
    if sum(overlaps) > 0:
        tally = np.bincount(votes, weights=overlaps)
    else:
        # Zero-length range: every overlap is 0, so fall back to a plain count.
        tally = np.bincount(votes)
    return int(tally.argmax())

# Tag every Whisper segment (in place) with the speaker label for its time span
for segment in segments:
    segment.update(speaker=speaker_for_segment(segment['start'], segment['end']))

# Sanity check: show the first eight labelled segments
for idx in range(min(8, len(segments))):
    item = segments[idx]
    print(f"{item['start']:.1f}-{item['end']:.1f} [{item['speaker']}]: {item['text']}")


Loaded the voice encoder model on cpu in 0.01 seconds.
0.0-11.8 [1]:  Thank you for calling me son. My name is Lauren. Can I have your name?
11.8-16.1 [0]:  Yes, my name is John Smith. Thank you, John. How can I help you?
16.1-20.4 [0]:  I was just calling about as she how much it would cost to update the map in my car.
20.4-24.1 [0]:  I'd be happy to help you with that today. Did you receive a mail or from us?
24.1-26.6 [0]:  I did. Do you need the customer number?
26.6-27.6 [0]:  Yes, please.
27.6-30.6 [0]:  Okay. It's 15243.
30.6-33.6 [0]:  Thank you and the year making model of your vehicle.


In [None]:
from collections import defaultdict
import re

# (4) Conversation metrics: talk-time share, question count, longest monologue

# Largest pause (seconds) between two consecutive segments of the same speaker
# that still counts as one uninterrupted monologue.
MAX_MONOLOGUE_GAP_S = 1.0

# Per-speaker metrics only use segments that actually received a speaker label
# (a segment has none when no analysis window overlapped it).
labelled_segments = [seg for seg in segments if seg.get('speaker') is not None]

# talk time per speaker
speaker_time = defaultdict(float)
for seg in labelled_segments:
    speaker_time[seg['speaker']] += seg['end'] - seg['start']
total_speech = sum(speaker_time.values()) or 1.0  # "or 1.0" avoids division by zero
talk_time_ratio = {int(k): (v/total_speech)*100 for k,v in speaker_time.items()}

# number of questions: conservative estimate, counts '?' characters only
q_count = sum(seg['text'].count('?') for seg in segments)

# longest monologue: merge consecutive segments of same speaker (allow small gaps)
# Each entry is (speaker, start_s, end_s, duration_s).
monologues = []
if labelled_segments:
    first = labelled_segments[0]
    cur_sp, cur_start, cur_end = first['speaker'], first['start'], first['end']
    for seg in labelled_segments[1:]:
        if seg['speaker'] == cur_sp and (seg['start'] - cur_end) <= MAX_MONOLOGUE_GAP_S:
            cur_end = seg['end']  # same speaker keeps talking: extend the run
        else:
            monologues.append((cur_sp, cur_start, cur_end, cur_end-cur_start))
            cur_sp, cur_start, cur_end = seg['speaker'], seg['start'], seg['end']
    monologues.append((cur_sp, cur_start, cur_end, cur_end-cur_start))
# get longest (placeholder tuple when there is nothing to measure)
longest = max(monologues, key=lambda m: m[3], default=(None,0,0,0))

# print metrics
print("Talk-time ratio (%) per speaker:", talk_time_ratio)
print("Number of questions (detected via '?'):", q_count)
print("Longest monologue:", {"speaker": longest[0], "duration_s": longest[3]})


Talk-time ratio (%) per speaker: {1: 9.935897435897436, 0: 90.06410256410257}
Number of questions (detected via '?'): 7
Longest monologue: {'speaker': 0, 'duration_s': 106.78}


## Step 4: Sentiment Analysis
We analyze each speaker's utterances using a transformer-based sentiment model.


In [None]:
# (5) Sentiment with transformers pipeline (SST-2 model)
from transformers import pipeline
# Name the checkpoint and revision explicitly (the same ones the pipeline
# otherwise picks by default) so results do not change if the default does.
sent_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# collect text per speaker
from collections import defaultdict
speaker_text = defaultdict(str)
for seg in segments:
    sp = seg.get('speaker')
    if sp is None: continue  # unlabelled segments belong to no speaker
    speaker_text[sp] += " " + seg['text']

# get sentiment for each speaker
# The 4000-character cap bounds the input size, but that many characters can
# still exceed the model's token limit, so the tokenizer is also asked to
# truncate instead of failing on long calls.
speaker_sentiment = {}
for sp, text in speaker_text.items():
    shortened = text.strip()[:4000]
    if shortened:
        r = sent_pipe(shortened, truncation=True)
        speaker_sentiment[sp] = r  # list of dicts from pipeline

# derive overall call sentiment by counting positive/negative speakers
# (each speaker counts once; an equal count yields "neutral")
pos = neg = 0
for sp, res in speaker_sentiment.items():
    lab = res[0]['label']
    if lab.upper().startswith("POS"):
        pos += 1
    else:
        neg += 1
overall_sentiment = "positive" if pos>neg else ("negative" if neg>pos else "neutral")

print("Per-speaker sentiment:", speaker_sentiment)
print("Overall call sentiment:", overall_sentiment)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


Per-speaker sentiment: {1: [{'label': 'POSITIVE', 'score': 0.9980979561805725}], 0: [{'label': 'NEGATIVE', 'score': 0.812537670135498}]}
Overall call sentiment: neutral


## Step 5: Actionable Insight
We derive one key insight from the call metrics: talk-time ratio, number of questions asked, and overall sentiment.
Bonus: Identify sales rep vs customer — the speaker with the larger share of talk time is assumed to be the sales rep.


In [None]:
# (6) Simple heuristic insight + identify sales rep vs customer (bonus)
# Heuristics:
# - The speaker with the largest talk-time share is assumed to be the sales rep,
#   the one with the smallest share the customer.
# - If the assumed rep has >65% talk time -> rep dominated.
import json

REP_DOMINANCE_PCT = 65  # talk-time share (%) above which the rep "dominated"
MIN_QUESTIONS = 3       # fewer detected questions than this -> suggest more discovery

# identify assumed rep & customer
# Rank speakers by talk time and take the two ends of the ranking. Using
# max()/min() separately would return the same speaker for both roles when the
# shares are exactly tied.
speakers_by_talk = sorted(talk_time_ratio, key=talk_time_ratio.get, reverse=True)
rep_label = speakers_by_talk[0] if speakers_by_talk else None
cust_label = speakers_by_talk[-1] if len(speakers_by_talk) >= 2 else None

# insight rules: the first matching rule wins
insight = ""
if talk_time_ratio.get(rep_label,0) > REP_DOMINANCE_PCT:
    insight = "Sales rep dominated the call — try to ask more open questions and let the customer speak."
elif q_count < MIN_QUESTIONS:
    insight = "Few questions were asked. Increase discovery/open-ended questions to learn customer needs."
elif overall_sentiment == "negative":
    insight = "Call has negative sentiment. Coach on objection-handling & empathy."
else:
    insight = "Balanced call. Continue asking open ended questions and confirm next steps."

# final output object
# NOTE: this rebinds `result`, which the transcription cell used for the raw
# Whisper output; re-run that cell if the raw transcription is needed again.
result = {
    "talk_time_ratio": talk_time_ratio,
    "questions_detected": q_count,
    "longest_monologue": {"speaker": longest[0], "duration_s": longest[3]},
    "overall_sentiment": overall_sentiment,
    "per_speaker_sentiment": speaker_sentiment,
    "insight": insight,
    "assumed_sales_rep": int(rep_label) if rep_label is not None else None,
    "assumed_customer": int(cust_label) if cust_label is not None else None
}

# Persist the summary next to the notebook and echo it for inspection.
with open("call_analysis.json","w") as f:
    json.dump(result, f, indent=2)
print(json.dumps(result, indent=2))


{
  "talk_time_ratio": {
    "1": 9.935897435897436,
    "0": 90.06410256410257
  },
  "questions_detected": 7,
  "longest_monologue": {
    "speaker": 0,
    "duration_s": 106.78
  },
  "overall_sentiment": "neutral",
  "per_speaker_sentiment": {
    "1": [
      {
        "label": "POSITIVE",
        "score": 0.9980979561805725
      }
    ],
    "0": [
      {
        "label": "NEGATIVE",
        "score": 0.812537670135498
      }
    ]
  },
  "insight": "Sales rep dominated the call \u2014 try to ask more open questions and let the customer speak.",
  "assumed_sales_rep": 0,
  "assumed_customer": 1
}


# ✅ Final Results
- Transcript: ✔️
- Speakers separated: ✔️
- Sentiment detected: ✔️
- Actionable insight: ✔️
- Sales rep vs customer: ✔️
