# Git Repository Bringup

In [21]:
!git version

git version 2.34.1
time: 107 ms (started: 2023-09-28 16:48:31 +00:00)


In [22]:
!git clone https://github.com/ramwtz/data-science-project.git

fatal: destination path 'data-science-project' already exists and is not an empty directory.
time: 107 ms (started: 2023-09-28 16:48:32 +00:00)


In [23]:
!ls

data-science-project  drive  sample_data
time: 112 ms (started: 2023-09-28 16:48:32 +00:00)


In [24]:
%cd data-science-project/

/content/data-science-project
time: 7.5 ms (started: 2023-09-28 16:48:32 +00:00)


In [25]:
!pwd

/content/data-science-project
time: 107 ms (started: 2023-09-28 16:48:32 +00:00)


In [26]:
!git switch dev/ram

Already on 'dev/ram'
Your branch is up to date with 'origin/dev/ram'.
time: 109 ms (started: 2023-09-28 16:48:32 +00:00)


In [27]:
!git pull

Already up to date.
time: 415 ms (started: 2023-09-28 16:48:32 +00:00)


In [28]:
%cd ..

/content
time: 3.55 ms (started: 2023-09-28 16:48:33 +00:00)


# Basic Bringup for Remote Envrionment

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 4.62 s (started: 2023-09-28 16:48:37 +00:00)


In [61]:
# Necessary packages for this nb
!pip install --quiet --upgrade -r '/content/data-science-project/dependencies.txt'
!pip install pocketsphinx

Collecting pocketsphinx
  Downloading pocketsphinx-5.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.2/29.2 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sounddevice (from pocketsphinx)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, pocketsphinx
Successfully installed pocketsphinx-5.0.2 sounddevice-0.4.6
time: 28.9 s (started: 2023-09-28 17:21:15 +00:00)


# Initialize Env

In [107]:
# Imports
import speech_recognition as sr
from vosk import Model, KaldiRecognizer
import os
import librosa as lbrs
from tqdm import tqdm
import pandas as pd
import json

# Configs
%load_ext autotime
tqdm.pandas()

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 6.47 ms (started: 2023-09-28 18:14:56 +00:00)


In [64]:
gen_data_dir = os.path.join('drive', 'MyDrive', 'project', 'data')
calls_dir = os.path.join(gen_data_dir, 'calls')
new_data_dir = os.path.join(gen_data_dir, 'new')
models_path = os.path.join('drive', 'MyDrive', 'model')
metadata_path = os.path.join(calls_dir, '911_first6sec', '911_metadata_6sec.csv')

time: 1.18 ms (started: 2023-09-28 17:24:44 +00:00)


In [33]:
def get_path_from_filename(audio_file_name):
    type_dir, file_name = audio_file_name.split('/')
    return os.path.join(calls_dir, type_dir, file_name)

time: 578 µs (started: 2023-09-28 16:49:20 +00:00)


In [34]:
def create_audio_series(audio_file_name):
    audio_file_path = get_path_from_filename(audio_file_name)
    data, sample_rate = lbrs.load(audio_file_path)
    return data, sample_rate

time: 637 µs (started: 2023-09-28 16:49:20 +00:00)


# Fetching the Transcription
We're going to you the SpeechRecognition library to transform the audio to test to use as features for our model training

## Defs

In [52]:
r = sr.Recognizer()

time: 586 µs (started: 2023-09-28 17:19:03 +00:00)


In [223]:
def get_audio(audio_file_path, filter_noise=False):
  with sr.AudioFile(audio_file_path) as src:
    if filter_noise:
      r.adjust_for_ambient_noise(src, duration=0.5)
    return r.record(src)

time: 7.3 ms (started: 2023-09-28 20:31:10 +00:00)


In [65]:
def recognize_vosk(audio, model=None):
  if model is None:
    model_path = os.path.join(models_path, 'vosk', 'eng_small')

  vosk_audio = audio.get_raw_data(convert_rate=16000, convert_width=2)

  #
  model = Model(model_path)
  recognizer = KaldiRecognizer(model, 16000)
  recognizer.AcceptWaveform(vosk_audio)
  return recognizer.FinalResult()

time: 948 µs (started: 2023-09-28 17:24:55 +00:00)


In [55]:
def get_path_from_filename(audio_file_name):
    type_dir, file_name = audio_file_name.split('/')
    return os.path.join(calls_dir, type_dir, file_name)

time: 662 µs (started: 2023-09-28 17:19:04 +00:00)


In [224]:
def transcribe_file(filename, filter_noise=False):
  path = get_path_from_filename(filename)
  audio = get_audio(path, filter_noise=filter_noise)

  try:
    g = r.recognize_google(audio)
  except:
    g = "UNABLE TO RECOGNIZE"

  s = r.recognize_sphinx(audio)

  w = r.recognize_whisper(audio)

  vt = recognize_vosk(audio)
  v = json.loads(vt)['text']

  return g, s, w, v

time: 20.5 ms (started: 2023-09-28 20:31:32 +00:00)


## Raw Audio
We will first try to decipher the audio as is using four different speach to text engines: Google, Sphinx, Vosk & Whisper (all engines that have free APIs). The transcription of the text is: "Ma'am, my pickup was stolen, I had to go find it. ahh"

### Aggregate Transcriptions

In [37]:
metadata_df = pd.read_csv(metadata_path)

time: 584 ms (started: 2023-09-28 16:53:20 +00:00)


In [190]:
manual_transcriptions = {
    '911_first6sec/call_8_0.wav': "um hello hello is everything okay",
    '911_first6sec/call_9_0.wav': 'yes um i need a police officer over here at whats going on',
    '911_first6sec/call_10_0.wav': "ma'am my pickup was stolen I had to go find it uhh",
    '911_first6sec/call_11_0.wav': "no ma'am i don't i don't have an emergency two police officers just at my house just now",
    '911_first6sec/call_15_0.wav': "I'm Dylon Peterson and i just killed two people",
    '911_first6sec/call_17_0.wav': "yeah we got a fire down here at on route seven right here at abington",
    '911_first6sec/call_20_0.wav': "are you out of the house yet no alright get out of the house we can't theres a",
    '911_first6sec/call_21_0.wav': "yes is this a police station yes it is may i help you yeah okay i put this is ridiculous",
    '911_first6sec/call_28_0.wav': "can you please send rescue to i think i'm",
    '911_first6sec/call_31_0.wav': "oh my god somebody is shooting at the sheriff's department at the sheriff department yes building?"
}

manual_transcriptions = pd.Series(manual_transcriptions)
manual_transcriptions = manual_transcriptions.rename('manual_transcriptions')
manual_transcriptions = pd.DataFrame(manual_transcriptions).reset_index()
manual_transcriptions = manual_transcriptions.rename({'index': 'filename'}, axis=1)

time: 15.2 ms (started: 2023-09-28 20:13:06 +00:00)


In [192]:
filenames = pd.Series(manual_transcriptions['filename'])

0     911_first6sec/call_8_0.wav
1     911_first6sec/call_9_0.wav
2    911_first6sec/call_10_0.wav
3    911_first6sec/call_11_0.wav
4    911_first6sec/call_15_0.wav
5    911_first6sec/call_17_0.wav
6    911_first6sec/call_20_0.wav
7    911_first6sec/call_21_0.wav
8    911_first6sec/call_28_0.wav
9    911_first6sec/call_31_0.wav
Name: filename, dtype: object

time: 14.9 ms (started: 2023-09-28 20:13:36 +00:00)


In [140]:
transcriptions_series = filenames.progress_apply(transcribe_file)
column_names = {0: 'google', 1: 'sphinx', 2: 'whisper', 3: 'vosk'}
transcriptions_list = transcriptions_series.values.tolist()
transcriptions = pd.DataFrame(transcriptions_list).rename(column_names, axis=1)

100%|██████████| 10/10 [03:46<00:00, 22.65s/it]


Unnamed: 0,google,sphinx,whisper,vosk
0,hello hello is everything okay,and while the al and then again okay go ahead,Hello. Is everything okay? On.,ah now i'm now is everything okay
1,UNABLE TO RECOGNIZE,he had an idea of a pound and a coil of oil an...,"Yes, I need a police officer over here. What'...",that i'm anita police officer over here the co...
2,damn my pickup was born I had to go find it,no wonder the political had invited laugh,"Ma'am, my cat is a person. I had it offended.",now my pick up on to go find that ah
3,UNABLE TO RECOGNIZE,i'd really a man am i to have an latent he cou...,I'm gonna go to the bathroom and close the le...,remember my mom i got my house personality
4,I'm done Peterson and I just killed two people,glendon beard and that is killed two people that,I'm Joan Pearson and that has killed two people.,i'm i'm down there and that it killed few people
5,yeah we got a fire down here it's on Route 7 r...,the little hard and long little doubt and let ...,"We've got a far down here, it's around 7, rig...",there we go too far down here at our or out th...
6,are you out of the house yet no or I get out o...,we have our dead and now or i didn't have reac...,Are you out of the house yet? No. Or I get ou...,i get out of our now i get out of as we can't to
7,UNABLE TO RECOGNIZE,whether you're completely hemmed and looking f...,"Yes, it is. Please return. Yes, it is. May he...",i can't help you okay i put is ridiculous
8,can you please send rescue to I think I'm,and i knew her out and adam had,Can you please send rescue to... I think I'm ...,you keep him back to cook i think i'm having
9,oh my God somebody shooting at the sheriff's d...,so why when you hear more than that but i'd ha...,"Oh my God, so many students at this year's de...",oh look at how many shooting here for partner ...


time: 3min 46s (started: 2023-09-28 18:31:17 +00:00)


In [194]:
assessment = pd.DataFrame({"filename": filenames})
assessment = pd.concat([assessment, transcriptions], axis=1)

Unnamed: 0,filename,google,sphinx,whisper,vosk
0,911_first6sec/call_8_0.wav,hello hello is everything okay,and while the al and then again okay go ahead,Hello. Is everything okay? On.,ah now i'm now is everything okay
1,911_first6sec/call_9_0.wav,UNABLE TO RECOGNIZE,he had an idea of a pound and a coil of oil an...,"Yes, I need a police officer over here. What'...",that i'm anita police officer over here the co...
2,911_first6sec/call_10_0.wav,damn my pickup was born I had to go find it,no wonder the political had invited laugh,"Ma'am, my cat is a person. I had it offended.",now my pick up on to go find that ah
3,911_first6sec/call_11_0.wav,UNABLE TO RECOGNIZE,i'd really a man am i to have an latent he cou...,I'm gonna go to the bathroom and close the le...,remember my mom i got my house personality
4,911_first6sec/call_15_0.wav,I'm done Peterson and I just killed two people,glendon beard and that is killed two people that,I'm Joan Pearson and that has killed two people.,i'm i'm down there and that it killed few people
5,911_first6sec/call_17_0.wav,yeah we got a fire down here it's on Route 7 r...,the little hard and long little doubt and let ...,"We've got a far down here, it's around 7, rig...",there we go too far down here at our or out th...
6,911_first6sec/call_20_0.wav,are you out of the house yet no or I get out o...,we have our dead and now or i didn't have reac...,Are you out of the house yet? No. Or I get ou...,i get out of our now i get out of as we can't to
7,911_first6sec/call_21_0.wav,UNABLE TO RECOGNIZE,whether you're completely hemmed and looking f...,"Yes, it is. Please return. Yes, it is. May he...",i can't help you okay i put is ridiculous
8,911_first6sec/call_28_0.wav,can you please send rescue to I think I'm,and i knew her out and adam had,Can you please send rescue to... I think I'm ...,you keep him back to cook i think i'm having
9,911_first6sec/call_31_0.wav,oh my God somebody shooting at the sheriff's d...,so why when you hear more than that but i'd ha...,"Oh my God, so many students at this year's de...",oh look at how many shooting here for partner ...


time: 18.6 ms (started: 2023-09-28 20:14:28 +00:00)


In [195]:
assessment = assessment.merge(manual_transcriptions, left_on='filename', right_on='filename')

Unnamed: 0,filename,google,sphinx,whisper,vosk,manual_transcriptions
0,911_first6sec/call_8_0.wav,hello hello is everything okay,and while the al and then again okay go ahead,Hello. Is everything okay? On.,ah now i'm now is everything okay,um hello hello is everything okay
1,911_first6sec/call_9_0.wav,UNABLE TO RECOGNIZE,he had an idea of a pound and a coil of oil an...,"Yes, I need a police officer over here. What'...",that i'm anita police officer over here the co...,yes um i need a police officer over here at wh...
2,911_first6sec/call_10_0.wav,damn my pickup was born I had to go find it,no wonder the political had invited laugh,"Ma'am, my cat is a person. I had it offended.",now my pick up on to go find that ah,ma'am my pickup was stolen I had to go find it...
3,911_first6sec/call_11_0.wav,UNABLE TO RECOGNIZE,i'd really a man am i to have an latent he cou...,I'm gonna go to the bathroom and close the le...,remember my mom i got my house personality,no ma'am i don't i don't have an emergency two...
4,911_first6sec/call_15_0.wav,I'm done Peterson and I just killed two people,glendon beard and that is killed two people that,I'm Joan Pearson and that has killed two people.,i'm i'm down there and that it killed few people,I'm Dylon Peterson and i just killed two people
5,911_first6sec/call_17_0.wav,yeah we got a fire down here it's on Route 7 r...,the little hard and long little doubt and let ...,"We've got a far down here, it's around 7, rig...",there we go too far down here at our or out th...,yeah we got a fire down here at on route seven...
6,911_first6sec/call_20_0.wav,are you out of the house yet no or I get out o...,we have our dead and now or i didn't have reac...,Are you out of the house yet? No. Or I get ou...,i get out of our now i get out of as we can't to,are you out of the house yet no alright get ou...
7,911_first6sec/call_21_0.wav,UNABLE TO RECOGNIZE,whether you're completely hemmed and looking f...,"Yes, it is. Please return. Yes, it is. May he...",i can't help you okay i put is ridiculous,yes is this a police station yes it is may i h...
8,911_first6sec/call_28_0.wav,can you please send rescue to I think I'm,and i knew her out and adam had,Can you please send rescue to... I think I'm ...,you keep him back to cook i think i'm having,can you please send rescue to i think i'm
9,911_first6sec/call_31_0.wav,oh my God somebody shooting at the sheriff's d...,so why when you hear more than that but i'd ha...,"Oh my God, so many students at this year's de...",oh look at how many shooting here for partner ...,oh my god somebody is shooting at the sheriff'...


time: 17.8 ms (started: 2023-09-28 20:14:30 +00:00)


### Assess Transcriptions

In [220]:
from difflib import SequenceMatcher

def similar(row):
    if row['google'] == 'UNABLE TO RECOGNIZE':
      gr = 0
    else:
      gr = SequenceMatcher(None, row['google'], row['manual_transcriptions']).ratio()
    sr = SequenceMatcher(None, row['sphinx'], row['manual_transcriptions']).ratio()
    wr = SequenceMatcher(None, row['whisper'], row['manual_transcriptions']).ratio()
    vr = SequenceMatcher(None, row['vosk'], row['manual_transcriptions']).ratio()
    return gr, sr, wr, vr

time: 1.14 ms (started: 2023-09-28 20:25:33 +00:00)


In [227]:
scores_series = assessment.progress_apply(similar, axis=1)
column_names = {0: 'google', 1: 'sphinx', 2: 'whisper', 3: 'vosk'}
scores_list = scores_series.values.tolist()
scores = pd.DataFrame(scores_list).rename(column_names, axis=1)
scores.mean()

100%|██████████| 10/10 [00:00<00:00, 321.87it/s]


google     0.625686
sphinx     0.334570
whisper    0.682378
vosk       0.586600
dtype: float64

time: 53 ms (started: 2023-09-28 20:35:18 +00:00)


We can see that whisper has the best score, followed by google and vosk, while sphinx lags behind.

## Clean Audio

### Aggregate Transcriptions

In [228]:
transcriptions_series = filenames.progress_apply(transcribe_file, filter_noise=True)
column_names = {0: 'google', 1: 'sphinx', 2: 'whisper', 3: 'vosk'}
transcriptions_list = transcriptions_series.values.tolist()
transcriptions = pd.DataFrame(transcriptions_list).rename(column_names, axis=1)

100%|██████████| 10/10 [03:56<00:00, 23.69s/it]

time: 3min 56s (started: 2023-09-28 20:36:40 +00:00)





In [229]:
assessment = pd.DataFrame({"filename": filenames})
assessment = pd.concat([assessment, transcriptions], axis=1)

time: 2.34 ms (started: 2023-09-28 20:40:58 +00:00)


In [230]:
assessment = assessment.merge(manual_transcriptions, left_on='filename', right_on='filename')

time: 6.9 ms (started: 2023-09-28 20:41:01 +00:00)


In [231]:
assessment

Unnamed: 0,filename,google,sphinx,whisper,vosk,manual_transcriptions
0,911_first6sec/call_8_0.wav,put on hello hello is everything okay,and while canal in the making of the coup,Hello. Hello. Is everything okay? I'm.,now i'm now it's everything okay,um hello hello is everything okay
1,911_first6sec/call_9_0.wav,what's going on,and i had had a war what owen adam had,I need a police officer over here at 7. What'...,anita a police officer of the here the corner ...,yes um i need a police officer over here at wh...
2,911_first6sec/call_10_0.wav,damn my pickup was born I had to go find it,my own mind at football has invited up,"Ma'am, my kekep was born. I had to go find it.",well my pick up on a go find that ah,ma'am my pickup was stolen I had to go find it...
3,911_first6sec/call_11_0.wav,UNABLE TO RECOGNIZE,i'd really a man am i to have an latent he cou...,I'm gonna go to the bathroom and close the le...,remember my mom i got my house personality,no ma'am i don't i don't have an emergency two...
4,911_first6sec/call_15_0.wav,I'm downstairs and I just killed two people,one delegate in the uphill to people that,I'm Don Pearson and I just killed two people.,i'm down here and that it killed few people,I'm Dylon Peterson and i just killed two people
5,911_first6sec/call_17_0.wav,on Route 7 right here in Abington,god that the loaded on little doubt and let it,"We've got a far down here, it's on Route 7, r...",the far down here at or out thousand right her...,yeah we got a fire down here at on route seven...
6,911_first6sec/call_20_0.wav,get out of the house yet no I get out of the h...,that i'd have now or i didn't have you read th...,You're out of the house yet? No. Or I get out...,the out of as you know i get out of as we can't,are you out of the house yet no alright get ou...
7,911_first6sec/call_21_0.wav,UNABLE TO RECOGNIZE,i have heard of klingon camp but little more k...,"Yes, it is a police station. Yes, it is. May ...",i can't say help you okay i put the ridiculous,yes is this a police station yes it is may i h...
8,911_first6sec/call_28_0.wav,please send rescue to I think I'm,and i do her out and adam had,Please send a rescue to I think I'm having a,that do too i think i'm heading,can you please send rescue to i think i'm
9,911_first6sec/call_31_0.wav,oh my God somebody shooting at the sheriff's d...,i'm alive than he wanted identify have been he...,"Oh my God, some of these children that they s...",oh my god how many hidden a partner at the sha...,oh my god somebody is shooting at the sheriff'...


time: 30.6 ms (started: 2023-09-28 20:41:08 +00:00)


### Assess Transcriptions

In [233]:
scores_clean_series = assessment.progress_apply(similar, axis=1)
column_names = {0: 'google', 1: 'sphinx', 2: 'whisper', 3: 'vosk'}
scores_clean_list = scores_clean_series.values.tolist()
scores_clean = pd.DataFrame(scores_clean_list).rename(column_names, axis=1)
scores_clean.mean()

100%|██████████| 10/10 [00:00<00:00, 247.60it/s]


google     0.610000
sphinx     0.319726
whisper    0.741748
vosk       0.587776
dtype: float64

time: 65.6 ms (started: 2023-09-28 20:42:06 +00:00)


We essentially got the same ranking, with google and sphinx actually worsening as a result of the cleaning, vosk improving margiannly and whisper improving substantially.

Because Google is unreliable in producing a transcription, we will procced with whisper and vosk and check some of the different models available.

### Checking vosk & whisper models

In [None]:
r.recognize_google(audio_clean)

'damn my pickup was born I had to go find it'

In [None]:
r.recognize_sphinx(audio_clean)

'my own mind at football has invited up'

In [None]:
r.recognize_whisper(audio_clean, language='english')

" Ma'am, my kekep was born. I had to go find it."

In [None]:
model_path = _os.path.join('model', 'vosk', 'eng_small')
vosk_audio = audio_clean.get_raw_data(convert_rate=16000, convert_width=2)

#
model = Model(model_path)
recognizer = KaldiRecognizer(model, 16000)
recognizer.AcceptWaveform(vosk_audio);
recognizer.FinalResult()

'{\n  "text" : "well my pick up on had to go find that"\n}'

we can see that google hasn't been affected by the atttempted cleaning, with vosk doing a lateral move, and sphinx is just shit.
whisper has seen marginal improvement, yet not enough to rival google, lets try this with some heavier models:

Now lets try with heavier models with the original audio:

In [None]:
%%timeit
r.recognize_whisper(audio, model='large', language='english')

  @numba.jit


22.6 s ± 388 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='large', language='english')

" Ma'am, my pickup was stolen. I had to go find it. AGH!"

## VOSK

### regular audio / large model

In [None]:
%%timeit
vosk_rec(audio, 'eng_large')

35.9 s ± 13.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio, 'eng_large')

'{\n  "text" : "ma\'am my pickup was gone i had to go find it"\n}'

### regular audio / small model

In [None]:
%%timeit
vosk_rec(audio, 'eng_small')

2.04 s ± 16.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio, 'eng_small')

'{\n  "text" : "now my pick up on to go find that ah"\n}'

### regular audio / lgraph model

In [None]:
%%timeit
vosk_rec(audio, 'vosk-model-en-us-0.22-lgraph')

10.6 s ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio, 'vosk-model-en-us-0.22-lgraph')

'{\n  "text" : "ma\'am my pickup post gone i had to go find it"\n}'

### regular audio / giant model

In [None]:
%%timeit
vosk_rec(audio, 'vosk-model-en-us-0.42-gigaspeech')

48.3 s ± 2.45 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio, 'vosk-model-en-us-0.42-gigaspeech')

'{\n  "text" : "well my pickup was gone i had to go find that ah"\n}'

### clean audio / large model

In [None]:
%%timeit
vosk_rec(audio_clean, 'eng_large')

29.6 s ± 135 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio_clean, 'eng_large')

'{\n  "text" : "well my pickup was gone i had to go find it"\n}'

### clean audio / small model

In [None]:
%%timeit
vosk_rec(audio_clean, 'eng_small')

2.01 s ± 8.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio_clean, 'eng_small')

'{\n  "text" : "well my pick up on had to go find that ah"\n}'

### clean audio / lgraph model

In [None]:
%%timeit
vosk_rec(audio_clean, 'vosk-model-en-us-0.22-lgraph')

10.5 s ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio_clean, 'vosk-model-en-us-0.22-lgraph')

'{\n  "text" : "well my a post gone i to go find that"\n}'

### clean audio / giant model

In [None]:
%%timeit
vosk_rec(audio_clean, 'vosk-model-en-us-0.42-gigaspeech')

49.1 s ± 639 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
vosk_rec(audio_clean, 'vosk-model-en-us-0.42-gigaspeech')

'{\n  "text" : "well my pickup was gone i had to go find it"\n}'

## Whisper

### regular audio / tiny model

In [None]:
%%timeit
r.recognize_whisper(audio, model='tiny')

100%|█████████████████████████████████████| 72.1M/72.1M [00:10<00:00, 6.93MiB/s]


1.06 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='tiny')

' Now my character postponed. I had it offended.'

### regular audio / base model

In [None]:
%%timeit
r.recognize_whisper(audio, model='base')

2.38 s ± 55.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='base')

" Ma'am, my cat is a person. I had it offended."

### regular audio / small model

In [None]:
%%timeit
r.recognize_whisper(audio, model='small')

100%|███████████████████████████████████████| 461M/461M [00:53<00:00, 8.97MiB/s]


7.15 s ± 70.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='small')

" Ma'am, my cat got postponed. I had to go find it."

### regular audio / medium model

In [None]:
%%timeit
r.recognize_whisper(audio, model='medium')

100%|█████████████████████████████████████| 1.42G/1.42G [03:08<00:00, 8.11MiB/s]


21.7 s ± 302 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='medium')

" Ma'am, my paper was stolen. I had to go find it. Ah!"

### regular audio / large model

In [None]:
%%timeit
r.recognize_whisper(audio, model='large')

40.6 s ± 2.17 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio, model='large')

" Ma'am, my pickup was stolen. I had to go find it. AGH!"

### clean audio / tiny model

In [None]:
%%timeit
r.recognize_whisper(audio_clean, model='tiny')

3.77 s ± 1.24 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio_clean, model='tiny')

" then I'll check their post on I had it recorded"

### regular audio / base model

In [None]:
%%timeit
r.recognize_whisper(audio_clean, model='base')

2.59 s ± 31.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio_clean, model='base')

" Ma'am, my kekep was born. I had to go find it."

### regular audio / small model

In [None]:
%%timeit
r.recognize_whisper(audio_clean, model='small')

7.5 s ± 71.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio_clean, model='small')

" Ma'am, my kekka pushed on. I had to go find it."

### regular audio / medium model

In [None]:
%%timeit
r.recognize_whisper(audio_clean, model='medium')

22.5 s ± 94.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio_clean, model='medium')

" Ma'am, my pickup was stolen. I had to go find it. Ah!"

### regular audio / large model

In [None]:
%%timeit
r.recognize_whisper(audio_clean, model='large')

2min 26s ± 36.7 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r.recognize_whisper(audio_clean, model='large')

" Ma'am, my pickup was stolen. I had to go find it. Ahh!"

## lets try to reduce the noise

In [None]:
import noisereduce as nr
from scipy.io import wavfile

In [None]:
audio_file_32_path = _os.path.join('data', 'calls', '911_first6sec', 'call_32_0.wav')

In [None]:
# load data
rate, data = wavfile.read(audio_file_32_path)
print(rate, data)
# perform noise reduction
reduced_noise = nr.reduce_noise(y=data, sr=rate)
wavfile.write("mywav_reduced_noise.wav", rate, reduced_noise)

48000 [[    0     0]
 [    0     0]
 [    0     0]
 ...
 [10792 10799]
 [10601 10608]
 [10238 10243]]


MemoryError: Unable to allocate 129. GiB for an array with shape (288000, 60002) and data type float64

# Sentiment Analysis

### UTILS

In [None]:
from nltk.corpus import stopwords
import pandas as pd
import string
from nltk.tokenize import RegexpTokenizer

def tokenize(raw):
    tokenizer = RegexpTokenizer(r"\w+\'?\w*")
    return tokenizer.tokenize(raw)

def preprocess_words_fast(tokens):
    t = pd.Series(tokens).str.lower()
    t = t[~(t.isin(stopwords.words('english')) | t.isin(list(string.punctuation)))]
    return t.tolist()

In [None]:
raw = " Ma'am, my pickup was stolen. I had to go find it. Ahh!"
tokens = tokenize(raw)
words = preprocess_words_fast(tokens)
print(words)

["ma'am", 'pickup', 'stolen', 'go', 'find', 'ahh']


### vader

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# dirty
sid.polarity_scores(raw)

{'neg': 0.259, 'neu': 0.741, 'pos': 0.0, 'compound': -0.5411}

In [None]:
# clean
sid.polarity_scores(' '.join(words))

{'neg': 0.39, 'neu': 0.61, 'pos': 0.0, 'compound': -0.4939}

### textblob

In [None]:
from textblob import TextBlob

In [None]:
# dirty
testimonial = TextBlob(raw)
print(testimonial.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)


In [None]:
# clean
testimonial = TextBlob(' '.join(words))
print(testimonial.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)


### Flair

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

In [None]:
classifier = TextClassifier.load('en-sentiment')

In [None]:
sentence = Sentence(raw)
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

Sentence above is:  ['Sentence[16]: " Ma'am, my pickup was stolen. I had to go find it. Ahh!"'/'NEGATIVE' (0.9804)]


In [None]:
sentence = Sentence(' '.join(words))
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

Sentence above is:  ['Sentence[6]: "ma'am pickup stolen go find ahh"'/'NEGATIVE' (0.9505)]


### Transformers

In [None]:
from transformers import pipeline

In [None]:
classifier = pipeline("sentiment-analysis")

In [None]:
# dirty
classifier([raw])

[{'label': 'NEGATIVE', 'score': 0.9987456798553467}]

In [None]:
# clean
classifier([''.join(words)])

[{'label': 'NEGATIVE', 'score': 0.9751687049865723}]

# get the full sentiment table of all the audio

In [None]:
audio_file_csv = _os.path.join(audio_file_dir, '911_metadata.csv')
df = pd.read_csv(audio_file_csv)

In [None]:
import speech_recognition as sr

In [None]:
r = sr.Recognizer()
def get_text_from_audio(audio_file_name):
    dirr, name = audio_file_name.split('/')
    audio_file_path = _os.path.join('data', 'calls', dirr, name)
    with sr.AudioFile(audio_file_path) as src:
        r.adjust_for_ambient_noise(src, duration=0.5)
        audio = r.record(src)
        return r.recognize_whisper(audio, model='medium')

In [None]:
text_series = df['filename'][:100].apply(get_text_from_audio)
text_series

0                                My mom had a bad time.
1                     Hello. Hello. Is everything okay?
2      I need a police officer over here at 7th. Wha...
3      Ma'am, my pickup was stolen. I had to go find...
4      No ma'am I don't. I don't have an emergency. ...
                            ...                        
95     This is Gifford. Hello? Hello? Oh, number one...
96     across, forty seven oh nine fourty隔s avenue f...
97     I have, um, I need somebody escorted out of m...
98               Hello? Hello? Hello? Hello 911? Hello?
99     I got three zip-holes attacking people. Attac...
Name: filename, Length: 100, dtype: object

# lets use transformers pipeline to analyze sentiment

In [None]:
from transformers import pipeline

def get_sentiment_transformers(text):
    classifier = pipeline("sentiment-analysis")
    sent = classifier([text])
    return sent[0]['label']

In [None]:
sentiment_df = text_series.apply(get_sentiment_transformers)
sentiment_df

0     NEGATIVE
1     POSITIVE
2     NEGATIVE
3     NEGATIVE
4     NEGATIVE
        ...   
95    NEGATIVE
96    POSITIVE
97    NEGATIVE
98    NEGATIVE
99    NEGATIVE
Name: filename, Length: 100, dtype: object

In [None]:
full_df = pd.DataFrame([df['filename'][:100], text_series, sentiment_df]).T
full_df.columns=['filename', 'text', 'sentiment']
full_df

Unnamed: 0,filename,text,sentiment
0,911_first6sec/call_2_0.wav,My mom had a bad time.,NEGATIVE
1,911_first6sec/call_8_0.wav,Hello. Hello. Is everything okay?,POSITIVE
2,911_first6sec/call_9_0.wav,I need a police officer over here at 7th. Wha...,NEGATIVE
3,911_first6sec/call_10_0.wav,"Ma'am, my pickup was stolen. I had to go find...",NEGATIVE
4,911_first6sec/call_11_0.wav,No ma'am I don't. I don't have an emergency. ...,NEGATIVE
...,...,...,...
95,911_first6sec/call_110_0.wav,"This is Gifford. Hello? Hello? Oh, number one...",NEGATIVE
96,911_first6sec/call_111_0.wav,"across, forty seven oh nine fourty隔s avenue f...",POSITIVE
97,911_first6sec/call_112_0.wav,"I have, um, I need somebody escorted out of m...",NEGATIVE
98,911_first6sec/call_114_0.wav,Hello? Hello? Hello? Hello 911? Hello?,NEGATIVE


In [None]:
full_df[full_df['sentiment'] == 'NEGATIVE'].count()

filename     78
text         78
sentiment    78
dtype: int64

# lets use Vader pipeline to analyze sentiment

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_sentiment_vader(text):
    sid = SentimentIntensityAnalyzer()
    sent = sid.polarity_scores(text)
    compund = sent['compound']
    if compund > 1/3:
        return 'POSITIVE'
    elif compund < - 1/3:
        return 'NEGATIVE'
    else:
        return 'UNKOWN'

In [None]:
sentiment_df_vader = text_series.apply(get_sentiment_vader)
sentiment_df_vader

0     NEGATIVE
1       UNKOWN
2       UNKOWN
3     NEGATIVE
4       UNKOWN
        ...   
95      UNKOWN
96    NEGATIVE
97    NEGATIVE
98      UNKOWN
99    NEGATIVE
Name: filename, Length: 100, dtype: object

In [None]:
full_df_vader = pd.DataFrame([df['filename'][:100], text_series, sentiment_df_vader]).T
full_df_vader.columns=['filename', 'text', 'sentiment']
full_df_vader

Unnamed: 0,filename,text,sentiment
0,911_first6sec/call_2_0.wav,My mom had a bad time.,NEGATIVE
1,911_first6sec/call_8_0.wav,Hello. Hello. Is everything okay?,UNKOWN
2,911_first6sec/call_9_0.wav,I need a police officer over here at 7th. Wha...,UNKOWN
3,911_first6sec/call_10_0.wav,"Ma'am, my pickup was stolen. I had to go find...",NEGATIVE
4,911_first6sec/call_11_0.wav,No ma'am I don't. I don't have an emergency. ...,UNKOWN
...,...,...,...
95,911_first6sec/call_110_0.wav,"This is Gifford. Hello? Hello? Oh, number one...",UNKOWN
96,911_first6sec/call_111_0.wav,"across, forty seven oh nine fourty隔s avenue f...",NEGATIVE
97,911_first6sec/call_112_0.wav,"I have, um, I need somebody escorted out of m...",NEGATIVE
98,911_first6sec/call_114_0.wav,Hello? Hello? Hello? Hello 911? Hello?,UNKOWN


In [None]:
sentiment_df_vader[sentiment_df_vader == 'NEGATIVE'].count()

21

In [None]:
sentiment_df_vader[sentiment_df_vader == 'POSITIVE'].count()

20

In [None]:
import torch

In [None]:
print(torch.cuda.is_available())

False


In [None]:
%conda install -c conda-forge cudatoolkit=11.2 cudnn=8.1.0

^C

Note: you may need to restart the kernel to use updated packages.
