In [22]:
import speech_recognition as sr
from vosk import Model, KaldiRecognizer
import os as _os

In [23]:
audio_file_dir = _os.path.join('data', 'calls', '911_first6sec')
audio_file_path = _os.path.join(audio_file_dir, 'call_10_0.wav')

In [3]:
r = sr.Recognizer()

# We will first try to decipher the audio as is using four different speach to text engines: Google, Sphinx, Vosk & Whisper (all engines that have free APIs). The transcription of the text is: "Ma'am, my pickup was stolen, I had to go find it. ahh"

In [4]:
with sr.AudioFile(audio_file_path) as src:
    audio = r.record(src)

In [55]:
r.recognize_google(audio)

'damn my pickup was born I had to go find it'

In [56]:
r.recognize_sphinx(audio)

'no wonder the political had invited laugh'

In [57]:
r.recognize_whisper(audio)

" Ma'am, my cat is a person. I had it offended."

In [58]:
model_path = _os.path.join('model', 'vosk', 'eng_small')
vosk_audio = audio.get_raw_data(convert_rate=16000, convert_width=2)

#
model = Model(model_path)
recognizer = KaldiRecognizer(model, 16000)
recognizer.AcceptWaveform(vosk_audio);
recognizer.FinalResult()

'{\n  "text" : "now my pick up on to go find that ah"\n}'

# We can see that only google was remotely close, albeit only whisper was able to decipher the first word correctly. Now lets try cleaning the audio up:

In [13]:
with sr.AudioFile(audio_file_path) as src:
    r.adjust_for_ambient_noise(src, duration=0.5)    
    audio_clean = r.record(src)

In [59]:
r.recognize_google(audio_clean)

'damn my pickup was born I had to go find it'

In [60]:
r.recognize_sphinx(audio_clean)

'my own mind at football has invited up'

In [61]:
r.recognize_whisper(audio_clean, language='english')

" Ma'am, my kekep was born. I had to go find it."

In [62]:
model_path = _os.path.join('model', 'vosk', 'eng_small')
vosk_audio = audio_clean.get_raw_data(convert_rate=16000, convert_width=2)

#
model = Model(model_path)
recognizer = KaldiRecognizer(model, 16000)
recognizer.AcceptWaveform(vosk_audio);
recognizer.FinalResult()

'{\n  "text" : "well my pick up on had to go find that"\n}'

we can see that google hasn't been affected by the atttempted cleaning, with vosk doing a lateral move, and sphinx is just shit.
whisper has seen marginal improvement, yet not enough to rival google, lets try this with some heavier models:

Now lets try with heavier models with the original audio: 

In [15]:
%%timeit
r.recognize_whisper(audio, model='large', language='english')

  @numba.jit


22.6 s ± 388 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [64]:
r.recognize_whisper(audio, model='large', language='english')

" Ma'am, my pickup was stolen. I had to go find it. AGH!"

# VOSK

In [20]:
def vosk_rec(audio_data, model_name):
    model_path = _os.path.join('model', 'vosk', model_name)
    vosk_audio = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

    #
    model = Model(model_path)
    recognizer = KaldiRecognizer(model, 16000)
    recognizer.AcceptWaveform(vosk_audio);
    return recognizer.FinalResult()

### regular audio / large model

In [24]:
%%timeit
vosk_rec(audio, 'eng_large')

35.9 s ± 13.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
vosk_rec(audio, 'eng_large')

'{\n  "text" : "ma\'am my pickup was gone i had to go find it"\n}'

### regular audio / small model

In [22]:
%%timeit
vosk_rec(audio, 'eng_small')

2.04 s ± 16.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
vosk_rec(audio, 'eng_small')

'{\n  "text" : "now my pick up on to go find that ah"\n}'

### regular audio / lgraph model

In [26]:
%%timeit
vosk_rec(audio, 'vosk-model-en-us-0.22-lgraph')

10.6 s ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
vosk_rec(audio, 'vosk-model-en-us-0.22-lgraph')

'{\n  "text" : "ma\'am my pickup post gone i had to go find it"\n}'

### regular audio / giant model

In [28]:
%%timeit
vosk_rec(audio, 'vosk-model-en-us-0.42-gigaspeech')

48.3 s ± 2.45 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
vosk_rec(audio, 'vosk-model-en-us-0.42-gigaspeech')

'{\n  "text" : "well my pickup was gone i had to go find that ah"\n}'

### clean audio / large model

In [30]:
%%timeit
vosk_rec(audio_clean, 'eng_large')

29.6 s ± 135 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
vosk_rec(audio_clean, 'eng_large')

'{\n  "text" : "well my pickup was gone i had to go find it"\n}'

### clean audio / small model

In [32]:
%%timeit
vosk_rec(audio_clean, 'eng_small')

2.01 s ± 8.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
vosk_rec(audio_clean, 'eng_small')

'{\n  "text" : "well my pick up on had to go find that ah"\n}'

### clean audio / lgraph model

In [34]:
%%timeit
vosk_rec(audio_clean, 'vosk-model-en-us-0.22-lgraph')

10.5 s ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
vosk_rec(audio_clean, 'vosk-model-en-us-0.22-lgraph')

'{\n  "text" : "well my a post gone i to go find that"\n}'

### clean audio / giant model

In [36]:
%%timeit
vosk_rec(audio_clean, 'vosk-model-en-us-0.42-gigaspeech')

49.1 s ± 639 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
vosk_rec(audio_clean, 'vosk-model-en-us-0.42-gigaspeech')

'{\n  "text" : "well my pickup was gone i had to go find it"\n}'

# Whisper

### regular audio / tiny model

In [39]:
%%timeit
r.recognize_whisper(audio, model='tiny')

100%|█████████████████████████████████████| 72.1M/72.1M [00:10<00:00, 6.93MiB/s]


1.06 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
r.recognize_whisper(audio, model='tiny')

' Now my character postponed. I had it offended.'

### regular audio / base model

In [41]:
%%timeit
r.recognize_whisper(audio, model='base')

2.38 s ± 55.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
r.recognize_whisper(audio, model='base')

" Ma'am, my cat is a person. I had it offended."

### regular audio / small model

In [43]:
%%timeit
r.recognize_whisper(audio, model='small')

100%|███████████████████████████████████████| 461M/461M [00:53<00:00, 8.97MiB/s]


7.15 s ± 70.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
r.recognize_whisper(audio, model='small')

" Ma'am, my cat got postponed. I had to go find it."

### regular audio / medium model

In [45]:
%%timeit
r.recognize_whisper(audio, model='medium')

100%|█████████████████████████████████████| 1.42G/1.42G [03:08<00:00, 8.11MiB/s]


21.7 s ± 302 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
r.recognize_whisper(audio, model='medium')

" Ma'am, my paper was stolen. I had to go find it. Ah!"

### regular audio / large model

In [60]:
%%timeit
r.recognize_whisper(audio, model='large')

40.6 s ± 2.17 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
r.recognize_whisper(audio, model='large')

" Ma'am, my pickup was stolen. I had to go find it. AGH!"

### clean audio / tiny model

In [49]:
%%timeit
r.recognize_whisper(audio_clean, model='tiny')

3.77 s ± 1.24 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
r.recognize_whisper(audio_clean, model='tiny')

" then I'll check their post on I had it recorded"

### regular audio / base model

In [51]:
%%timeit
r.recognize_whisper(audio_clean, model='base')

2.59 s ± 31.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
r.recognize_whisper(audio_clean, model='base')

" Ma'am, my kekep was born. I had to go find it."

### regular audio / small model

In [53]:
%%timeit
r.recognize_whisper(audio_clean, model='small')

7.5 s ± 71.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
r.recognize_whisper(audio_clean, model='small')

" Ma'am, my kekka pushed on. I had to go find it."

### regular audio / medium model

In [55]:
%%timeit
r.recognize_whisper(audio_clean, model='medium')

22.5 s ± 94.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [56]:
r.recognize_whisper(audio_clean, model='medium')

" Ma'am, my pickup was stolen. I had to go find it. Ah!"

### regular audio / large model

In [59]:
%%timeit
r.recognize_whisper(audio_clean, model='large')

2min 26s ± 36.7 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
r.recognize_whisper(audio_clean, model='large')

" Ma'am, my pickup was stolen. I had to go find it. Ahh!"

# lets try to reduce the noise

In [63]:
import noisereduce as nr
from scipy.io import wavfile

In [64]:
audio_file_32_path = _os.path.join('data', 'calls', '911_first6sec', 'call_32_0.wav')

In [66]:
# load data
rate, data = wavfile.read(audio_file_32_path)
print(rate, data)
# perform noise reduction
reduced_noise = nr.reduce_noise(y=data, sr=rate)
wavfile.write("mywav_reduced_noise.wav", rate, reduced_noise)

48000 [[    0     0]
 [    0     0]
 [    0     0]
 ...
 [10792 10799]
 [10601 10608]
 [10238 10243]]


MemoryError: Unable to allocate 129. GiB for an array with shape (288000, 60002) and data type float64

# Sentiment Analysis

### UTILS

In [6]:
from nltk.corpus import stopwords
import pandas as pd
import string
from nltk.tokenize import RegexpTokenizer

def tokenize(raw):
    tokenizer = RegexpTokenizer(r"\w+\'?\w*")
    return tokenizer.tokenize(raw)

def preprocess_words_fast(tokens):
    t = pd.Series(tokens).str.lower()
    t = t[~(t.isin(stopwords.words('english')) | t.isin(list(string.punctuation)))]
    return t.tolist()

In [7]:
raw = " Ma'am, my pickup was stolen. I had to go find it. Ahh!"
tokens = tokenize(raw)
words = preprocess_words_fast(tokens)
print(words)

["ma'am", 'pickup', 'stolen', 'go', 'find', 'ahh']


### vader

In [96]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [122]:
# dirty
sid.polarity_scores(raw)

{'neg': 0.259, 'neu': 0.741, 'pos': 0.0, 'compound': -0.5411}

In [123]:
# clean
sid.polarity_scores(' '.join(words))

{'neg': 0.39, 'neu': 0.61, 'pos': 0.0, 'compound': -0.4939}

### textblob

In [124]:
from textblob import TextBlob

In [125]:
# dirty
testimonial = TextBlob(raw)
print(testimonial.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)


In [126]:
# clean
testimonial = TextBlob(' '.join(words))
print(testimonial.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)


### Flair

In [4]:
from flair.models import TextClassifier
from flair.data import Sentence

In [15]:
classifier = TextClassifier.load('en-sentiment')

In [16]:
sentence = Sentence(raw)
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

Sentence above is:  ['Sentence[16]: " Ma'am, my pickup was stolen. I had to go find it. Ahh!"'/'NEGATIVE' (0.9804)]


In [17]:
sentence = Sentence(' '.join(words))
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

Sentence above is:  ['Sentence[6]: "ma'am pickup stolen go find ahh"'/'NEGATIVE' (0.9505)]


### Transformers

In [10]:
from transformers import pipeline

In [18]:
classifier = pipeline("sentiment-analysis")

In [19]:
# dirty
classifier([raw])

[{'label': 'NEGATIVE', 'score': 0.9987456798553467}]

In [20]:
# clean
classifier([''.join(words)])

[{'label': 'NEGATIVE', 'score': 0.9751687049865723}]

# get the full sentiment table of all the audio

In [28]:
audio_file_csv = _os.path.join(audio_file_dir, '911_metadata.csv')
df = pd.read_csv(audio_file_csv)

In [30]:
import speech_recognition as sr

In [35]:
r = sr.Recognizer()
def get_text_from_audio(audio_file_name):
    dirr, name = audio_file_name.split('/')
    audio_file_path = _os.path.join('data', 'calls', dirr, name)
    with sr.AudioFile(audio_file_path) as src:
        r.adjust_for_ambient_noise(src, duration=0.5)    
        audio = r.record(src)
        return r.recognize_whisper(audio, model='medium')

In [77]:
text_series = df['filename'][:100].apply(get_text_from_audio)
text_series

0                                My mom had a bad time.
1                     Hello. Hello. Is everything okay?
2      I need a police officer over here at 7th. Wha...
3      Ma'am, my pickup was stolen. I had to go find...
4      No ma'am I don't. I don't have an emergency. ...
                            ...                        
95     This is Gifford. Hello? Hello? Oh, number one...
96     across, forty seven oh nine fourty隔s avenue f...
97     I have, um, I need somebody escorted out of m...
98               Hello? Hello? Hello? Hello 911? Hello?
99     I got three zip-holes attacking people. Attac...
Name: filename, Length: 100, dtype: object

# lets use transformers pipeline to analyze sentiment

In [78]:
from transformers import pipeline

def get_sentiment_transformers(text):
    classifier = pipeline("sentiment-analysis")
    sent = classifier([text])
    return sent[0]['label']

In [80]:
sentiment_df = text_series.apply(get_sentiment_transformers)
sentiment_df

0     NEGATIVE
1     POSITIVE
2     NEGATIVE
3     NEGATIVE
4     NEGATIVE
        ...   
95    NEGATIVE
96    POSITIVE
97    NEGATIVE
98    NEGATIVE
99    NEGATIVE
Name: filename, Length: 100, dtype: object

In [86]:
full_df = pd.DataFrame([df['filename'][:100], text_series, sentiment_df]).T
full_df.columns=['filename', 'text', 'sentiment']
full_df

Unnamed: 0,filename,text,sentiment
0,911_first6sec/call_2_0.wav,My mom had a bad time.,NEGATIVE
1,911_first6sec/call_8_0.wav,Hello. Hello. Is everything okay?,POSITIVE
2,911_first6sec/call_9_0.wav,I need a police officer over here at 7th. Wha...,NEGATIVE
3,911_first6sec/call_10_0.wav,"Ma'am, my pickup was stolen. I had to go find...",NEGATIVE
4,911_first6sec/call_11_0.wav,No ma'am I don't. I don't have an emergency. ...,NEGATIVE
...,...,...,...
95,911_first6sec/call_110_0.wav,"This is Gifford. Hello? Hello? Oh, number one...",NEGATIVE
96,911_first6sec/call_111_0.wav,"across, forty seven oh nine fourty隔s avenue f...",POSITIVE
97,911_first6sec/call_112_0.wav,"I have, um, I need somebody escorted out of m...",NEGATIVE
98,911_first6sec/call_114_0.wav,Hello? Hello? Hello? Hello 911? Hello?,NEGATIVE


In [87]:
full_df[full_df['sentiment'] == 'NEGATIVE'].count()

filename     78
text         78
sentiment    78
dtype: int64

# lets use Vader pipeline to analyze sentiment

In [93]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_sentiment_vader(text):
    sid = SentimentIntensityAnalyzer()    
    sent = sid.polarity_scores(text)
    compund = sent['compound']
    if compund > 1/3:
        return 'POSITIVE'
    elif compund < - 1/3:
        return 'NEGATIVE'
    else:
        return 'UNKOWN'

In [94]:
sentiment_df_vader = text_series.apply(get_sentiment_vader)
sentiment_df_vader

0     NEGATIVE
1       UNKOWN
2       UNKOWN
3     NEGATIVE
4       UNKOWN
        ...   
95      UNKOWN
96    NEGATIVE
97    NEGATIVE
98      UNKOWN
99    NEGATIVE
Name: filename, Length: 100, dtype: object

In [99]:
full_df_vader = pd.DataFrame([df['filename'][:100], text_series, sentiment_df_vader]).T
full_df_vader.columns=['filename', 'text', 'sentiment']
full_df_vader

Unnamed: 0,filename,text,sentiment
0,911_first6sec/call_2_0.wav,My mom had a bad time.,NEGATIVE
1,911_first6sec/call_8_0.wav,Hello. Hello. Is everything okay?,UNKOWN
2,911_first6sec/call_9_0.wav,I need a police officer over here at 7th. Wha...,UNKOWN
3,911_first6sec/call_10_0.wav,"Ma'am, my pickup was stolen. I had to go find...",NEGATIVE
4,911_first6sec/call_11_0.wav,No ma'am I don't. I don't have an emergency. ...,UNKOWN
...,...,...,...
95,911_first6sec/call_110_0.wav,"This is Gifford. Hello? Hello? Oh, number one...",UNKOWN
96,911_first6sec/call_111_0.wav,"across, forty seven oh nine fourty隔s avenue f...",NEGATIVE
97,911_first6sec/call_112_0.wav,"I have, um, I need somebody escorted out of m...",NEGATIVE
98,911_first6sec/call_114_0.wav,Hello? Hello? Hello? Hello 911? Hello?,UNKOWN


In [97]:
sentiment_df_vader[sentiment_df_vader == 'NEGATIVE'].count()

21

In [98]:
sentiment_df_vader[sentiment_df_vader == 'POSITIVE'].count()

20