<a href="https://colab.research.google.com/github/rsidorchuk93/audio/blob/main/audio_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio processing

In [5]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# check that GPU is available
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch.cuda.is_available()

False

In [7]:
# install relevant libraries
!pip install transformers pydub librosa resampy -q

## Upload audio files and explore them

In [8]:
import os
import librosa
from IPython.display import Audio

folder = '/content/drive/My Drive/test/audio'

file_list = os.listdir(folder)
for file_name in file_list:
    file_path = os.path.join(folder, file_name)
    audio_data, sampling_rate = librosa.load(file_path)
    duration_sec = len(audio_data) / sampling_rate
    duration_min = duration_sec // 60
    duration_sec = duration_sec % 60
    print(f"{file_name} ({int(duration_min):02d}:{int(duration_sec):02d} minutes)")
    Audio(audio_data, rate=sampling_rate)

me_stakeholder_collaboration_framework.ogg (05:01 minutes)
me_stakeholder_example_2 (04:05 minutes)
me_stakeholder_example_1 (06:38 minutes)


In [9]:
# play first 10 seconds of one of the files 
audio_data, sampling_rate = librosa.load(file_path, duration=10)
Audio(audio_data, rate=sampling_rate)

## Transcribe with Open AI Whisper transformer

In [10]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small.en")  # "openai/whisper-tiny.en"
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small.en")  

# set sampling rate and chunk size
sampling_rate = 16000
chunk_size = 160000 * 3  # 30 seconds with 16K sampling; Whisper can process the audio files in 30-second chunks

In [11]:
# Transcribe each file in the folder
results = []

for file_name in file_list:
    # read audio file
    audio_file = os.path.join(folder, file_name)
    input_signal, original_sampling_rate = torchaudio.load(audio_file)

    # get the number of frames in the audio file
    num_frames = input_signal.shape[1]

    # resample audio data
    resampler = torchaudio.transforms.Resample(original_sampling_rate, sampling_rate)
    input_signal = resampler(input_signal)

    # split audio into chunks
    audio_chunks = []
    for i in range(0, input_signal.shape[-1], chunk_size):
        if i + chunk_size < input_signal.shape[-1]:
            audio_chunks.append(input_signal[:, i:i+chunk_size])
        else:
            audio_chunks.append(input_signal[:, i:])

    # transcribe each chunk and concatenate results
    transcription = ""
    for chunk in audio_chunks:
        # preprocess audio
        input_features = processor(chunk.numpy()[0], sampling_rate=sampling_rate, return_tensors="pt").input_features

        # generate token ids
        predicted_ids = model.generate(input_features)

        # decode token ids to text
        chunk_transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        transcription += chunk_transcription[0]
    
    # calculate audio length
    audio_length_sec = num_frames / original_sampling_rate
    audio_length_min = audio_length_sec / 60
    audio_length_str = f"{int(audio_length_min):02d}:{int(audio_length_sec % 60):02d}"

    # save results for this file
    result = {'file_name': file_name, 'audio_length': audio_length_str, 'transcription': transcription}
    results.append(result)



In [12]:
# print transcribed text
results

[{'file_name': 'me_stakeholder_collaboration_framework.ogg',
  'audio_length': '05:01',
  'transcription': " When it comes to overall framework, there are three main aspects. The first is understanding why building relationships is important. Second is how to build relationships long-term. And third is how to deal with exact situation. First, it's important to realize that building trust and relationships with multiple stakeholders is very important. in part of the job, on par with technical skills. And this is especially important in more senior and executive roles. And this is also important, very important for data science roles, which unlike engineering, don't usually own their product. And also they are responsible for measurement. And it's very important to explain those nuances simultaneously to technical stakeholders in engineering and also non-technical stakeholders. for example, product managers or business stakeholders. Second is how do I actually approach building relations

In [16]:
import string

# Define a function to calculate words per minute
def calculate_wpm(transcription, audio_length_min):
    # Remove punctuation and split into words
    words = transcription.translate(str.maketrans('', '', string.punctuation)).split()

    # Calculate number of words per minute
    num_words = len(words)
    wpm = num_words / audio_length_min

    return wpm

# Analyze results and calculate words per minute
for result in results:
    file_name = result['file_name']
    audio_length = result['audio_length']
    transcription = result['transcription']

    # Calculate words per minute
    audio_length_min = int(audio_length.split(':')[0]) + int(audio_length.split(':')[1])/60
    wpm = calculate_wpm(transcription, audio_length_min)

    print(f"{file_name}: {wpm:.0f} words per minute")

me_stakeholder_collaboration_framework.ogg: 142 words per minute
me_stakeholder_example_2: 151 words per minute
me_stakeholder_example_1: 142 words per minute


## Recognize sentiment

In [20]:
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
from nltk.tokenize import sent_tokenize

# Load model and tokenizer
model_name_or_path = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name_or_path)
model = DistilBertForSequenceClassification.from_pretrained(model_name_or_path)


In [22]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [35]:
# Load results and extract transcription
transcription = results[0]['transcription']

# Preprocess transcription
transcription = transcription.lower() # Convert to lowercase

# Split transcription into sentences
sentences = sent_tokenize(transcription)

# Tokenize and predict sentiment for each sentence
sentiment_scores = []
for sentence in sentences:
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**inputs)
    sentiment_score = outputs.logits.softmax(1).detach().numpy()
    sentiment_scores.append(sentiment_score[0])

sentiment_scores = np.array(sentiment_scores)

# Find indices of most positive and most negative sentiment
most_positive_idx = np.argmax(sentiment_scores[:, 1])
most_negative_idx = np.argmin(sentiment_scores[:, 1] - sentiment_scores[:, 0])

# Display most positive and most negative sentences and their sentiment scores
print("Most positive sentence:", sentences[most_positive_idx])
print("Sentiment score (positive):", sentiment_scores[most_positive_idx, 1])
print("Most negative sentence:", sentences[most_negative_idx])
print("Sentiment score (negative):", sentiment_scores[most_negative_idx, 0])

Most positive sentence: getting to know this person, this is important because for each of these things this will have impact in terms of how i actually interact with that person and if there is some similarities, for example same cultural background or similar professional goals, this is also something that can help to build trust and relationship.
Sentiment score (positive): 0.9996433
Most negative sentence: because if it's a small problem, maybe you can disregard it.
Sentiment score (negative): 0.9996369
