<a href="https://colab.research.google.com/github/sanjanasridharcs/covid-demographics/blob/master/Generate_GPT_Prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydub
!pip install praat-parselmouth

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.3


In [None]:
# setup stuff + definitions

# acoustic parameters
import parselmouth
import librosa
import numpy as np
import pickle
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

def analyze_wav_file(file_path, silence_threshold=45):
    """Summarise pitch, intensity and silence statistics of a WAV file.

    Args:
        file_path: Path of the audio file handed to ``parselmouth.Sound``.
        silence_threshold: Intensity level below which a frame is counted as
            silent. Defaults to 45, the value that used to be hard-coded.

    Returns:
        A dict of plain Python floats with the keys ``F0_average``,
        ``F0_variability``, ``F0_max``, ``F0_min``, ``intensity_variability``,
        ``intensity_max``, ``intensity_min`` and ``% silence``.
    """
    sound = parselmouth.Sound(file_path)

    # --- Pitch ---------------------------------------------------------
    f0_values = sound.to_pitch().selected_array['frequency']
    # NOTE(review): the mean and the variability are computed over every
    # frame, including frames whose frequency is 0, exactly as before. If
    # voiced-only statistics are wanted that is a separate change - confirm.
    f0_average = np.average(f0_values)
    f0_max = np.max(f0_values)
    # Population standard deviation (divide by N), same formula as the
    # previous hand-written loop.
    f0_variability = np.std(f0_values)

    # Smallest non-zero frequency. The old search was seeded with the first
    # frame, so a leading 0 could never be replaced and 0.0 was reported.
    nonzero_f0 = f0_values[f0_values != 0]
    f0_min = np.min(nonzero_f0) if nonzero_f0.size else 0.0

    # --- Intensity -----------------------------------------------------
    # Flatten to 1-D so every statistic below is a scalar; iterating over
    # the transposed 2-D array made the variability a 1-element array.
    intensity_values = np.ravel(sound.to_intensity().values)
    intensity_max = np.max(intensity_values)
    intensity_min = np.min(intensity_values)
    intensity_variability = np.std(intensity_values)

    # --- Silence -------------------------------------------------------
    # Share of intensity frames strictly below the threshold, in percent.
    percent_silence = np.mean(intensity_values < silence_threshold) * 100

    # Convert to built-in floats so the dict prints as plain numbers.
    return {
        "F0_average": float(f0_average),
        "F0_variability": float(f0_variability),
        "F0_max": float(f0_max),
        "F0_min": float(f0_min),
        "intensity_variability": float(intensity_variability),
        "intensity_max": float(intensity_max),
        "intensity_min": float(intensity_min),
        "% silence": float(percent_silence),
    }

# CREMA-D SNN model
# Fetched by Google Drive file id with gdown. No output name is passed, so
# the saved file name is whatever gdown derives from the Drive file; it is
# opened further down as "EmotionPredictionModel.pkl".
print("downloading CREMA D SNN model")
!gdown 12UGtFKxLara0JoOsY9_VLWumbZd_hYnW

# Anxiety SNN model
# Same mechanism; opened further down as "AnxietyPredictionModel.pkl".
print("downloading anxiety SNN model")
!gdown 1kOAlxmRpZB63P22LIzcP4_GvPUlmzDWg

# SNN model
class SimpleModel(nn.Module):
    """Feed-forward classifier with one hidden ReLU layer that returns logits.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.activation = torch.nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Not used by forward(); kept so the set of sub-modules is unchanged.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Return un-normalised class scores for a batch ``x``.

        Softmax is deliberately not applied here: the prediction helpers in
        this file call ``F.softmax`` on the result themselves, and the
        previous implementation computed a softmax only to discard it.
        """
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)

# Load the models
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# These files are downloaded with gdown earlier in this cell, so only run
# this when the Drive files are trusted.
# NOTE(review): presumably each pickle holds a SimpleModel instance, in which
# case the class has to be defined before this point - confirm.
with open("EmotionPredictionModel.pkl", 'rb') as f1:
  cremad_snn_model = pickle.load(f1)

with open("AnxietyPredictionModel.pkl", 'rb') as f2:
  anxiety_snn_model = pickle.load(f2)

# functions for CREMA-D
def wav_to_embedding(file_path, n_mfcc=13):
    """Return a fixed-length MFCC summary vector for an audio file.

    The file is loaded with librosa's defaults, ``n_mfcc`` MFCCs are
    computed, and the MFCC matrix is averaged so that a single mean value
    per coefficient row remains. No scaling or standardisation is applied.
    """
    samples, sample_rate = librosa.load(file_path)
    coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=n_mfcc)

    # Mean over axis 0 of the transposed matrix: one value per MFCC row.
    return np.mean(coefficients.T, axis=0)

def _snn_class_percentages(model, file_path):
    """Run ``model`` on the MFCC embedding of ``file_path``.

    Returns a NumPy array with one entry per model output: the softmax
    probability of that output multiplied by 100 and rounded to two decimals.
    """
    features = torch.tensor(wav_to_embedding(file_path), dtype=torch.float32)

    model.eval()  # inference mode for the forward pass
    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension the model is called with
        logits = model(features.unsqueeze(0))

    # The model output is treated as logits; [0] drops the batch dimension.
    probabilities = F.softmax(logits, dim=1)[0]
    return np.round(probabilities.numpy() * 100, 2)


def predict_CREMAD_SNN_sentiment(file_path):
    """Return per-class percentages from ``cremad_snn_model`` for a WAV file."""
    return _snn_class_percentages(cremad_snn_model, file_path)


def predict_anxiety_SNN_sentiment(file_path):
    """Return per-class percentages from ``anxiety_snn_model`` for a WAV file."""
    return _snn_class_percentages(anxiety_snn_model, file_path)


# CREMA-D CNN model
!gdown 1kmBX3qh9Ndzx81Mbp5zbLeaSEXnECHAE

# Anxiety CNN model
!gdown 1UrAvKQHndFNiaVDlAtAfPO24W9AU6kYp


# Function to pad width with silent space (zeroes)
def equalize_width(spectrogram, target_width):
    """Force a 2-D array to have exactly ``target_width`` columns.

    Narrower inputs get zero-filled columns appended on the right, wider
    inputs are cut down to their first ``target_width`` columns, and an input
    that already has the requested width is returned as-is (same object).

    NOTE(review): the callers in this file pass dB-scaled spectrograms made
    with ``ref=np.max``, where 0 is the peak level rather than silence, so the
    zero padding may not represent silence there - confirm.
    """
    rows, cols = spectrogram.shape[0], spectrogram.shape[1]

    if cols == target_width:
        return spectrogram

    if cols > target_width:
        return spectrogram[:, :target_width]

    filler = np.zeros((rows, target_width - cols))
    return np.hstack((spectrogram, filler))

# Default number of output classes for CNNModel.
num_classes = 6


class CNNModel(nn.Module):
    """Small CNN classifier: one conv/ReLU/max-pool stage and two linear layers.

    Args:
        input_channels: Number of channels of the input images.
        height: Input height used to size the first linear layer.
        width: Input width used to size the first linear layer.
        num_outputs: Number of output scores. When omitted, the module-level
            ``num_classes`` is used, as before.
    """

    def __init__(self, input_channels, height, width, num_outputs=None):
        super().__init__()
        if num_outputs is None:
            # Read the global at call time, matching the previous behaviour.
            num_outputs = num_classes

        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Push a dummy batch of one through the conv stack to learn how many
        # features a single sample has once flattened.
        with torch.no_grad():
            dummy_input = torch.zeros((1, input_channels, height, width))
            dummy_output = self.conv_layers(dummy_input)
        flattened_size = dummy_output[0].numel()

        self.fc_layers = nn.Sequential(
            nn.Linear(flattened_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_outputs),
        )

    def forward(self, x):
        """Return un-normalised class scores for a batch ``x``."""
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        return self.fc_layers(x)

# Load the models
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# These files are downloaded with gdown earlier in this cell, so only run
# this when the Drive files are trusted.
# NOTE(review): presumably each pickle holds a CNNModel instance, in which
# case the class has to be defined before this point - confirm.
with open("EmotionPredictionModelCNN.pkl", 'rb') as f3:
  cremad_cnn_model = pickle.load(f3)

# The file has to exist under exactly this name; the recorded output of this
# cell shows a FileNotFoundError raised for it.
with open("AnxietyPredictionModelCNN.pkl", 'rb') as f4:
  anxiety_cnn_model = pickle.load(f4)

def _predict_cnn_label(model, labels, filepath):
    """Return the entry of ``labels`` that ``model`` scores highest for a file.

    The audio is turned into a 128-band mel spectrogram, converted to dB
    relative to its maximum, padded or trimmed to 174 columns, and passed to
    the model as a single-sample, single-channel batch.
    """
    samples, sample_rate = librosa.load(filepath)
    mel = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=128)
    mel_db = equalize_width(librosa.power_to_db(mel, ref=np.max), 174)

    # (H, W) -> (1, 1, H, W): add the channel and batch dimensions.
    batch = torch.tensor(mel_db[np.newaxis, np.newaxis, :, :], dtype=torch.float32)

    model.eval()  # inference mode for the forward pass
    with torch.no_grad():
        scores = model(batch)

    return labels[torch.argmax(scores, dim=1).item()]


def predict_CREMAD_CNN_sentiment(filepath):
    """Return the emotion label predicted by ``cremad_cnn_model`` for a file."""
    # NOTE(review): the label order has to match the class indices used when
    # the model was trained; that cannot be checked from this file - confirm.
    emotions = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise"]
    return _predict_cnn_label(cremad_cnn_model, emotions, filepath)


def predict_anxiety_CNN_sentiment(filepath):
    """Return the emotion label predicted by ``anxiety_cnn_model`` for a file.

    Unlike before, the raw score tensor is no longer printed: that debug
    output ended up in front of the generated prompt text.
    """
    # NOTE(review): the label order has to match the class indices used when
    # the model was trained; that cannot be checked from this file - confirm.
    emotions = ['Angry', 'Anxious', 'Apologetic', 'Assertive', 'Concerned', 'Encouraging', 'Excited', 'Happy', 'Neutral', 'Sad']
    return _predict_cnn_label(anxiety_cnn_model, emotions, filepath)


def print_cremad(prediction):
    """Print one ``<label>: <value>%`` line per CREMA-D emotion.

    ``prediction`` is indexed by position; entry ``i`` is printed next to the
    ``i``-th label of the fixed list below.
    """
    emotion_names = ("Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise")
    for position, emotion in enumerate(emotion_names):
        score = prediction[position]
        print(emotion, ": ", score, "%", sep="")

def print_anxiety(prediction):
    """Print one ``<label>: <value>%`` line per anxiety-model emotion.

    ``prediction`` is indexed by position; entry ``i`` is printed next to the
    ``i``-th label of the fixed list below.
    """
    emotion_names = (
        'Angry', 'Anxious', 'Apologetic', 'Assertive', 'Concerned',
        'Encouraging', 'Excited', 'Happy', 'Neutral', 'Sad',
    )
    for position, emotion in enumerate(emotion_names):
        score = prediction[position]
        print(emotion, ": ", score, "%", sep="")

downloading CREMA D SNN model
Downloading...
From: https://drive.google.com/uc?id=12UGtFKxLara0JoOsY9_VLWumbZd_hYnW
To: /content/EmotionPredictionModel.pkl
100% 3.32k/3.32k [00:00<00:00, 17.1MB/s]
downloading anxiety SNN model
Downloading...
From: https://drive.google.com/uc?id=1kOAlxmRpZB63P22LIzcP4_GvPUlmzDWg
To: /content/AnxietyPredictionModel.pkl
100% 3.50k/3.50k [00:00<00:00, 14.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kmBX3qh9Ndzx81Mbp5zbLeaSEXnECHAE
To: /content/EmotionPredictionModelCNN.pkl
100% 91.2M/91.2M [00:00<00:00, 151MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UrAvKQHndFNiaVDlAtAfPO24W9AU6kYp
To: /content/AnxietyPredictionModelCNN_old.pkl
100% 91.2M/91.2M [00:01<00:00, 80.2MB/s]


FileNotFoundError: [Errno 2] No such file or directory: 'AnxietyPredictionModelCNN.pkl'

In [None]:
# ACTION: upload m4a file into Colab as "sample.m4a"
# (this is the name the conversion step below reads)

from pydub import AudioSegment

# convert m4a to wav
m4a_file = 'sample.m4a'
wav_file = 'input.wav'

audio = AudioSegment.from_file(m4a_file, format="m4a")
audio.export(wav_file, format="wav")

# Every analysis step reads the converted file through wav_file, so the
# path is defined in one place only.

# get acoustic parameters
acoustic_params = analyze_wav_file(wav_file)

# get emotion prediction via CREMA-D sentiment SNN
crema_sentiment_snn = predict_CREMAD_SNN_sentiment(wav_file)

# get emotion prediction via CREMA-D sentiment CNN
crema_sentiment_cnn = predict_CREMAD_CNN_sentiment(wav_file)

# get emotion prediction via anxiety sentiment SNN
anxiety_sentiment_snn = predict_anxiety_SNN_sentiment(wav_file)

# get emotion prediction via anxiety sentiment CNN
anxiety_sentiment_cnn = predict_anxiety_CNN_sentiment(wav_file)

# print GPT prompt
print("Please generate an anxiety score for a wav file with the following parameters:")
print()
print("The acoustic parameters are:")
print(acoustic_params)
print()
print("The emotion breakdown by percentage from a simple neural network model for the CREMA-D dataset is:")
print_cremad(crema_sentiment_snn)
print()
print("The emotion breakdown by percentage from a simple neural network model for the JL-corpus dataset is:")
print_anxiety(anxiety_sentiment_snn)
print()
print("The emotion prediction from a convolutional neural network model for the CREMA-D dataset is:")
print(crema_sentiment_cnn)
print()
# The CNN helper returns a single label, not percentages, so the heading
# says "prediction" like the CREMA-D CNN heading above.
print("The emotion prediction from a convolutional neural network model for the JL-corpus dataset is:")
print(anxiety_sentiment_cnn)

tensor([[0.0809, 0.0485, 0.0965, 0.1140, 0.0500, 0.0938, 0.0685, 0.0616, 0.0792,
         0.0712]])
Please generate an anxiety score for a wav file with the following parameters:

The acoustic parameters are:
{'F0_average': 89.84463727530301, 'F0_variability': 89.88775892216091, 'F0_max': 193.6008804646606, 'F0_min': 0.0, 'intensity_variability': array([13.73089022]), 'intensity_max': 75.18166924003457, 'intensity_min': 33.17294339227881, '% silence': 29.72972972972973}

The emotion breakdown by percentage from a simple neural network model for the CREMA-D dataset is:
Angry: 1.01%
Disgust: 0.0%
Fear: 96.68%
Happy: 2.31%
Sad: 0.0%
Surprise: 0.0%

The emotion breakdown by percentage from a simple neural network model for the JL-corpus dataset is:
Angry: 49.82%
Anxious: 5.12%
Apologetic: 0.0%
Assertive: 0.0%
Concerned: 2.93%
Encouraging: 0.01%
Excited: 41.95%
Happy: 0.17%
Neutral: 0.0%
Sad: 0.0%

The emotion prediction from a convolutional neural network model for the CREMA-D dataset is:
