# Installation and Import of required libraries

In [None]:
# Import required libraries
import openai
import os
import json
import warnings
import whisper
import gradio as gr
import numpy as np
import pandas as pd
import librosa 
import librosa.display
import IPython.display as ipd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from gtts import gTTS
from tqdm import tqdm_notebook as tqdm
from flask import request
from scipy.io import wavfile as wav
from sklearn import metrics
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

# Prerequisite 

In [None]:
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings(action="ignore")

In [None]:
# OpenAI API Key
with open("OPENAI_API_KEY.json") as f:
    secrets = json.load(f)
    openai_key = secrets["openai"]["key"]

In [None]:
# Authenticate with OpenAI API using API key
openai.api_key = openai_key

In [None]:
# Load Whisper's "base" checkpoint; `model` is the speech-to-text model used by transcribe()
model = whisper.load_model("base")

# Display the device the loaded model is placed on
model.device

In [None]:
# Create a temporary mp3 file for storing audio data if it does not exist already
##!../anaconda3/bin/ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

# Define function for transcribing user's voice input using OpenAI's Whisper

In [None]:
# Speech-to-text for the user's recording, using the Whisper model loaded above
def transcribe(audio):
    """Return the text Whisper transcribes from an audio file.

    Args:
        audio: Path to the recording, passed to ``whisper.load_audio``.

    Returns:
        The ``"text"`` field of the Whisper transcription result.
    """
    # pad_or_trim fits the waveform to Whisper's fixed 30-second window
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    return model.transcribe(waveform)["text"]

# Define function for generating text response using OpenAI's GPT

In [None]:
# Define function for generating text response using OpenAI's GPT
def generate_response(prompt):
    """Return the chat model's reply to ``prompt``.

    A fixed system message sets the assistant persona; ``prompt`` is sent as
    the single user message.

    Args:
        prompt: Text to send as the user message. Must be non-empty.

    Returns:
        The content of the first completion choice.

    Raises:
        ValueError: If ``prompt`` is empty or ``None``. Previously this path
            fell through to an unbound ``chat`` variable and failed with
            UnboundLocalError.
    """
    if not prompt:
        raise ValueError("prompt must be a non-empty string")

    messages = [
        # System defined role
        {"role": "system", "content": "You are a friendly and helpful AI voice assistant. Generate text responses when provided with a chat history, your text will be used as the answer using text-to-speech API"},
        {"role": "user", "content": prompt},
    ]

    chat = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    return chat.choices[0].message.content

# MFCCs Extraction for Accent Detection

In [None]:
# Path to a sample recording used to sanity-check loading and plotting
filename = 'accentdb_extended/data/american/speaker_01/american_s01_676.wav'

# Load the sample with librosa (returns the samples and the sample rate)
Librosa_data, Librosa_sample_rate = librosa.load(filename)

# Display the waveform of the sample American-accent audio.
# The figure is created *before* plotting so that figsize applies to the
# waveform; creating it afterwards left the plot at default size and opened
# a second, empty figure.
plt.figure(figsize = (14,5))
librosa.display.waveshow(Librosa_data, sr=Librosa_sample_rate)

# Inline audio player for the same file
ipd.Audio(filename)

In [None]:
# Librosa converts the signal to mono, meaning the channel will always be 1
print('Librosa sample rate = ', Librosa_sample_rate)
print('Mono Audio of Librosa_data:', Librosa_data)
# Read the same file with scipy for comparison; wav.read returns (sample rate, samples)
wave_sample_rate, wave_audio = wav.read(filename)
wave_audio

In [None]:
# Original audio
# Plot the samples as read by scipy
plt.figure(figsize=(12, 4))
plt.plot(wave_audio)

In [None]:
# Compute 20 MFCCs for the sample and inspect the resulting array's shape and type
mfccs = librosa.feature.mfcc(y=Librosa_data, sr=Librosa_sample_rate, n_mfcc=20)
print(mfccs.shape)
type(mfccs)

In [None]:
# Visualize the MFCC matrix over time
librosa.display.specshow(mfccs, sr=Librosa_sample_rate, x_axis='time')

In [None]:
# Collect the accent label and file path of every audio file in both datasets
def feature_extractor():
    """Return ``[accent, path]`` pairs for all audio files in the datasets.

    Both dataset roots are walked as ``<root>/<accent>/<speaker>/<file>``;
    the accent folder name is used as the label. Entries from
    ``accentdb_extended`` come first, followed by ``accentdb_core``.

    Returns:
        A list of two-element lists ``[accent_folder_name, file_path]``.
    """
    roots = ('accentdb_extended/data', 'accentdb_core/data')
    # List both roots before walking either one, so a missing root fails up front
    accents_by_root = [(root, os.listdir(root)) for root in roots]

    speaker_files = []
    for root, accents in accents_by_root:
        for accent in accents:
            for speaker in os.listdir(os.path.join(root, accent)):
                speaker_dir = os.path.join(root, accent, speaker)
                speaker_files.extend(
                    [accent, os.path.join(speaker_dir, audio)]
                    for audio in os.listdir(speaker_dir)
                )

    return speaker_files

# Build the [accent, path] listing once and show its first entry
f = feature_extractor()
f[0]

In [None]:
# MFCC extraction for a single audio file (applied to every file in the dataset)
def extract_mfcc_features(audio_path):
    """Load ``audio_path`` with librosa and return its 20-coefficient MFCCs.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with ``n_mfcc=20``.
    """
    waveform, rate = librosa.load(audio_path)
    return librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=20)

In [None]:
# Tabulate the listing: one row per file, with its accent label and path
audio_data = pd.DataFrame(f, columns=['Speaker', 'audio_path'])

# Quick look at the first rows, the table size and the distinct accent labels
for summary in (audio_data.head(), audio_data.shape, audio_data['Speaker'].unique()):
    print(summary)

In [None]:
%%time

# Compute the MFCC matrix of every recording, paired with its accent label
speaker_files = feature_extractor()
file_path = audio_data['audio_path'].tolist()
audio_name = audio_data['Speaker'].tolist()
class_number = audio_data['Speaker'].unique()

data_extracted = []
for path, label in zip(file_path, audio_name):
    mfccs = extract_mfcc_features(path)
    data_extracted.append([mfccs, label])

data = pd.DataFrame(data_extracted, columns=['Features', 'audio_name'])

In [None]:
# Preview the extracted MFCC matrices alongside their accent labels
data.head()

In [None]:
# Check the shape of the first file's MFCC matrix
X = data['Features'].tolist()
print(X[0].shape)

In [None]:
# Build the independent variables: average each MFCC matrix over its
# second axis so every file becomes one fixed-length vector
X = np.asarray(
    [np.mean(mfcc_matrix.T, axis=0) for mfcc_matrix in data['Features'].tolist()]
)
print(X.shape)

In [None]:
# One-hot encode the accent labels (dependent variable).
# get_dummies emits one column per distinct label, in sorted label order.
y_name = [label for label, _path in f]
y = np.array(pd.get_dummies(y_name, dtype=float))
print(y.shape)

In [None]:
# Standardization of x_data (independent variables).
# The fitted scaler is kept in `scaler` so the identical transform can be
# applied to new audio at inference time; previously it was discarded.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics leak into the training features.
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
X[0]

In [None]:
### Train/test split: hold out 30% of the samples, with a fixed seed for reproducibility

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)
print(f'Train set: {X_train.shape} {y_train.shape}')
print(f'Test set: {X_test.shape} {y_test.shape}')

# ANN Modelling (4 layer Sequential)

In [None]:
# Record the TensorFlow version used for this run
print(tf.__version__)

In [None]:
### Number of classes = number of one-hot columns in y
num_labels=y.shape[1]

In [None]:
# Fully connected classifier: three hidden ReLU blocks (100 -> 200 -> 100 units),
# each followed by 50% dropout, then a softmax over the accent classes
ann_model = Sequential([
    # first layer: takes the 20-value feature vector
    Dense(100, input_shape=(20,)),
    Activation('relu'),
    Dropout(0.5),
    # second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # final layer: one output per class
    Dense(num_labels),
    Activation('softmax'),
])

In [None]:
# Print the layer-by-layer architecture of the model
ann_model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot labels; track accuracy, optimize with Adam
ann_model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [None]:
## Training the ANN model

num_epochs = 200
num_batch_size = 32

# Save the model to disk each time the monitored validation metric improves.
# `patience` is an EarlyStopping option, not a ModelCheckpoint one (it is
# either ignored or rejected depending on the Keras version), so it is no
# longer passed here.
checkpointer = ModelCheckpoint(filepath='accent_classification.hdf5',
                               verbose=1, save_best_only=True)

start = datetime.now()

# NOTE(review): the test split is also used as validation data, so checkpoint
# selection is influenced by the same samples used for the final evaluation.
history = ann_model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
# Loss and accuracy of the in-memory model (state after the last epoch) on the held-out split
ann_model.evaluate(X_test, y_test)

In [None]:
# Evaluation of ANN model

# Plot training vs. validation curves: first accuracy, then loss
curves = (
    ('accuracy', 'val_accuracy', 'Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for train_key, val_key, title, y_label in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train','Validation'], loc = 'upper left')
    plt.show()

In [None]:
# Confusion Matrix inputs

# Class probabilities for every sample of the X_test dataset
prediction_ANN = ann_model.predict(X_test)
# Reduce each probability row to the index of its most likely class
prediction_ANN_rounded = list(np.argmax(prediction_ANN, axis=1))
prediction_ANN_rounded[0]
# Same reduction for the one-hot ground truth
y_test_index = list(np.argmax(y_test, axis=1))

In [None]:
# Confusion Matrix - verify accuracy of each class
# Rows are true class indices, columns are predicted class indices
cm = tf.math.confusion_matrix(labels = y_test_index, predictions = prediction_ANN_rounded)
plt.figure(figsize = (10,7))
# Annotate each cell with its integer count
sns.heatmap(cm,annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [None]:
# Map a predicted class index back to its accent name.
# The one-hot labels were built with pd.get_dummies, which orders its columns
# by sorted label, so the lookup list must be alphabetical; the previous
# hand-typed order decoded most indices to the wrong accent.
speaker = ['american','australian','bangla','british','indian','malayalam',
           'odiya','telugu','welsh']

# Take the first test sample and add a batch axis for prediction
predicted_audio = np.expand_dims(X_test[0], axis=0)
predicted_index = np.argmax(ann_model.predict(predicted_audio), axis=-1)[0]
predicted_speaker = speaker[predicted_index]

print('Predicted Audio: ' + predicted_speaker)
print('Test Audio: ', speaker[np.argmax(y_test[0])])

# Using the trained model in voice assistant

In [None]:
# MFCC extraction for one audio file (same definition as in the training section)
def extract_mfcc_features(audio_path):
    """Return the 20-coefficient MFCC array of the audio file at ``audio_path``.

    The file is loaded with ``librosa.load`` and the result is passed to
    ``librosa.feature.mfcc`` with ``n_mfcc=20``.
    """
    samples, rate = librosa.load(audio_path)
    coefficients = librosa.feature.mfcc(y=samples, sr=rate, n_mfcc=20)
    return coefficients

In [None]:
# Map a predicted accent to the gTTS top-level domain (tld) that voices it

def get_domain(predicted_speaker):
    """Return ``(tld, display_name)`` for a predicted accent label.

    American, Australian and British/Welsh accents get their own domain;
    every other label falls back to the Indian domain.
    """
    rules = (
        (('american',), ('us', 'American')),
        (('australian',), ('com.au', 'Australian')),
        (('welsh', 'british'), ('co.uk', 'British')),
    )
    for accents, domain_and_name in rules:
        if predicted_speaker in accents:
            return domain_and_name
    return 'co.in', 'Indian'

In [None]:
# Define a function to predict the accent using trained model and return the tld and predicted speaker

def predict_accent(file_path):
    """Transcribe a recording and classify the speaker's accent.

    Args:
        file_path: Path to the audio file to analyse.

    Returns:
        A tuple ``(speech_txt, domain, accent)``: the Whisper transcript, the
        gTTS top-level domain for the predicted accent, and the accent's
        display name.
    """
    speech_txt = transcribe(file_path)

    # Average each MFCC coefficient over time into a single 20-value vector,
    # the same pooling used to build the training matrix, then add a batch
    # axis. The previous reshape(-1, 20) reinterpreted the (20, frames) matrix
    # row-major, mixing coefficients and frames, and classified only the first
    # of those scrambled rows.
    mfcc = extract_mfcc_features(file_path)
    features = np.expand_dims(np.mean(mfcc.T, axis=0), axis=0)
    # NOTE(review): the training features were also standardized with a
    # StandardScaler; the fitted scaler is not available to this function, so
    # inputs here are unscaled. Persist and apply that scaler to match training.

    # Class names in alphabetical order, matching the sorted column order
    # that pd.get_dummies produced for the training labels
    speaker = ['american','australian','bangla','british','indian','malayalam',
               'odiya','telugu','welsh']

    # Load the best checkpoint written during training and predict the accent
    ann = load_model("accent_classification.hdf5")
    predicted_label = np.argmax(ann.predict(features), axis=-1)[0]
    predicted_speaker = speaker[predicted_label]

    domain, accent = get_domain(predicted_speaker)

    return (speech_txt, str(domain), accent)

In [None]:
# Testing the function with a sample audio file of known accent
# (an Indian-accent recording, per its dataset folder)
predict_accent('accentdb_extended/data/indian/speaker_02/indian_s02_709.wav')

# Define function for converting text to speech using gTTS (Google Translate text-to-speech)

In [None]:
# Language code handed to gTTS for every synthesized reply
language = 'en'

# Text-to-speech via gTTS, voiced with the accent's regional domain
def generate_audio(out_text, accent_tld):
    """Synthesize ``out_text`` to speech and return the saved file's path.

    Args:
        out_text: Text to speak.
        accent_tld: gTTS top-level domain selecting the regional voice.

    Returns:
        ``"Temp.mp3"``, the file the audio is written to (overwritten on each call).
    """
    output_path = "Temp.mp3"
    speech = gTTS(text=out_text, lang=language, tld=accent_tld, slow=False)
    speech.save(output_path)
    return output_path

# Define function for running the Voice Assistant 

In [None]:
# Per-session chat transcripts, keyed by the user-supplied session ID.
# Deliberately not named `history`: that name holds the Keras training
# History object read by the evaluation plots, and rebinding it to a dict
# broke those cells when re-run.
session_history = {}

# Define a function for running the Voice Assistant
def voice_assistant(session_id, audio):
    """Run one turn of the voice assistant.

    Args:
        session_id: Key identifying the conversation whose transcript is
            extended by this turn.
        audio: Path to the user's recorded question.

    Returns:
        A tuple ``(text, response, accent_name, audio_file)``: the
        transcribed question, the generated reply, the display name of the
        detected accent, and the path of the synthesized reply audio.
    """
    # predict_accent returns (transcript, gTTS tld, accent display name)
    text, accent_tld, accent_name = predict_accent(audio)

    # Fetch this session's transcript, creating it on first use
    chat_history = session_history.setdefault(session_id, [])

    # Append the user's input to the chat history
    chat_history.append(text)

    # The whole transcript list is stringified and sent as a single prompt
    response = generate_response(str(chat_history))

    # Append the response to the chat history
    chat_history.append(response)

    # Voice the reply using the domain that matches the detected accent
    audio_file = generate_audio(response, accent_tld)

    return (text, response, accent_name, audio_file)

# Gradio UI

In [None]:
# Output widgets, in the order voice_assistant returns its values:
# transcript, GPT reply, detected accent, synthesized audio
output1 = gr.Textbox(placeholder="Start recording to ask a question.", 
                     label="Speech to Text")
output2 = gr.Textbox(placeholder="As an AI Voice assistant, I can remember conversations and respond to any follow-up corrections, answer questions, provide information, and generate creative content like jokes, puns, and poetry.", 
                     label="GPT Reply")
output3 = gr.Textbox(label="Identified Accent:", 
                     placeholder="An Accent similar to your dialect will be displayed here.")
# Use the gr.Audio component, as for the microphone input below, instead of
# the deprecated gr.outputs.Audio wrapper
output4 = gr.Audio(label="Audio Output", type = "filepath")

# Inputs map positionally onto voice_assistant(session_id, audio)
user_interface = gr.Interface(
    fn=voice_assistant,
    inputs=[gr.Textbox(label="Session ID", placeholder="Enter a @user_name for yourself to get personalized experience. This will allow the model to respond to follow-up commands.", type= "text"), gr.Audio(source="microphone", type="filepath", label="To ask another question, delete the current question and resubmit the question.", placeholder="Ask a question by submitting the audio.")],
    outputs=[output1, output2, output3, output4],
    #live=True,
    title="Rishi's AI Voice Assistant",
    description="""
    Example questions you may ask:
    
    speak,"Who is lead actor in 'Titanic'?"
    speak,"Let's play a game of Hangman!"
    speak,"What is 13 times 27.6?"
    speak,"Can you help me make a comic?"
    """,
    allow_flagging="never")

In [None]:
# Start the Gradio app; share=True requests a public share link in addition to the local one
user_interface.launch(share=True)

In [None]:
statistics(x)=(pret, pvol, pret/pvol)

In [None]:
statistics(x)[0]= [(pret, pvol, pret/pvol)[1],(pret, pvol, pret/pvol)[2],(pret, pvol, pret/pvol)...,(pret, pvol, pret/pvol)]

In [None]:
run sco minimise

results= res[x]
res[x] = [(w,r), (w,r), (w,r),...,(w,r)]

In [None]:
statistics(weights, riskFree)