In [1]:
from keras.models import load_model

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint


In [3]:
from keras.models import model_from_json
from keras.preprocessing.text import tokenizer_from_json
json_file = open('model.json', 'r')
loaded_text_model_json = json_file.read()
json_file.close()
loaded_text_model = model_from_json(loaded_text_model_json)

In [4]:
loaded_text_model.load_weights("text_model_weights.h5")

In [5]:
json_file = open('model_audio.json', 'r')
loaded_audio_model_json = json_file.read()
json_file.close()
loaded_audio_model = model_from_json(loaded_audio_model_json)

In [6]:
loaded_audio_model.load_weights("audio_model_weights.h5")

In [7]:
json_file = open('tokenizer.json','r')
l = json_file.read()
json_file.close()
tokenizer = tokenizer_from_json(l)

In [8]:
Features = pd.read_csv(r'features.csv')
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

In [9]:
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [10]:
d = {'1':'anger', '2':'boredom', '3':'empty', '4':'enthusiasm', '5':'fun', '6':'happiness',
       '7':'hate','8': 'love','9': 'neutral', '10':'relief', '11':'sadness','12': 'surprise',
       '13':'worry' }

In [11]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

In [12]:
def extract_features(data, sample_rate):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data,sample_rate)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data,sample_rate)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch, sample_rate)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [13]:
def audio_preprocess(data):
    feature = get_features(data)
    return(feature)

In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import nltk

nltk.download('punkt')
nltk.download('wordnet')

oov_tok = "<oov_tok>"

def count_vectorizer(corpus):
    vectorizer = CountVectorizer(analyzer='word')
    corpus_words = vectorizer.fit_transform(list(corpus))
    return len(vectorizer.vocabulary_)


def get_tokenizer_obj(text_list, num_words):
    tokenizer = Tokenizer(lower=True, split=" ", num_words=num_words, oov_token=oov_tok)
    tokenizer.fit_on_texts(text_list)
    return tokenizer, len(tokenizer.word_index)


def tokenize_texts_to_sequences(tokenizer, text_list):
     return tokenizer.texts_to_sequences(text_list)
    
def padding_sequences(x_arr, max_len):
    x_arr = pad_sequences(x_arr, maxlen=max_len, value=0, padding='post')
    return x_arr 


def get_num_words(text):
    return count_vectorizer(text)


def get_max_statment_len(text):
    return len(text.split())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def text_preprocess(text):
    f_text = tokenize_texts_to_sequences(tokenizer, text)
    f_text = padding_sequences(f_text, 46)
    return(f_text)

In [16]:
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
r = sr.Recognizer()
def audio_to_text(path):
    with sr.AudioFile(path) as source:
        r.adjust_for_ambient_noise(source)
        audio = r.listen(source)
    try:
        print(r.recognize_google(audio))
        return(r.recognize_google(audio)) 
    except Exception as e:
        print("Error {} : ".format(e) )



In [17]:
def complete_preprocess(data):
    text = audio_to_text(data)
    final_text = text_preprocess(text)
    new_data = audio_preprocess(data)
    return([final_text, new_data])
    
def most_frequent(List):
    return max(set(List), key = List.count)

In [25]:
def final_predict(data):
    text_pred, audio_pred = complete_preprocess(data)
    audio_pred_noise, audio_pred_strech, audio_pred_basic = audio_pred[1], audio_pred[2], audio_pred[0] 
    audio_pred = np.expand_dims(audio_pred, axis=2)
    audio_output = loaded_audio_model.predict(audio_pred)
    text_output = loaded_text_model.predict(text_pred)
    y_pred = np.argmax(text_output, axis=1)
    f = most_frequent(list(y_pred))
    audio_out = encoder.inverse_transform(audio_output)
    print("The speaker is {0} and {1}".format(audio_out[2][0],d[str(f)]))
    print("The audio model with noisy input {0} and {1}".format(audio_out[0][0],d[str(f)]))
    print("The audio model with streched input {0} and {1}".format(audio_out[1][0],d[str(f)]))

In [35]:
data = 'test.wav'
final_predict(data)

describe Priyam beautiful to make this life of wonderful Adventure of democracy that issues that unite


  return librosa.effects.time_stretch(data, rate)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)


The speaker is calm and love
The audio model with noisy input calm and love
The audio model with streched input neutral and love
