In [None]:
import math
import seaborn as sns
import os
import wave

sns.set(style="whitegrid",color_codes=True)

from wordcloud import WordCloud, STOPWORDS

import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline
import matplotlib.pyplot as plt 
plt.rcParams["figure.figsize"] = [16, 12]
from subprocess import check_output

import chardet

np.random.seed(0)

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

import librosa
import IPython.display as ipd
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql import SparkSession
from pyspark.sql.types import *

import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [None]:
# Explicit schema for the transcript file: two nullable string columns.
schema = (
    StructType()
    .add("filename", StringType(), True)
    .add("text", StringType(), True)
)

In [None]:
# Create (or reuse) the Spark session for this notebook and keep its context handy.
spark = (
    SparkSession.builder
    .appName("speech to text")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the first transcript file with the explicit schema and preview the rows.
# NOTE(review): the reader uses the default "," delimiter, while the pandas cells
# split every line on whitespace -- confirm that "text" is actually populated here.
df = spark.read.schema(schema).csv("Trans1.txt", header=False)
df.show()

In [None]:
# Seeded random 67% / 33% split of the Spark DataFrame.
df2 = df.randomSplit([0.67, 0.33], seed=5)

# Row counts: full frame first, then each part of the split.
for frame in (df, df2[0], df2[1]):
    print(frame.count())

In [None]:
# Re-read the same transcript file with pandas. This rebinds `df`, so the Spark
# DataFrame loaded above is no longer reachable under that name.
# header=None: the file has no header row, so columns get integer labels starting at 0.
df = pd.read_csv('Trans1.txt',header=None)

In [None]:
# Column 0 holds each raw line; its first whitespace-separated token becomes the file name.
df['Filename'] = df[0].str.split(expand=True)[0]

# Every token after the first, re-joined with single spaces, becomes the transcript.
transcriptions = [' '.join(line.split()[1:]) for line in df[0]]
df['Transcription'] = transcriptions

# The raw column is dropped once both pieces have been extracted.
df.drop(columns=0, inplace=True)
df

In [None]:
# Show the dtype pandas assigned to each column of the parsed frame.
df.dtypes

In [None]:
# Join every transcript into one space-separated string, build a word cloud from
# it with the default WordCloud settings, and draw the result as an image.
string = ' '.join(df['Transcription'])
wordcloud2 = WordCloud().generate(string)
plt.imshow(wordcloud2)

In [None]:
# Same two-column, nullable-string schema, re-declared for the second transcript file.
schema = (
    StructType()
    .add("filename", StringType(), True)
    .add("text", StringType(), True)
)

In [None]:
# getOrCreate() hands back the already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("speech to text")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the second transcript file with the explicit schema and preview the rows.
# NOTE(review): the reader uses the default "," delimiter, while the pandas cells
# split every line on whitespace -- confirm that "text" is actually populated here.
df = spark.read.schema(schema).csv("Trans2.txt", header=False)
df.show()

In [None]:
# Re-read the second transcript file with pandas; this rebinds `df` from the
# Spark DataFrame to a pandas one.
# header=None: the file has no header row, so columns get integer labels starting at 0.
df = pd.read_csv('Trans2.txt',header=None)

In [None]:
# Column 0 holds each raw line; its first whitespace-separated token becomes the file name.
df['Filename'] = df[0].str.split(expand=True)[0]

# Every token after the first, re-joined with single spaces, becomes the transcript.
transcriptions = [' '.join(line.split()[1:]) for line in df[0]]
df['Transcription'] = transcriptions

# The raw column is dropped once both pieces have been extracted.
df.drop(columns=0, inplace=True)
df

In [None]:
# Show the dtype pandas assigned to each column of the parsed frame.
df.dtypes

In [None]:
# Join every transcript of the second file into one space-separated string, build
# a word cloud from it with the default WordCloud settings, and draw it as an image.
string = ' '.join(df['Transcription'])
wordcloud2 = WordCloud().generate(string)
plt.imshow(wordcloud2)

# TRAINING MODEL

In [None]:
# Root folder of the audio corpus; each immediate sub-folder name is used as a class label.
train_audio_path = 'LibriSpeech/dev-clean'

# Keep only directories: a stray file would crash the inner os.listdir() and would
# also inflate len(labels), which sizes the one-hot targets and the softmax layer.
# Sorting makes the load order independent of the filesystem's listing order.
labels = sorted(
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, entry))
)

all_wave = []   # one resampled waveform per kept clip
all_label = []  # label (folder name) of the clip at the same position in all_wave
for label in labels:
    print(label)
    label_dir = os.path.join(train_audio_path, label)
    # NOTE(review): only .flac files directly inside the label folder are read;
    # nested sub-folders are not searched -- confirm this matches the corpus layout.
    waves = [f for f in os.listdir(label_dir) if f.endswith('.flac')]
    for wav in waves:
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr = 16000)
        # The rates must be passed by keyword: librosa >= 0.10 rejects the positional form.
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
        # Only clips that are exactly 8000 samples long after resampling are kept,
        # matching the fixed (8000, 1) model input; everything else is dropped.
        # NOTE(review): confirm this length filter is intended for this corpus.
        if len(samples) == 8000:
            all_wave.append(samples)
            all_label.append(label)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the collected labels, then map each label to its integer code.
le = LabelEncoder()
le.fit(all_label)
y = le.transform(all_label)

# Class names in the order of their integer codes.
classes = [name for name in le.classes_]

In [None]:
from keras.utils import np_utils

# Re-derive the integer codes from the fitted encoder instead of transforming `y`
# in place: `y = to_categorical(y)` one-hot-encoded its own output whenever this
# cell was executed a second time.
y = np_utils.to_categorical(le.transform(all_label), num_classes=len(labels))

# Stack the clips and add a trailing channel axis: (n_clips, 8000, 1).
all_wave = np.array(all_wave).reshape(-1,8000,1)

In [None]:
from sklearn.model_selection import train_test_split
# The stratified 80/20 train/validation split below is commented out, so this cell
# does not define x_tr, x_val, y_tr or y_val.
#x_tr, x_val, y_tr, y_val = train_test_split(np.array(all_wave),np.array(y),stratify=y,test_size = 0.2,random_state=777,shuffle=True)

In [None]:

from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

# Reset Keras' global state before (re)building the network.
K.clear_session()

# Each example is a single-channel sequence of 8000 values.
inputs = Input(shape=(8000,1))

# Four convolutional stages: (filters, kernel size) grows in width and shrinks in
# kernel; every stage is followed by max-pooling with pool size 3 and 30% dropout.
conv = inputs
for n_filters, kernel_width in ((8, 13), (16, 11), (32, 9), (64, 7)):
    conv = Conv1D(n_filters, kernel_width, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)

# Flatten the feature maps for the fully connected head.
conv = Flatten()(conv)

# Two dense layers (256 then 128 units), each followed by 30% dropout.
for n_units in (256, 128):
    conv = Dense(n_units, activation='relu')(conv)
    conv = Dropout(0.3)(conv)

# One softmax output per label.
outputs = Dense(len(labels), activation='softmax')(conv)

model = Model(inputs, outputs)
model.summary()

In [None]:
# The metric is registered as 'acc' so that its validation value is logged as
# 'val_acc', the key the checkpoint monitors. Since Keras 2.3 metrics are logged
# under the exact name given here; with 'accuracy' the log key is 'val_accuracy',
# 'val_acc' never appears, and the checkpoint would never save a model.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

# Stop when validation loss has not improved by at least 1e-4 for 10 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)

# Keep only the weights with the highest validation accuracy seen so far.
mc = ModelCheckpoint('best_model.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [None]:
# Training is currently disabled. Re-enabling it requires x_tr, y_tr, x_val and
# y_val, which only the (also commented-out) train_test_split call would define.
#history=model.fit(x_tr, y_tr ,epochs=100, callbacks=[es,mc], batch_size=32, validation_data=(x_val,y_val))

In [None]:
# Wit.ai access token, read from the WIT_AI_KEY environment variable so that the
# secret is not stored in the notebook. The token that used to be hard-coded here
# has been exposed and should be revoked.
k = os.environ.get("WIT_AI_KEY", "")


def transcription(filename):
    """Transcribe one audio file with the Wit.ai recognizer.

    Parameters
    ----------
    filename : str
        Path of the audio file, opened with ``sr.AudioFile``.

    Returns
    -------
    str
        The text returned by ``Recognizer.recognize_wit``.

    Raises
    ------
    RuntimeError
        If the module-level key ``k`` is not a non-empty string, i.e. WIT_AI_KEY
        was not set or ``k`` was overwritten by another cell.
    """
    if not isinstance(k, str) or not k:
        raise RuntimeError(
            "Wit.ai key is not configured: set the WIT_AI_KEY environment "
            "variable and re-run the cell that defines `k`."
        )
    r = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        # Read the whole file into memory; the recognizer works on this buffer.
        audio_data = r.record(source)
    return r.recognize_wit(audio_data, key=k)

In [None]:
def writeTranscript(filename,content):
    """Append ``content`` in upper case, followed by a blank line, to ``filename``."""
    with open(filename, "a") as out:
        out.write("{}\n\n".format(content.upper()))

In [None]:
# writeTranscript() appends, so start from an empty file: otherwise every re-run
# of this cell adds nine more rows to PTrans.txt and the predictions no longer
# line up row-for-row with the reference transcripts.
open("PTrans.txt", "w").close()

# Transcribe sample utterances 0000-0008, store each result and echo it.
for i in range(9):
    # Zero-pad the index to four digits. This replaces the old `if i<10 / else`
    # pair, whose else branch could never run for range(9).
    filename = f"flac samples/84-121123-{i:04d}.flac"
    t=transcription(filename)
    writeTranscript("PTrans.txt",t)
    print("Full text:", t)

## For Large Audio Files

In [None]:
# Wit.ai access token, read from the WIT_AI_KEY environment variable so that the
# secret is not stored in the notebook. The token that used to be hard-coded here
# has been exposed and should be revoked.
k = os.environ.get("WIT_AI_KEY", "")


def get_large_audio_transcription(path):
    """
    Split a large audio file into chunks on silence and apply speech
    recognition to each chunk.

    Each chunk is exported as ``audio-chunks/chunk<i>.wav`` (the folder is created
    when missing, existing chunk files are overwritten) and sent to Wit.ai.
    Chunks the recognizer cannot understand are reported and skipped.

    Parameters
    ----------
    path : str
        Audio file to transcribe, opened with ``AudioSegment.from_file``.

    Returns
    -------
    str
        The recognised chunks, each capitalised and terminated with ". ",
        concatenated in order.

    Raises
    ------
    RuntimeError
        If the module-level key ``k`` is not a non-empty string, i.e. WIT_AI_KEY
        was not set or ``k`` was overwritten by another cell.
    """
    if not isinstance(k, str) or not k:
        raise RuntimeError(
            "Wit.ai key is not configured: set the WIT_AI_KEY environment "
            "variable and re-run the cell that defines `k`."
        )
    r = sr.Recognizer()
    # open the audio file using pydub
    sound = AudioSegment.from_file(path)
    chunks = split_on_silence(
        sound,
        # split wherever silence lasts 300 milliseconds or more
        # (experiment with this value for your target audio file)
        min_silence_len=300,
        # anything 12 dB below the clip's own dBFS level counts as silence
        silence_thresh=sound.dBFS - 12,
        # keep 500 milliseconds of silence around each chunk
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    # create the directory that stores the audio chunks (no-op if it exists)
    os.makedirs(folder_name, exist_ok=True)
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        # export the chunk as WAV so speech_recognition can read it back
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
        try:
            text = r.recognize_wit(audio_listened, key=k)
        except sr.UnknownValueError as e:
            # unintelligible chunk: report it and carry on with the next one
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            whole_text += text
    # return the text for all chunks detected
    return whole_text

In [None]:
# Run the chunked transcription over sample utterances 0000-0003.
# Alternative input kept for reference: "audio-chunks/sample-00000"+str(i)+".mp3"
for i in range(4):
    path = "flac samples/84-121123-000{}.flac".format(i)
    full_text = get_large_audio_transcription(path)
    print("Full text:", full_text, end="\n\n")

In [None]:
# Load the transcript file that accuracy_test() later uses as its first argument.
# header=None: the file has no header row, so columns get integer labels starting at 0.
df1 = pd.read_csv('Trans.txt',header=None)

In [None]:
# Column 0 holds each raw line; its first whitespace-separated token becomes the file name.
df1['Filename'] = df1[0].str.split(expand=True)[0]

# Every token after the first, re-joined with single spaces, becomes the transcript.
# A comprehension is used on purpose: the former module-level loop bound its token
# list to `k`, overwriting the Wit.ai key that the transcription functions read
# from the global `k`, so those functions broke when called after this cell.
transcriptions = [' '.join(line.split()[1:]) for line in df1[0]]
df1['Transcription'] = transcriptions

# The raw column is dropped once both pieces have been extracted.
df1.drop(columns=0, inplace=True)
df1

In [None]:
# Word cloud of the transcripts in df1: join them into one space-separated string,
# generate the cloud with default settings, and draw it as an image.
s1 = ' '.join(df1['Transcription'])
WC1 = WordCloud().generate(s1)
plt.imshow(WC1)

In [None]:
# Load the transcripts that writeTranscript() stored in PTrans.txt (no header row)
# and expose column 0 under the name 'Transcription'. This rebinds `df2`.
df2 = pd.read_csv('PTrans.txt', header = None)
df2 = df2.assign(Transcription=df2[0]).drop(columns=0)
df2

In [None]:
# Word cloud of the transcripts in df2: join them into one space-separated string,
# generate the cloud with default settings, and draw it as an image.
s2 = ' '.join(df2['Transcription'])
WC2 = WordCloud().generate(s2)
plt.imshow(WC2)

In [None]:
def word_count(s):
    """Return a dict mapping each item of ``s`` to how many times it occurs.

    Keys appear in order of first occurrence.
    """
    counts = {}
    for item in s:
        counts[item] = counts.get(item, 0) + 1
    return counts

In [None]:
def accuracy_test(df1, df2):
    """Print and return the bag-of-words accuracy of ``df2`` against ``df1``.

    Rows are paired by position. For each pair the 'Transcription' strings are
    split on whitespace and compared as word multisets:

    * a word occurrence present in both rows counts as a match;
    * an occurrence present in only one row (a word missing from the prediction
      or an extra word in it) counts as a mismatch.

    Word order is not taken into account.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference transcripts, with a 'Transcription' column of strings.
    df2 : pandas.DataFrame
        Predicted transcripts, with a 'Transcription' column of strings.

    Returns
    -------
    float
        ``matches / (matches + mismatches) * 100``, or 0.0 when there is
        nothing to compare. The same value is printed.

    Notes
    -----
    When the frames differ in length, only the rows both frames have are
    compared; the surplus rows are ignored.
    """
    from collections import Counter  # local import keeps the cell self-contained

    matches = 0
    mismatches = 0
    # zip() pairs rows positionally and stops at the shorter frame.
    for reference, predicted in zip(df1['Transcription'], df2['Transcription']):
        ref_counts = Counter(reference.split())
        hyp_counts = Counter(predicted.split())
        # Multiset intersection: occurrences both rows agree on.
        matches += sum((ref_counts & hyp_counts).values())
        # Multiset differences: words the prediction dropped, then words it added.
        mismatches += sum((ref_counts - hyp_counts).values())
        mismatches += sum((hyp_counts - ref_counts).values())

    total = matches + mismatches
    # Guard against division by zero for empty frames or empty transcripts.
    accuracy = (matches / total) * 100 if total else 0.0
    print(accuracy)
    return accuracy

In [None]:
# Compare the transcripts in df2 against those in df1, paired by row position.
accuracy_test(df1,df2)