In [1]:
import pandas as pd

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# model_id = "openai/whisper-large-v3"
# model_id = "openai/whisper-small"
model_id = "openai/whisper-medium"
# model_id = "openai/whisper-tiny"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import os
basepath = 'data/english_course_wma/Wizard Inglês Modulo 3 - Conversation'

_, _, filenames = next( os.walk(basepath) )
filenames

['Wizard-Lesson 13-13.wma',
 'Wizard-Lesson 11-11.wma',
 'Wizard-Lesson 18-18.wma',
 'Wizard-Lesson 12-12.wma',
 'Wizard-Lesson 22-22.wma',
 'Wizard-Lesson 17-17.wma',
 'Wizard-Lesson 7-07.wma',
 'Wizard-Lesson 15-15.wma',
 'Wizard-Lesson 10-10.wma',
 'Wizard-Lesson 14-14.wma',
 'Wizard-Lesson 5-05.wma',
 'Wizard-Lesson 3-03.wma',
 'Wizard-Lesson 25-25.wma',
 'Wizard-Lesson 20-20.wma',
 'Wizard-Lesson 16-16.wma',
 'Wizard-Lesson 6-06.wma',
 'Wizard-Lesson 2-02.wma',
 'Wizard-Lesson 21-21.wma',
 'Wizard-Lesson 4-04.wma',
 'Wizard-Lesson 23-23.wma',
 'Wizard-Lesson 30-30.wma',
 'Wizard-Lesson 28-28.wma',
 'Wizard-Lesson 29-29.wma',
 'Wizard-Lesson 26-26.wma',
 'Wizard-Lesson 9-09.wma',
 'Wizard-Lesson 19-19.wma',
 'Wizard-Lesson 8-08.wma',
 'Wizard-Lesson 24-24.wma',
 'Wizard-Lesson 1-01.wma',
 'Wizard-Lesson 27-27.wma']

In [4]:
filenames = sorted( filenames )

In [9]:
for filename in filenames[5:]:
    data = []
    print( filename )
    complete_path = os.path.join(basepath, filename)

    result = pipe(complete_path, generate_kwargs={"language": "english"})

    for i, row in enumerate( result['chunks'] ):
        # print( row['text'] )

        data.append( row["text"] )

    df = pd.read_csv("transcript.csv")

    joined = " ".join(data)
    row = { 
        "Filename": filename,
        "Sentences": joined
        }

    df = pd.concat( [df, pd.DataFrame([row])], ignore_index=True )

    # print(df)
    df.to_csv( "transcript.csv", index=False )

Wizard-Lesson 14-14.wma


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Wizard-Lesson 15-15.wma
Wizard-Lesson 16-16.wma
Wizard-Lesson 17-17.wma
Wizard-Lesson 18-18.wma
Wizard-Lesson 19-19.wma
Wizard-Lesson 2-02.wma
Wizard-Lesson 20-20.wma


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Wizard-Lesson 21-21.wma
Wizard-Lesson 22-22.wma
Wizard-Lesson 23-23.wma


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Wizard-Lesson 24-24.wma
Wizard-Lesson 25-25.wma


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Wizard-Lesson 26-26.wma
Wizard-Lesson 27-27.wma
Wizard-Lesson 28-28.wma
Wizard-Lesson 29-29.wma
Wizard-Lesson 3-03.wma
Wizard-Lesson 30-30.wma
Wizard-Lesson 4-04.wma
Wizard-Lesson 5-05.wma
Wizard-Lesson 6-06.wma


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Wizard-Lesson 7-07.wma
Wizard-Lesson 8-08.wma
Wizard-Lesson 9-09.wma


In [None]:
# df.to_csv( "transcript.csv" )

In [8]:
df

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Filename,Sentences
0,0.0,0.0,0.0,0.0,0.0,Wizard-Lesson 1-01.wma,Music Wizard Book 3 Conversation Lesson 1 D...
1,1.0,1.0,1.0,1.0,1.0,Wizard-Lesson 10-10.wma,"Conversation, Lesson 10, is he younger or old..."
2,2.0,2.0,2.0,2.0,,Wizard-Lesson 11-11.wma,Lesson 11 Conversation Lesson 11 Lesson 11 ...
3,3.0,3.0,3.0,,,Wizard-Lesson 12-12.wma,Conversation Lesson 12 She put her sunglass...
4,,,,,,Wizard-Lesson 12-12.wma,Conversation Lesson 12 She put her sunglass...
5,,,,,,Wizard-Lesson 13-13.wma,Conversation Lesson 13 He invited Sandy to ...
6,,,,,,Wizard-Lesson 14-14.wma,Conversation Lesson 14 She left her laptop ...


In [None]:
print( df.iloc[1,2] )

In [None]:
print( df.iloc[2,2] )