In [19]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Pick the execution target once: first CUDA GPU in half precision when
# available, otherwise CPU in single precision.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

# Whisper checkpoint to load. Other checkpoints tried in this notebook:
#   "openai/whisper-small", "openai/whisper-medium", "openai/whisper-large-v3"
model_id = "openai/whisper-tiny"

# Load the speech-to-text model weights and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Options for the speech-recognition pipeline. `return_timestamps=True` is
# relied on by the transcription cell, which reads `result['chunks']`.
asr_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
pipe = pipeline("automatic-speech-recognition", **asr_options)


config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
import os

# Directory holding the course audio files to transcribe.
basepath = 'data/english_course_wma/Wizard Inglês Modulo 3 - Conversation'

# os.walk() ignores directory-listing errors by default, so a missing or
# unreadable directory yields nothing and a bare next() would raise an opaque
# StopIteration. Ask for a default and fail with a clear message instead.
first_walk_entry = next(os.walk(basepath), None)
if first_walk_entry is None:
    raise FileNotFoundError(f"Cannot list audio directory: {basepath!r}")

# Each walk entry is (dirpath, dirnames, filenames); keep only the files that
# sit directly inside basepath. Indexing (rather than unpacking into `_`)
# avoids rebinding `_`, which IPython uses for the last cell output.
filenames = first_walk_entry[2]
filenames

['Wizard-Lesson 13-13.wma',
 'Wizard-Lesson 11-11.wma',
 'Wizard-Lesson 18-18.wma',
 'Wizard-Lesson 12-12.wma',
 'Wizard-Lesson 22-22.wma',
 'Wizard-Lesson 17-17.wma',
 'Wizard-Lesson 7-07.wma',
 'Wizard-Lesson 15-15.wma',
 'Wizard-Lesson 10-10.wma',
 'Wizard-Lesson 14-14.wma',
 'Wizard-Lesson 5-05.wma',
 'Wizard-Lesson 3-03.wma',
 'Wizard-Lesson 25-25.wma',
 'Wizard-Lesson 20-20.wma',
 'Wizard-Lesson 16-16.wma',
 'Wizard-Lesson 6-06.wma',
 'Wizard-Lesson 2-02.wma',
 'Wizard-Lesson 21-21.wma',
 'Wizard-Lesson 4-04.wma',
 'Wizard-Lesson 23-23.wma',
 'Wizard-Lesson 30-30.wma',
 'Wizard-Lesson 28-28.wma',
 'Wizard-Lesson 29-29.wma',
 'Wizard-Lesson 26-26.wma',
 'Wizard-Lesson 9-09.wma',
 'Wizard-Lesson 19-19.wma',
 'Wizard-Lesson 8-08.wma',
 'Wizard-Lesson 24-24.wma',
 'Wizard-Lesson 1-01.wma',
 'Wizard-Lesson 27-27.wma']

In [21]:
# Order the file names as plain strings (on a fresh list). This is a
# lexicographic sort, so "Wizard-Lesson 10-10.wma" comes before
# "Wizard-Lesson 2-02.wma"; later cells slice `filenames` by position.
filenames = list(filenames)
filenames.sort()

In [28]:
# Transcribe the selected audio files and collect one record per returned
# chunk. `data` and `filename` stay in the notebook namespace because later
# cells read them.
data = []
for filename in filenames[1:2]:
    print(filename)
    audio_path = os.path.join(basepath, filename)

    # Run the speech-recognition pipeline on this file, forcing English.
    transcription = pipe(audio_path, generate_kwargs={"language": "english"})

    # One record per chunk, numbered in the order the pipeline returned them.
    data.extend(
        {
            "Filename": filename,
            "Sentence order": order,
            "Sentence": chunk["text"],
        }
        for order, chunk in enumerate(transcription["chunks"])
    )

Wizard-Lesson 10-10.wma


In [25]:
import pandas as pd

# Tabulate the collected records: one row per transcribed chunk, with the
# columns "Filename", "Sentence order" and "Sentence".
df = pd.DataFrame(data=data)
df

Unnamed: 0,Filename,Sentence order,Sentence
0,Wizard-Lesson 1-01.wma,0,Music
1,Wizard-Lesson 1-01.wma,1,Wizard Book 3 Conversation Lesson 1
2,Wizard-Lesson 1-01.wma,2,Did the boy pay $1 for the paper at the newss...
3,Wizard-Lesson 1-01.wma,3,"Yes, he did. He paid $1 for the paper at the ..."
4,Wizard-Lesson 1-01.wma,4,Did this secretary put the document on the ta...
5,Wizard-Lesson 1-01.wma,5,"Yes, she did."
6,Wizard-Lesson 1-01.wma,6,She put the document on the table.
7,Wizard-Lesson 1-01.wma,7,"No, she didn't."
8,Wizard-Lesson 1-01.wma,8,She didn't put the information from the compu...
9,Wizard-Lesson 1-01.wma,9,"Yes, I did. I accessed the information from t..."


In [32]:
# Append the most recent transcription records to the accumulated table.
# Rows already stored for the same file(s) are dropped first, so re-running
# this cell (or re-transcribing a file) replaces that file's rows instead of
# duplicating them.
new_rows = pd.DataFrame(data)
kept_rows = df
if "Filename" in df.columns and "Filename" in new_rows.columns:
    kept_rows = df[~df["Filename"].isin(new_rows["Filename"])]
df = pd.concat([kept_rows, new_rows], ignore_index=True)

In [33]:
# Show the last five rows to check that the newly appended records are there.
df.tail(n=5)

Unnamed: 0,Filename,Sentence order,Sentence
59,Wizard-Lesson 10-10.wma,23,What about a Porsche?
60,Wizard-Lesson 10-10.wma,24,Isn't a Ferrari faster?
61,Wizard-Lesson 10-10.wma,25,What is the largest state in the US?
62,Wizard-Lesson 10-10.wma,26,What about California?
63,Wizard-Lesson 10-10.wma,27,Isn't Texas larger


In [39]:
# Print every transcribed sentence of one lesson, one per line, in table order.
lesson_mask = df["Filename"] == 'Wizard-Lesson 10-10.wma'
for sentence in df.loc[lesson_mask, "Sentence"]:
    print(sentence)

 Conversation, Lesson 10, is he younger or older than her?
 He is younger than her.
 I thought he was older than her.
 How old do you think she is?
 I think she's 40.
 I don't think she's older than 35.
 What were they doing there? I don't know what they were doing there.
 I think they were checking their emails. Who is the youngest in your class?
 I think Karen is the youngest.
 Isn't James younger than she is?
 Who is the oldest student in your class?
 I think Mark is the oldest.
 Isn't Jim older than Mark?
 What is the most difficult class in school?
 I think it's math.
 Isn't it biology?
 What is the easiest class in school?
 I think it's English.
 Isn't it Portuguese?
 When is the best time for us to go there?
 What about 10 o'clock?
 Isn't it 11 o'clock better?
 What is the fastest car that you know?
 What about a Porsche?
 Isn't a Ferrari faster?
 What is the largest state in the US?
 What about California?
 Isn't Texas larger


In [27]:
# Write the transcript table to data_transcript/<audio name>.csv.
# `filename` is the loop variable left over from the transcription cell, i.e.
# the last audio file processed; its audio extension is swapped for ".csv" so
# the output is not saved with a misleading ".wma" suffix.
os.makedirs("data_transcript", exist_ok=True)  # to_csv does not create folders
csv_name = os.path.splitext(filename)[0] + ".csv"
df.to_csv(os.path.join("data_transcript", csv_name))