In [7]:
import pandas as pd

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# model_id = "openai/whisper-large-v3"
# model_id = "openai/whisper-small"
# model_id = "openai/whisper-medium"
model_id = "openai/whisper-tiny"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import os
basepath = 'data/english_course_wma/Wizard Inglês Modulo 3 - Conversation'

_, _, filenames = next( os.walk(basepath) )
filenames

['Wizard-Lesson 13-13.wma',
 'Wizard-Lesson 11-11.wma',
 'Wizard-Lesson 18-18.wma',
 'Wizard-Lesson 12-12.wma',
 'Wizard-Lesson 22-22.wma',
 'Wizard-Lesson 17-17.wma',
 'Wizard-Lesson 7-07.wma',
 'Wizard-Lesson 15-15.wma',
 'Wizard-Lesson 10-10.wma',
 'Wizard-Lesson 14-14.wma',
 'Wizard-Lesson 5-05.wma',
 'Wizard-Lesson 3-03.wma',
 'Wizard-Lesson 25-25.wma',
 'Wizard-Lesson 20-20.wma',
 'Wizard-Lesson 16-16.wma',
 'Wizard-Lesson 6-06.wma',
 'Wizard-Lesson 2-02.wma',
 'Wizard-Lesson 21-21.wma',
 'Wizard-Lesson 4-04.wma',
 'Wizard-Lesson 23-23.wma',
 'Wizard-Lesson 30-30.wma',
 'Wizard-Lesson 28-28.wma',
 'Wizard-Lesson 29-29.wma',
 'Wizard-Lesson 26-26.wma',
 'Wizard-Lesson 9-09.wma',
 'Wizard-Lesson 19-19.wma',
 'Wizard-Lesson 8-08.wma',
 'Wizard-Lesson 24-24.wma',
 'Wizard-Lesson 1-01.wma',
 'Wizard-Lesson 27-27.wma']

In [3]:
filenames = sorted( filenames )

In [4]:
data = []
for filename in filenames[1:2]:
    print( filename )
    complete_path = os.path.join(basepath, filename)

    result = pipe(complete_path, generate_kwargs={"language": "english"})

    for i, row in enumerate( result['chunks'] ):
        # print( row['text'] )
    
        row = { 
            "Filename": filename,
            "Sentence order": i,
            "Sentence": row["text"]
            }
        data.append( row )

# data

Wizard-Lesson 10-10.wma


In [5]:
# import pandas as pd

# df = pd.DataFrame(data)
# df

In [9]:
df = pd.read_csv("transcript.csv")
df

Unnamed: 0.1,Unnamed: 0,Filename,Sentence order,Sentence
0,0,Wizard-Lesson 1-01.wma,0,Music
1,1,Wizard-Lesson 1-01.wma,1,Wizard Book 3 Conversation Lesson 1
2,2,Wizard-Lesson 1-01.wma,2,Did the boy pay $1 for the paper at the newss...
3,3,Wizard-Lesson 1-01.wma,3,"Yes, he did. He paid $1 for the paper at the ..."
4,4,Wizard-Lesson 1-01.wma,4,Did this secretary put the document on the ta...
5,5,Wizard-Lesson 1-01.wma,5,"Yes, she did."
6,6,Wizard-Lesson 1-01.wma,6,She put the document on the table.
7,7,Wizard-Lesson 1-01.wma,7,"No, she didn't."
8,8,Wizard-Lesson 1-01.wma,8,She didn't put the information from the compu...
9,9,Wizard-Lesson 1-01.wma,9,"Yes, I did. I accessed the information from t..."


In [32]:
df = pd.concat( [df, pd.DataFrame(data)], ignore_index=True )

In [33]:
df.tail()

Unnamed: 0,Filename,Sentence order,Sentence
59,Wizard-Lesson 10-10.wma,23,What about a Porsche?
60,Wizard-Lesson 10-10.wma,24,Isn't a Ferrari faster?
61,Wizard-Lesson 10-10.wma,25,What is the largest state in the US?
62,Wizard-Lesson 10-10.wma,26,What about California?
63,Wizard-Lesson 10-10.wma,27,Isn't Texas larger


In [10]:
j=[]
for e in df[df["Filename"] == 'Wizard-Lesson 1-01.wma' ]['Sentence']:
    print(e)
    j.append(e)

j


 Music
 Wizard Book 3 Conversation Lesson 1
 Did the boy pay $1 for the paper at the newsstand?
 Yes, he did. He paid $1 for the paper at the newsstand.
 Did this secretary put the document on the table?
 Yes, she did.
 She put the document on the table.
 No, she didn't.
 She didn't put the information from the computer?
 Yes, I did. I accessed the information from the computer.
 No, I didn't. I didn't access the information from the computer.
 Did your mom do her Christmas shopping early this year?
 Yes, she did.
 She did her Christmas shopping early this year.
 No, she didn't. She didn't do her Christmas shopping early this year.
 Did they eat Brazilian food at the restaurant? Yes, they did.
 They ate Brazilian food at the restaurant.
 No, they didn't.
 They didn't eat Brazilian food at the restaurant. Is your sister doing her homework in the her bedroom?
 Yes, she is.
 She is doing her homework in her bedroom.
 No, she isn't. She isn't doing her homework in her bedroom.
 Are the chi

[' Music',
 ' Wizard Book 3 Conversation Lesson 1',
 ' Did the boy pay $1 for the paper at the newsstand?',
 ' Yes, he did. He paid $1 for the paper at the newsstand.',
 ' Did this secretary put the document on the table?',
 ' Yes, she did.',
 ' She put the document on the table.',
 " No, she didn't.",
 " She didn't put the information from the computer?",
 ' Yes, I did. I accessed the information from the computer.',
 " No, I didn't. I didn't access the information from the computer.",
 ' Did your mom do her Christmas shopping early this year?',
 ' Yes, she did.',
 ' She did her Christmas shopping early this year.',
 " No, she didn't. She didn't do her Christmas shopping early this year.",
 ' Did they eat Brazilian food at the restaurant? Yes, they did.',
 ' They ate Brazilian food at the restaurant.',
 " No, they didn't.",
 " They didn't eat Brazilian food at the restaurant. Is your sister doing her homework in the her bedroom?",
 ' Yes, she is.',
 ' She is doing her homework in her 

In [13]:
b = " ".join(j)
b

" Music  Wizard Book 3 Conversation Lesson 1  Did the boy pay $1 for the paper at the newsstand?  Yes, he did. He paid $1 for the paper at the newsstand.  Did this secretary put the document on the table?  Yes, she did.  She put the document on the table.  No, she didn't.  She didn't put the information from the computer?  Yes, I did. I accessed the information from the computer.  No, I didn't. I didn't access the information from the computer.  Did your mom do her Christmas shopping early this year?  Yes, she did.  She did her Christmas shopping early this year.  No, she didn't. She didn't do her Christmas shopping early this year.  Did they eat Brazilian food at the restaurant? Yes, they did.  They ate Brazilian food at the restaurant.  No, they didn't.  They didn't eat Brazilian food at the restaurant. Is your sister doing her homework in the her bedroom?  Yes, she is.  She is doing her homework in her bedroom.  No, she isn't. She isn't doing her homework in her bedroom.  Are the ch

In [27]:
df.to_csv( os.path.join( "data_transcript", filename.split(".")[0]+".csv" ) )