# 듣고 질문에 답하기 유형 데이터 만들기

- 질문 생성 만들기
- 질문에 대한 오디오 만들기


## 질문 생성 Chain

In [1]:
import json
from typing import List

from tqdm.notebook import tqdm
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema import HumanMessage, AIMessage, StrOutputParser
import pandas as pd


For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
model = ChatOpenAI(model="gpt-3.5-turbo")

### 질문 주제 샘플링하기

In [3]:
csv_parser = CommaSeparatedListOutputParser()

In [4]:
csv_format_instruction = csv_parser.get_format_instructions()

In [5]:
subjet_prompt_template = PromptTemplate.from_template(template="영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n{format_instruction}",
                                                      partial_variables={"format_instruction": csv_format_instruction})

In [6]:
subject_chain = subjet_prompt_template | model | csv_parser

In [7]:
subject_chain.invoke({})

APIConnectionError: Connection error.

In [14]:
#csv_parser를 통해 바로 생성.
csv_parser.invoke(model.invoke(f"영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n{csv_format_instruction}"))

['daily routine',
 'school life',
 'hobbies',
 'family',
 'shopping',
 'food',
 'travel',
 'sports',
 'music',
 'movies']

In [15]:
subject_list = subject_chain.invoke({})

In [16]:
subject_list

['family',
 'school',
 'daily routine',
 'hobbies',
 'sports',
 'technology',
 'shopping',
 'travel',
 'environment',
 'health',
 'food',
 'relationships',
 'work']

In [17]:
subject_list = subject_list[:4]

In [18]:
subject_list

['family', 'school', 'daily routine', 'hobbies']

### 질문 만들기

In [19]:
model = ChatOpenAI(model="gpt-4-1106-preview")

In [20]:
template = """\
- 이전 질문들 {prev_questions}
- 이전 질문들과는 다른 유형으로 만들어줘
- 영어 시험에 나올 법한 {input} 주제에 관한 쉬운 질문 하나 만들어줘.
- 상대방과 연관지어 만들어줘
- 한 문장만 만들어줘
- 여러 예시 만들지마
- 영어로"""

#human message의 역할을 하는 프롬프트 템플릿
question_prompt_template = PromptTemplate.from_template(template=template)

In [21]:
question_chain = question_prompt_template | model | StrOutputParser()

In [22]:
#반복문을 통해 각 주제에 대한 질문 1개씩 생성. 이전 질문들을 참고하여, 다양한 지문을 생성.
question_list = []
for subject in tqdm(subject_list):
    question_list.append(question_chain.invoke({"input": subject, "prev_questions": question_list}))
    # question_list.append(question_chain.invoke({"input": subject}))

  0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
question_list

['Describe one activity you enjoy doing with your family.',
 'What is your favorite subject in school and why do you like it?',
 "How does your best friend's daily routine differ from yours?",
 'What hobby do you enjoy the most and how often do you engage in it with your friends or family?']

## 질문에 대한 오디오 파일 만들기

In [24]:
from openai import OpenAI

In [25]:
client = OpenAI()

In [26]:
def gen_speech_file(text, output_file_path):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy", # alloy, echo, fable, onyx, nova, and shimmer
        input=text
    )
    response.stream_to_file(output_file_path)

In [27]:
!mkdir -p ./data/speaking__listen_and_answer

명령 구문이 올바르지 않습니다.


In [28]:
save_dir = "./data/speaking__listen_and_answer"

In [29]:
question_list

['Describe one activity you enjoy doing with your family.',
 'What is your favorite subject in school and why do you like it?',
 "How does your best friend's daily routine differ from yours?",
 'What hobby do you enjoy the most and how often do you engage in it with your friends or family?']

In [30]:
record_list = []

for i, q in tqdm(enumerate(question_list), total=len(question_list)):
    output_file_path = f"{save_dir}/question_{i}.wav"
    gen_speech_file(q, output_file_path)

    record = {"question": q, "audio_file_path": output_file_path}
    record_list.append(record)

  0%|          | 0/4 [00:00<?, ?it/s]

  response.stream_to_file(output_file_path)


In [31]:
df = pd.DataFrame(record_list)
df

Unnamed: 0,question,audio_file_path
0,Describe one activity you enjoy doing with you...,./data/speaking__listen_and_answer/question_0.wav
1,What is your favorite subject in school and wh...,./data/speaking__listen_and_answer/question_1.wav
2,How does your best friend's daily routine diff...,./data/speaking__listen_and_answer/question_2.wav
3,What hobby do you enjoy the most and how often...,./data/speaking__listen_and_answer/question_3.wav


In [32]:
df.to_csv(f"{save_dir}/question_and_audio.csv", index=False)

In [33]:
from IPython.display import Audio

In [35]:
Audio(f"{save_dir}/question_0.wav")