### 导入依赖

In [25]:
from encoder.encoder import Encoder
from transcriptor.whisperx import WhisperX
from langchain.vectorstores.pgvector import PGVector
from langchain.docstore.document import Document
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import utils
import os
from dotenv import load_dotenv

# Read the connection settings stored in the env file into the process environment.
ENV_FILE = "env/connection.env"
load_dotenv(ENV_FILE)

# Selects the media file under data/ and names the PGVector collection.
COLLECTION_NAME = "pt"  # or 'en'

### 将视频转换为音频文件

In [26]:
# Hand the selected video to the project helper for audio conversion.
# NOTE(review): presumably this writes data/<COLLECTION_NAME>.wav, which the
# transcription cell reads -- confirm in utils.convert_to_wav.
source_video = f"data/{COLLECTION_NAME}.mp4"
utils.convert_to_wav(source_video)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

                                                                        

MoviePy - Done.




### 使用 WhisperX 转录音频，并创建将存储在 Postgres 中的文档

In [27]:
def _segment_to_document(segment):
    """Wrap one transcription segment in a Document.

    The page content embeds the segment's "start" and "end" values ahead of
    its "text", so a search hit can be traced back to a position in the audio.
    """
    return Document(
        page_content=f'start {segment["start"]} - end {segment["end"]}: {segment["text"]}'
    )


# WhisperX proved too slow on CPU, so the large Whisper model is used instead.
whisperx = WhisperX(model_name="whisper")
audio_path = f"data/{COLLECTION_NAME}.wav"
transcription = whisperx.transcribe(audio_path)

# One Document per transcribed segment; these are what gets stored in Postgres.
docs = [_segment_to_document(segment) for segment in transcription["segments"]]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to align segment (" that it would have been a little more difficult to do so. But I was able to"): backtrack failed, resorting to original...
Failed to align segment (" didn't know what to do for a few months. I was just thinking. And then I was back in"): backtrack failed, resorting to original...
Failed to align segment (" my old ways. I was a little bit of a coward. I was thinking of my old ways. I was thinking,"): backt

### 建立连接设置

In [28]:
# Database settings that must be present in the environment (they are loaded
# from env/connection.env by load_dotenv at the top of the notebook).
# NOTE(review): Windows predefines USERNAME as the logged-in OS user, and
# python-dotenv does not override existing variables by default, so the value
# from the env file may be ignored on that platform -- confirm if it matters.
_REQUIRED_DB_VARS = ("DRIVER", "HOST", "PORT", "DATABASE", "USERNAME", "PASSWORD")
_missing_db_vars = [name for name in _REQUIRED_DB_VARS if os.getenv(name) is None]
if _missing_db_vars:
    # Fail here with a clear message: os.getenv would otherwise hand None to the
    # connection-string helper and the problem would only surface later as an
    # obscure driver or authentication error.
    raise RuntimeError(
        "Missing database settings in environment: " + ", ".join(_missing_db_vars)
    )

CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.environ["DRIVER"],
    host=os.environ["HOST"],
    port=int(os.environ["PORT"]),  # the helper declares port as an int
    database=os.environ["DATABASE"],
    user=os.environ["USERNAME"],
    password=os.environ["PASSWORD"],
)

### 将 embedding 后内容插入 Postgres

In [29]:
# Embed the transcript documents and store them in the Postgres collection.
embedding_model = Encoder().encoder
db = PGVector.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    # Removes the records already in the collection first, which is handy
    # when re-running the notebook for testing.
    pre_delete_collection=True,
)

### 查询数据库

#### Huub use case with Portuguese audio

In [30]:
# Ask the same question in Portuguese and in English against the collection.
query_pt = "marcas e investimentos"
query_en = "brands and investments"
similar_docs_pt = db.similarity_search(query_pt, k=4)
similar_docs_en = db.similarity_search(query_en, k=4)

# Print each result list under its header, one stored segment per line.
print("Query PT: 'marcas e investimentos' | Top 4  results:")
print(*[doc.page_content for doc in similar_docs_pt], sep="\n")
print("Query EN: 'brands and investments' | Top 4 results:")
print(*[doc.page_content for doc in similar_docs_en], sep="\n")

Query PT: 'marcas e investimentos' | Top 4  results:
start 361.82 - end 362.865:  to run the company with me.
start 366.16 - end 367.907:  visions of the future began to diverge.
start 462.8 - end 463.583:  current renaissance.
start 354.96 - end 355.823:  company you started?
Query EN: 'brands and investments' | Top 4 results:
start 354.96 - end 355.823:  company you started?
start 385.825 - end 389.68: I felt that I had let the previous generation of entrepreneurs down, that I had
start 338.18 - end 342.76:  years Apple had grown from just the two of us in a garage into a $2 billion company with
start 436.067 - end 439.74: During the next five years, I started a company named Next, another company


#### 使用英语音频的深度学习使用案例

In [31]:
# Retrieve eight candidates for the same question in Portuguese and English.
query_pt = "modelos de aprendizagem profunda"
query_en = "deep learning models"
similar_docs_pt = db.similarity_search(query_pt, k=8)
similar_docs_en = db.similarity_search(query_en, k=8)

# Portuguese: show the last hit returned; English: show the first hit.
print("Query PT: 'modelos de aprendizagem profunda' | Top 8 result:")
last_hit_pt = similar_docs_pt[-1]
print(last_hit_pt.page_content)
print("Query EN: 'deep learning models' | Top 1 result:")
first_hit_en = similar_docs_en[0]
print(first_hit_en.page_content)

Query PT: 'modelos de aprendizagem profunda' | Top 8 result:
start 384.64 - end 384.861:  months.
Query EN: 'deep learning models' | Top 1 result:
start 195.14 - end 196.159:  Let me give you one example.


In [33]:
# Run both language variants of the query before printing anything.
similar_docs_pt = db.similarity_search("distribuição normal", k=4)
similar_docs_en = db.similarity_search("normal distribution", k=4)

# Each header is followed by its hits, one stored segment per line.
for header, hits in (
    ("Query PT: 'distribuição normal' | Top 4 results:", similar_docs_pt),
    ("Query EN: 'normal distribution' | Top 4 results:", similar_docs_en),
):
    print(header)
    print("\n".join(hit.page_content for hit in hits))

Query PT: 'distribuição normal' | Top 4 results:
start 175.54 - end 179.659:  I returned Coke bottles for the five cent deposits to buy food with, and
start 462.8 - end 463.583:  current renaissance.
start 6.248 - end 6.669:  Thank you.
start 279.04 - end 282.273:  personal computers might not have the wonderful typography that they do.
Query EN: 'normal distribution' | Top 4 results:
start 342.8 - end 344.125:  over 4,000 employees.
start 499.627 - end 502.839: And the only way to do great work is to love what you do.
start 492.723 - end 497.64:  Your work is gonna fill a large part of your life, and the only way to be truly satisfied is to do
start 351.049 - end 351.972: And then I got fired.
