# Summarize

Summarize Teams conversations from their VTT transcript files using ChatGPT

https://webvtt-py.readthedocs.io/en/latest/usage.html

In [1]:
import os
import webvtt

In [2]:
# List the available transcripts. os.listdir returns entries in arbitrary
# order, so sort them to keep this cell's output stable between runs.
sorted(os.listdir('../vtt'))

['YannMike_2023-03-08.vtt']

In [3]:
# Parse the Teams transcript and print its first captions (at most ten) as a
# quick sanity check.
file = 'YannMike_2023-03-08.vtt'
chat = webvtt.read('../vtt/'+file)
for caption in chat[0:10]:
    # raw_text is printed on purpose: the output keeps the <v Speaker> voice
    # tags. caption.text and caption.start / caption.end are the other
    # per-caption fields that have been tried in this preview.
    print(caption.raw_text)

<v Yann>Why is my French accent?</v>
<v Yann>I I want to stop doing this experiment where.</v>
<v Yann>We some a conversation we record it's generating a VTT file.</v>
<v Yann>And I have a a a parcel. I developed a small app in Python that can retrieve the VTT like process it OK.</v>
<v Yann>And then.</v>
<v Yann>I want to feed it through the to GPT stuff API and the API is amazing. I don't know if you've played around with it already.</v>
<v Mike>No, I need to though the. Yeah, the problem is I can't find too many things. I promised people so every day.</v>


In [4]:
txt = 'YannMike_2023-03-08.txt'

# Flatten the transcript into plain text, one caption per line. The pieces
# are joined straight from a generator rather than collected in a list named
# `str`, which would shadow the builtin for the rest of the kernel session.
sep = '\n'
convo = sep.join(caption.text for caption in chat)

# mode='w' overwrites the file, so re-running the cell never duplicates
# lines. UTF-8 is requested explicitly so non-ASCII characters are written
# the same way regardless of the platform's default encoding.
with open('../txt/' + txt, mode='w', encoding='utf-8') as f:
    f.write(convo)

## How many Tokens?
- https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
- https://github.com/openai/tiktoken
- https://platform.openai.com/tokenizer

In [5]:
# First-order estimate: count whitespace-separated words rather than real
# model tokens.
tokens = convo.split() # no separator given, so this splits on runs of whitespace
num_tokens = len(tokens)
num_tokens

# Alternative word-level count that was tried (requires `pip install nltk`):
#     import nltk
#     len(nltk.word_tokenize(convo))

98

In [6]:
import tiktoken
encoding_name = 'cl100k_base'
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(convo))
num_tokens

118

## Summarize

In [8]:
import os
import tomli
import openai
# Read the API key from the local Streamlit secrets file so the key itself
# never appears in the notebook.
with open('../.streamlit/secrets.toml', 'rb') as f:
    toml_dict = tomli.load(f)

# Same key, two destinations, assigned left to right: the openai module
# attribute first, then the environment variable (presumably what the
# LangChain wrappers below read -- confirm).
openai.api_key = os.environ['OPENAI_API_KEY'] = toml_dict['OPEN_AI_KEY']

In [9]:
# Prompt layout: the system message carries the instruction, the user message
# carries the plain-text transcript.
context = 'summarize the following conversation'
model = 'gpt-3.5-turbo'  # 'gpt-4' is the alternative that was tried here
messages = [
    {'role': 'system', 'content': context},
    {'role': 'user', 'content': convo},
]
completion = openai.ChatCompletion.create(model=model, messages=messages)

# The summary text is the content of the first returned choice.
completion.choices[0].message.content

'The conversation appears to be about using an app in Python to retrieve a VTT file generated from a recorded conversation, and then feeding it through the GPT stuff API. The speaker mentions their French accent and expresses difficulty in finding information on the topic. The second person expresses interest in the API and mentions needing to try it out.'

## Summarize with LangChain

In [20]:
from langchain.llms import OpenAI
# Instantiate LangChain's OpenAI wrapper with its defaults; displaying the
# object shows the resulting settings (model_name, temperature, max_tokens, ...).
llm = OpenAI()
llm

OpenAI(cache=None, verbose=False, callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x000001C723A661D0>, client=<class 'openai.api_resources.completion.Completion'>, model_name='text-davinci-003', temperature=0.7, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0, n=1, best_of=1, model_kwargs={}, openai_api_key=None, batch_size=20, request_timeout=None, logit_bias={}, max_retries=6, streaming=False)

In [44]:
# Smoke test: call the wrapper with a prompt string and print the completion.
print(llm('tell me a joke'))



Q: What did the fish say when it hit the wall?
A: Dam!


In [19]:
from langchain.chat_models import ChatOpenAI
# Chat-model wrapper that is passed to the summarize chains further down.
# NOTE(review): this rebinds `chat`, which earlier in the notebook holds the
# parsed VTT transcript; re-running the transcript cells after this one would
# iterate over the model object instead. Consider a distinct name.
chat = ChatOpenAI(model_name='gpt-3.5-turbo')
chat

ChatOpenAI(verbose=False, callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x000001C723A661D0>, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.7, model_kwargs={}, openai_api_key=None, request_timeout=60, max_retries=6, streaming=False, n=1, max_tokens=None)

In [62]:
from langchain.document_loaders import UnstructuredFileLoader

# Load the exported plain-text transcript back in as LangChain documents
# (the result is indexed as doc[0] in the next cell).
transcript_path = "../txt/YannMike_2023-03-08.txt"
loader = UnstructuredFileLoader(transcript_path)
doc = loader.load()

In [63]:
# Inspect the text of the first loaded document.
doc[0].page_content

"Why is my French accent?\n\nI I want to stop doing this experiment where.\n\nWe some a conversation we record it's generating a VTT file.\n\nAnd I have a a a parcel. I developed a small app in Python that can retrieve the VTT like process it OK.\n\nAnd then.\n\nI want to feed it through the to GPT stuff API and the API is amazing. I don't know if you've played around with it already.\n\nNo, I need to though the. Yeah, the problem is I can't find too many things. I promised people so every day."

In [64]:
from langchain.chains.summarize import load_summarize_chain
# 'stuff' chain over the short transcript. verbose=True makes the chain echo
# the fully formatted prompt, which shows the whole text going into a single
# request.
chain = load_summarize_chain(llm=chat, chain_type='stuff', verbose=True)
chain.run(doc)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"Why is my French accent?

I I want to stop doing this experiment where.

We some a conversation we record it's generating a VTT file.

And I have a a a parcel. I developed a small app in Python that can retrieve the VTT like process it OK.

And then.

I want to feed it through the to GPT stuff API and the API is amazing. I don't know if you've played around with it already.

No, I need to though the. Yeah, the problem is I can't find too many things. I promised people so every day."


CONCISE SUMMARY:[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'The speaker is discussing an experiment where they record conversations and generate a VTT file, which they then retrieve and process using a Python app they developed. They plan to feed it through the GPT API, but are struggling to find enough information about it. They mention having a French accent, but do not elaborate further.'

In [58]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter that cuts a document into chunks of at most 3000 with an overlap
# of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

# NOTE: `loader` and `doc` are rebound here to a different, longer file, so
# the earlier cells that display `doc` will show this file if re-run afterwards.
loader = UnstructuredFileLoader("../txt/chat.txt")
doc = loader.load()

# Number of chunks the map_reduce chain will have to process.
docs = text_splitter.split_documents(doc)
len(docs)

18

In [None]:
# 'map_reduce' chain run over the chunked documents. verbose=True (as used
# for the 'stuff' run) is deliberately left off here.
chain = load_summarize_chain(llm=chat, chain_type='map_reduce')
chain.run(docs)