In [22]:
import os
import openai
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

In [23]:
# import os

# def convert_to_txt(input_folder, output_folder):
#     # Obtener la lista de archivos en la carpeta de entrada
#     file_list = os.listdir(input_folder)

#     for file_name in file_list:
#         input_file_path = os.path.join(input_folder, file_name)

#         # Verificar si es un archivo regular y si la extensión no es .txt
#         if os.path.isfile(input_file_path) and not file_name.endswith('.txt'):
#             # Leer el contenido del archivo
#             with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as original_file:
#                 content = original_file.read()

#             # Crear un nuevo archivo .txt en la carpeta de salida
#             output_file_path = os.path.join(output_folder, file_name + '.txt')
#             with open(output_file_path, 'w', encoding='utf-8') as txt_file:
#                 txt_file.write(f"$$$$ FILE_NAME: {file_name}$$$$\n{content}")

# # Ruta de la carpeta de entrada que contiene los archivos a convertir
# input_folder = 'C:/Users/felip/Documents/Concurso'

# # Ruta de la carpeta de salida donde se guardarán los archivos de texto (.txt)
# output_folder = 'docs'

# # Crear la carpeta de salida si no existe
# if not os.path.exists(output_folder):
#     os.makedirs(output_folder)

# convert_to_txt(input_folder, output_folder)


In [24]:
# Azure OpenAI connection settings.
# The endpoint and key are taken from the environment so no secret lives in
# the notebook. Each value is read once and then written both to the
# OPENAI_* environment variables and to the `openai` module attributes, so
# the two configurations cannot drift apart.
_required_env = ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY")
_missing_env = [name for name in _required_env if name not in os.environ]
if _missing_env:
    # Report every missing variable at once instead of stopping at the first.
    raise KeyError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_key = os.environ["AZURE_OPENAI_KEY"]
AZURE_API_VERSION = "2023-05-15"
AZURE_API_TYPE = "azure"

# Environment-based configuration (presumably what langchain picks up --
# confirm against the installed langchain version).
os.environ["OPENAI_API_BASE"] = azure_endpoint
os.environ["OPENAI_API_KEY"] = azure_key
os.environ["OPENAI_API_VERSION"] = AZURE_API_VERSION
os.environ["OPENAI_API_TYPE"] = AZURE_API_TYPE

# Module-level configuration of the `openai` client.
openai.api_type = AZURE_API_TYPE
openai.api_base = azure_endpoint
openai.api_version = AZURE_API_VERSION
openai.api_key = azure_key

# Names of the Azure deployments used by this notebook.
deployment_name = "codecompass-gpt-16"
emb_name = "codecompass_emb"

In [26]:
# Chat model shared by the chains built in the following cells; it targets
# the Azure deployment named by `deployment_name`.
llm = AzureChatOpenAI(
    deployment_name=deployment_name,
    model_name="gpt-3.5-turbo-16k",
)

In [27]:
# Folder (relative to the notebook's working directory) that holds the files
# to document.
directory = "docs"
# NOTE(review): presumably loads every supported file under `directory` into
# langchain Document objects -- confirm DirectoryLoader's default glob/loader.
loader = DirectoryLoader(directory)
docs = loader.load()

In [44]:
def get_hierarchy(directorio, nivel=0, max_archivos=15):
    """Return an indented text outline of the files and folders in a directory.

    Files are rendered as ``- name`` and directories as ``+ name/``. Every
    nesting level adds two spaces of indentation. Entries are visited in
    sorted order so the outline is identical on every run; ``os.listdir``
    alone returns names in arbitrary order.

    Args:
        directorio: Path of the directory to describe.
        nivel: Nesting level of ``directorio`` inside the outline. It sets the
            indentation and is incremented on each recursive call.
        max_archivos: Despite its name, this limits recursion *depth*, not the
            number of files: a sub-directory is always listed, but it is only
            descended into while ``nivel < max_archivos``.

    Returns:
        The outline as a string, one newline-terminated entry per line (an
        empty string for an empty directory). If ``directorio`` does not exist
        or is not a directory, a Spanish error message is returned instead of
        raising.
    """
    if not os.path.exists(directorio):
        return "El directorio no existe"

    if not os.path.isdir(directorio):
        return "La ruta proporcionada no es un directorio"

    # The indentation depends only on the level, so compute it once.
    indentacion = "  " * nivel
    lineas = []

    for item in sorted(os.listdir(directorio)):
        item_ruta = os.path.join(directorio, item)

        if os.path.isfile(item_ruta):
            lineas.append(f"{indentacion}- {item}\n")
        elif os.path.isdir(item_ruta):
            lineas.append(f"{indentacion}+ {item}/\n")
            if nivel < max_archivos:
                lineas.append(
                    get_hierarchy(item_ruta, nivel=nivel + 1, max_archivos=max_archivos)
                )
        # Entries that are neither a file nor a directory (e.g. broken
        # symlinks) are left out of the outline.

    return "".join(lineas)

# Root folder of the project whose layout is described to the model.
# The CODECOMPASS_PROJECT_DIR environment variable overrides the default so
# the notebook can run on a machine other than the author's.
directorio_a_analizar = os.environ.get(
    "CODECOMPASS_PROJECT_DIR", 'C:/Users/felip/Documents/Concurso'
)
# get_hierarchy reports a bad path by *returning* an error message, which
# would otherwise end up inside the prompt as if it were the folder layout.
# Fail here instead.
if not os.path.isdir(directorio_a_analizar):
    raise NotADirectoryError(
        f"Project directory not found or not a directory: {directorio_a_analizar!r}"
    )
hierarchy = get_hierarchy(directorio_a_analizar, max_archivos=15)
# print(hierarchy)


In [45]:
file_name = "converter.py"
# The hierarchy is pasted into the template text *before* PromptTemplate
# parses it, so a file or folder name containing "{" or "}" would be read as
# a template placeholder. Doubling the braces makes them literal; `{{text}}`
# below is the only real input variable.
# NOTE(review): the prompt wording (including the typos "beggining" and
# "sorrounded") is kept exactly as it was so the model input is unchanged.
hierarchy_escaped = hierarchy.replace("{", "{{").replace("}", "}}")
prompt_template = f"""You are an assistant that helps developers document \
    or answer questions about software development projects. The project's \
    current folder hierarchy is denoted by #### characters and is the following: \
    ####{hierarchy_escaped}####.\n
    You have to document the following text that could be a piece of code or \
    a comment about the code, or simple text. The text is denoted by triple backticks \
    and is the following: \n
    ```{{text}}```\n
    1. The name of this file is at the beggining sorrounded by $$$$. \n
    2. Make sure that the documentation is correct and that it is consistent with the \
    project's hierarchy.
    3. If you are not sure about the documentation, don't invent \
    anything and just write what you know. 
    4. Add the file name at the beggining of the documentation and use the format .md. \
    5. Don't put the hierarchy in any documentation.
    6. Be the more concise as possible. 
    DOCUMENTATION:"""
prompt = PromptTemplate.from_template(prompt_template)

In [46]:
# Single-call documentation pipeline: the documents handed to `stuff_chain`
# fill the `text` variable of `prompt`, which `llm_chain` then sends to `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)
stuff_chain = StuffDocumentsChain(document_variable_name="text", llm_chain=llm_chain)

In [49]:
# `chunk_size` is used here as the splitter's chunk size and, in the
# documentation loop further down, as the token-count threshold that decides
# whether a file is split and summarized.
chunk_size = 5000
# NOTE(review): `text_splitter` is rebound with different settings in the
# cell that runs the documentation loop.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
)

In [55]:
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain

In [68]:
# Document every loaded file and save the result as documentation/<index>.md.
# A file whose content is at least `chunk_size` tokens is split into pieces
# and condensed with a map-reduce summarize chain; a smaller file is sent
# through `stuff_chain` (the custom documentation prompt) in a single call.
# NOTE(review): `save_text_as_md` is not defined in the cells shown here; it
# must already exist in the kernel when this cell runs.
large_docs = []
summarizes = []  # summaries produced for the large files only
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=7000, chunk_overlap=100)
summary_chain = load_summarize_chain(llm=llm, chain_type='map_reduce')

# Make sure the output folder exists before anything is written into it.
output_dir = 'documentation'
os.makedirs(output_dir, exist_ok=True)

for i, doc in enumerate(docs):
    if llm.get_num_tokens(doc.page_content) >= chunk_size:
        splitted_doc = text_splitter.split_documents([doc])
        documentation_text = summary_chain.run(splitted_doc)
        summarizes.append(documentation_text)
    else:
        documentation_text = stuff_chain.run([doc])

    md_file_path = f'{output_dir}/{i}.md'
    save_text_as_md(documentation_text, md_file_path)

In [62]:
# Display the summaries collected for the large files.
summarizes

['The code in the "neoqa.ipynb" file includes imports of libraries and modules for natural language processing and question-answering. It defines a class called "Query" with methods for loading and splitting documents, creating a database, and making queries. The code uses a sentence transformer model and a Chroma vector store for similarity search, and sets environment variables for the OpenAI API. The "make_query" method performs a similarity search and returns matching documents. The code encountered a ValueError due to missing input keys in the query.',
 'The given code imports libraries, sets environment variables, loads data, and defines functions for natural language processing tasks. However, it throws a ValueError due to missing input keys. The code includes various functions and modules related to error handling, text manipulation, question answering, and retrieval. Some parts of the code are commented out. The code file is in IPython format with a version of 3 and can be exp