In [1]:
import ast 
import re 
from azure.devops.connection import Connection
from msrest.authentication import BasicAuthentication
import os
from langchain_community.vectorstores.faiss import FAISS
import faiss  
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
from dotenv import load_dotenv
load_dotenv()

#PAT
personal_access_token = os.getenv("AZURE_DEVOPS_PAT")  
organization_url = 'https://dev.azure.com/shwetambhosale18'

# Authenticate using the personal access token
credentials = BasicAuthentication('', personal_access_token)
connection = Connection(base_url=organization_url, creds=credentials)

# Initialize the Git client
git_client = connection.clients.get_git_client()

In [3]:
def get_code_from_repo(repo_name, project_name, file_types=['.py', '.js']):
    repo = git_client.get_repository(project=project_name, repository_id=repo_name)
    
    # Get items (files and directories) from the repository root
    items = git_client.get_items(project=project_name, repository_id=repo.id, recursion_level='Full')

    if not items:
        print("No items found in the repository.")
    
    code = ""
    for item in items:
        if item.is_folder:
            print(f"Skipping folder: {item.path}")
        else:
            print(f"Found file: {item.path}")  # Debug print to show found file paths
            
            if any(item.path.endswith(ext) for ext in file_types):  # Filter by file type
                print(f"Fetching content of: {item.path}")  # Debug print to show file being processed
                
                # Get the content of each item (handle as a generator)
                file_content_generator = git_client.get_blob_content(project=project_name, repository_id=repo.id, sha1=item.object_id)
                
                # Collect content from the generator and decode bytes
                file_content = ''.join([chunk.decode('utf-8') for chunk in file_content_generator])

                if file_content:
                    code += file_content  # Append the content to the `code` string
                else:
                    print(f"No content found for file: {item.path}")  # Debug print if no content is found
    
    return code

In [6]:
def get_txt_text(txt_files_dir):
    text = ""  # Initialize an empty string to collect all the text

    # Traverse through all the files in the given directory
    for file_name in os.listdir(txt_files_dir):
        # Only process .txt files
        if file_name.endswith('.txt'):
            file_path = os.path.join(txt_files_dir, file_name)
            
            # Open and read the file content
            with open(file_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
                text += file_content  # Append the file content to the main text variable
                
                print(f"Reading file: {file_name}")  # Optional debug print

    return text

In [7]:
get_txt_text("output_files")

Reading file: app.py.txt
Reading file: index.js.txt
Reading file: test1.py.txt
Reading file: test2_file_handling.py.txt


'import streamlit as st\n\nimport numpy as np\n\nimport tensorflow as tf\n\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\n\nimport pandas as pd\n\nimport pickle\n\n\n\n# Load the trained model\n\nmodel = tf.keras.models.load_model(\'model.h5\')\n\n\n\n# Load the encoders and scaler\n\nwith open(\'label_encoder_gender.pkl\', \'rb\') as file:\n\n    label_encoder_gender = pickle.load(file)\n\n\n\nwith open(\'onehot_encoder_geo.pkl\', \'rb\') as file:\n\n    onehot_encoder_geo = pickle.load(file)\n\n\n\nwith open(\'scaler.pkl\', \'rb\') as file:\n\n    scaler = pickle.load(file)\n\n\n\n\n\n## streamlit app\n\nst.title(\'Customer Churn PRediction\')\n\n\n\n# User input\n\ngeography = st.selectbox(\'Geography\', onehot_encoder_geo.categories_[0])\n\ngender = st.selectbox(\'Gender\', label_encoder_gender.classes_)\n\nage = st.slider(\'Age\', 18, 92)\n\nbalance = st.number_input(\'Balance\')\n\ncredit_score = st.number_input(\'Credit Score\')\n\nestimated_sala

In [5]:
repo_name = "SPARK"
project_name = "SPARK"  # Replace with your actual project name

code_df = get_code_from_repo(repo_name, project_name)

print(code_df)

Skipping folder: /
Found file: /README.md
Found file: /app.py
Fetching content of: /app.py
File saved as: output_files\app.py.txt
Found file: /index.js
Fetching content of: /index.js
File saved as: output_files\index.js.txt
Found file: /test1.py
Fetching content of: /test1.py
File saved as: output_files\test1.py.txt
Found file: /test2_file_handling.py
Fetching content of: /test2_file_handling.py
File saved as: output_files\test2_file_handling.py.txt
All matching files have been processed.
None


In [6]:
# # Character wise splitting 
def code_to_chunks(code_text):
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500,
        separator="\n",
        length_function=len
    )
    chunks = text_splitter.split_text(code_text)
    return chunks

In [7]:
code_chunks = code_to_chunks(code_df)
code_chunks

["import streamlit as st\r\nimport numpy as np\r\nimport tensorflow as tf\r\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\r\nimport pandas as pd\r\nimport pickle\r\n\r\n# Load the trained model\r\nmodel = tf.keras.models.load_model('model.h5')\r\n\r\n# Load the encoders and scaler\r\nwith open('label_encoder_gender.pkl', 'rb') as file:\r\n    label_encoder_gender = pickle.load(file)\r\n\r\nwith open('onehot_encoder_geo.pkl', 'rb') as file:\r\n    onehot_encoder_geo = pickle.load(file)\r\n\r\nwith open('scaler.pkl', 'rb') as file:\r\n    scaler = pickle.load(file)\r\n\r\n\r\n## streamlit app\r\nst.title('Customer Churn PRediction')\r\n\r\n# User input\r\ngeography = st.selectbox('Geography', onehot_encoder_geo.categories_[0])\r\ngender = st.selectbox('Gender', label_encoder_gender.classes_)\r\nage = st.slider('Age', 18, 92)\r\nbalance = st.number_input('Balance')\r\ncredit_score = st.number_input('Credit Score')\r\nestimated_salary = st.number_input('Est

In [8]:
# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")



In [9]:
def get_code_vectorstore(chunks):
    all_embeddings = []

    for chunk in chunks:
        # Tokenize the chunk
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the embeddings from the last hidden state
        embeddings = outputs.last_hidden_state

        # Average the embeddings to get a fixed-size vector
        mean_embeddings = torch.mean(embeddings, dim=1).squeeze()

        # Store the mean embeddings
        all_embeddings.append(mean_embeddings)

    # Convert to numpy array
    embeddings = torch.stack(all_embeddings).detach().numpy()  # Convert to numpy array

    # Define the dimension of your embeddings
    embedding_dim = embeddings.shape[1]

    # Create a FAISS index
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance for similarity search

    # Add the embeddings to the index
    index.add(embeddings)

    # Now you can return the index along with the texts
    return index, chunks  # Return the index and the original chunks for reference


In [10]:
vectors = get_code_vectorstore(code_chunks)

In [11]:
vectors

(<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001640A9E04B0> >,
 ["import streamlit as st\r\nimport numpy as np\r\nimport tensorflow as tf\r\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\r\nimport pandas as pd\r\nimport pickle\r\n\r\n# Load the trained model\r\nmodel = tf.keras.models.load_model('model.h5')\r\n\r\n# Load the encoders and scaler\r\nwith open('label_encoder_gender.pkl', 'rb') as file:\r\n    label_encoder_gender = pickle.load(file)\r\n\r\nwith open('onehot_encoder_geo.pkl', 'rb') as file:\r\n    onehot_encoder_geo = pickle.load(file)\r\n\r\nwith open('scaler.pkl', 'rb') as file:\r\n    scaler = pickle.load(file)\r\n\r\n\r\n## streamlit app\r\nst.title('Customer Churn PRediction')\r\n\r\n# User input\r\ngeography = st.selectbox('Geography', onehot_encoder_geo.categories_[0])\r\ngender = st.selectbox('Gender', label_encoder_gender.classes_)\r\nage = st.slider('Age', 18, 92)\r\nbalance = st.

# Users Query Embedding

In [12]:
def embed_query(query):
    # Tokenize the query
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    
    # Get embeddings from CodeBERT
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Average the embeddings to get a fixed-size vector
    query_embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
    
    return query_embeddings.detach().numpy()


In [2]:
import os

# Generate a random secret key
secret_key = os.urandom(24)
print(secret_key)

b'\xac\xaa\xd6\xa2\x83\xf7h\xf4\xc9\xd7Q/\xf4@\x12\xfb\xff*\xf8\xe6b{\xc5\xc0'


In [13]:
embed_query("give me function for finding circle area")

array([-2.97147930e-02,  3.80501211e-01,  1.52565494e-01,  3.86769176e-01,
        6.13175809e-01, -1.56934738e-01, -6.54542027e-03,  3.08514088e-01,
        4.44860786e-01,  1.85581058e-01, -2.74414659e-01,  8.82109582e-01,
       -1.28795624e-01, -1.53575122e-01,  8.93909454e-01, -8.20058864e-03,
       -4.28725258e-02,  4.00118649e-01,  2.45825529e-01,  2.26167381e-01,
       -2.06156537e-01, -2.72675246e-01,  3.51542383e-01, -2.29918420e-01,
        2.90648311e-01,  2.27378681e-01,  2.08929658e-01,  5.61467171e-01,
       -2.72135913e-01,  5.84309042e-01, -2.66152173e-01, -1.00609839e-01,
        1.81704164e+00, -3.37376408e-02,  1.78443775e-01, -2.87772834e-01,
       -2.79234932e-03,  1.76950246e-01,  2.23327912e-02, -4.91233356e-02,
       -2.79246829e-02,  3.46576333e-01, -1.18681419e+00,  1.29489815e-02,
        5.77934265e-01,  1.92006111e-01, -1.98998272e-01, -5.76355718e-02,
       -2.13405281e-01,  2.11325482e-01,  4.40747023e-01,  7.27713257e-02,
       -4.97071683e-01, -

In [14]:
# serch in vectore store 
def search_code(query, index, chunks, top_k=5):
    # Embed the query
    query_embedding = embed_query(query)

    # Search for the nearest neighbors
    distances, indices = index.search(np.array([query_embedding]), top_k)
    
    # Retrieve the top K code snippets
    results = [chunks[i] for i in indices[0]]
    
    return results

: 

In [None]:
# Example of how to use the search function
user_query = "function to find circle area"
top_results = search_code(user_query, vectors[0], vectors[1], top_k=3)

In [None]:
# # The `top_results` will contain the closest matching code snippets to the query
# for idx, result in enumerate(top_results):
#     print(f"Result {idx+1}:")
#     print(result)
#     print("\n")

#to get from repo and convert to txt

In [None]:
def main():
    load_dotenv()
    st.set_page_config(page_title="SWOC BOT", page_icon='icon.jpg')
    
    if 'conversation' not in st.session_state:
        st.session_state.conversation = None

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = None

    st.write(css, unsafe_allow_html=True)

    st.header("Chat with PDF :books:", anchor="center")
    
    # Input for user questions
    user_question = st.text_input("Ask a question about a PDF file: ")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Enter Repository Details")

        # Inputs for repository name and project name
        project_name = st.text_input("Project Name")
        repo_name = st.text_input("Repository Name")
        
        st.subheader("Your PDFs")
        pdfs = st.file_uploader("Upload PDFs", type=['pdf'], accept_multiple_files=True)

        # Button to trigger processing
        if st.button("Process"):
            if project_name and repo_name:
                with st.spinner("Processing..."):
                    # Get code from the Azure DevOps repository using user-provided project and repo names
                    get_code_from_repo(repo_name, project_name)

                    # Get the code text from the output .txt files
                    raw_text = get_txt_text("output_files")

                    # Split the raw text into chunks
                    chunks = text_to_chunks(raw_text)
                   
                    # Create a vector store using the code chunks
                    vectorstore = get_vectorstore(chunks)

                    # Create conversation chain and store it in session state
                    st.session_state.conversation = get_conversation_chain(vectorstore)
            else:
                st.warning("Please provide both the project name and repository name.")

In [None]:
def get_txt_text(txt_files_dir):
    text = ""  # Initialize an empty string to collect all the text

    # Traverse through all the files in the given directory
    for file_name in os.listdir(txt_files_dir):
        # Only process .txt files
        if file_name.endswith('.txt'):
            file_path = os.path.join(txt_files_dir, file_name)
            
            # Open and read the file content
            with open(file_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
                text += file_content  # Append the file content to the main text variable
                
                print(f"Reading file: {file_name}")  # Optional debug print

    return text