In [8]:
import ast 
import re 
from azure.devops.connection import Connection
from msrest.authentication import BasicAuthentication
import os
from transformers import AutoTokenizer, AutoModel
import torch
from langchain_community.vectorstores.faiss import FAISS

In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment, so the token never has to be written into the notebook itself.
load_dotenv()

# Personal access token (PAT) used to authenticate against Azure DevOps.
personal_access_token = os.getenv("AZURE_DEVOPS_PAT")
if not personal_access_token:
    # Fail fast: os.getenv returns None when the variable is unset, and that
    # None would otherwise be handed to BasicAuthentication as the password.
    raise RuntimeError(
        "AZURE_DEVOPS_PAT is not set; define it in the environment or in a .env file."
    )
organization_url = 'https://dev.azure.com/shwetambhosale18'

# Authenticate using the personal access token (the user name is left empty)
credentials = BasicAuthentication('', personal_access_token)
connection = Connection(base_url=organization_url, creds=credentials)

# Initialize the Git client; later cells use this module-level `git_client`
git_client = connection.clients.get_git_client()

In [3]:
def get_code_from_repo(repo_name, project_name, file_types=('.py', '.js')):
    """Download and concatenate the source files of an Azure DevOps Git repository.

    Uses the module-level ``git_client``. Every non-folder item whose path ends
    with one of ``file_types`` is fetched as a blob and decoded as UTF-8. The
    decoded texts are joined in listing order, without any separator.

    Args:
        repo_name: Repository name or id, passed to ``get_repository``.
        project_name: Project that owns the repository.
        file_types: Iterable of path suffixes to keep (list or tuple).

    Returns:
        str: Concatenated content of all matching files; ``""`` when the
        repository listing is empty or nothing matches.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    repo = git_client.get_repository(project=project_name, repository_id=repo_name)

    # Get items (files and directories) from the repository, recursing fully
    items = git_client.get_items(project=project_name, repository_id=repo.id, recursion_level='Full')

    if not items:
        # Covers both an empty listing and a None result; nothing to iterate.
        print("No items found in the repository.")
        return ""

    # str.endswith accepts a tuple of suffixes, so normalise once up front.
    suffixes = tuple(file_types)

    collected = []
    for item in items:
        if item.is_folder:
            print(f"Skipping folder: {item.path}")
            continue

        print(f"Found file: {item.path}")  # Debug print to show found file paths

        if not item.path.endswith(suffixes):  # Filter by file type
            continue

        print(f"Fetching content of: {item.path}")  # Debug print to show file being processed

        # The blob content is consumed as an iterable of byte chunks.
        blob_chunks = git_client.get_blob_content(project=project_name, repository_id=repo.id, sha1=item.object_id)

        # Join the raw bytes BEFORE decoding: a multi-byte UTF-8 character can be
        # split across two chunks, and decoding each chunk on its own would then
        # raise UnicodeDecodeError on perfectly valid files.
        file_content = b''.join(blob_chunks).decode('utf-8')

        if file_content:
            collected.append(file_content)
        else:
            print(f"No content found for file: {item.path}")  # Debug print if no content is found

    # Single join instead of repeated string concatenation inside the loop.
    return ''.join(collected)

In [4]:
# Target project and repository on Azure DevOps.
project_name = "SPARK"  # Replace with your actual project name
repo_name = "SPARK"

# NOTE: despite the `_df` suffix, `code_df` is a plain str holding the
# concatenated text of every matching file (no DataFrame is involved).
code_df = get_code_from_repo(repo_name=repo_name, project_name=project_name)

print(code_df)

Skipping folder: /
Found file: /README.md
Found file: /app.py
Fetching content of: /app.py
Found file: /index.js
Fetching content of: /index.js
import streamlit as st
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pandas as pd
import pickle

# Load the trained model
model = tf.keras.models.load_model('model.h5')

# Load the encoders and scaler
with open('label_encoder_gender.pkl', 'rb') as file:
    label_encoder_gender = pickle.load(file)

with open('onehot_encoder_geo.pkl', 'rb') as file:
    onehot_encoder_geo = pickle.load(file)

with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)


## streamlit app
st.title('Customer Churn PRediction')

# User input
geography = st.selectbox('Geography', onehot_encoder_geo.categories_[0])
gender = st.selectbox('Gender', label_encoder_gender.classes_)
age = st.slider('Age', 18, 92)
balance = st.number_input('Balance')
credit_score = st.number_input('Cr

In [5]:
from langchain.text_splitter import CharacterTextSplitter
# Character-wise splitting
def code_to_chunks(code_text, chunk_size=2000, chunk_overlap=500, separator="\n"):
    """Split source text into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        code_text: The text to split.
        chunk_size: Target maximum chunk length, measured with ``len``
            (i.e. in characters). Defaults to the previously hard-coded 2000.
        chunk_overlap: Overlap between neighbouring chunks, measured the same
            way. Defaults to the previously hard-coded 500.
        separator: String on which the text is split before pieces are merged
            back into chunks. Defaults to a newline.

    Returns:
        list[str]: The chunks produced by ``split_text``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=separator,
        length_function=len,
    )
    return text_splitter.split_text(code_text)

In [6]:
# Split the concatenated repository source (`code_df` is a str, despite its name) into chunks.
code_chunks = code_to_chunks(code_df)
# Bare last expression: the notebook displays the resulting list of chunk strings.
code_chunks

["import streamlit as st\r\nimport numpy as np\r\nimport tensorflow as tf\r\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\r\nimport pandas as pd\r\nimport pickle\r\n\r\n# Load the trained model\r\nmodel = tf.keras.models.load_model('model.h5')\r\n\r\n# Load the encoders and scaler\r\nwith open('label_encoder_gender.pkl', 'rb') as file:\r\n    label_encoder_gender = pickle.load(file)\r\n\r\nwith open('onehot_encoder_geo.pkl', 'rb') as file:\r\n    onehot_encoder_geo = pickle.load(file)\r\n\r\nwith open('scaler.pkl', 'rb') as file:\r\n    scaler = pickle.load(file)\r\n\r\n\r\n## streamlit app\r\nst.title('Customer Churn PRediction')\r\n\r\n# User input\r\ngeography = st.selectbox('Geography', onehot_encoder_geo.categories_[0])\r\ngender = st.selectbox('Gender', label_encoder_gender.classes_)\r\nage = st.slider('Age', 18, 92)\r\nbalance = st.number_input('Balance')\r\ncredit_score = st.number_input('Credit Score')\r\nestimated_salary = st.number_input('Est

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model from Hugging Face
# Both are created from the same "microsoft/codebert-base" checkpoint name and are
# bound to the module-level names `tokenizer` and `model` that later cells read.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")



In [9]:
# Load the tokenizer and model from Hugging Face
# NOTE(review): this cell repeats the loading statements of the earlier cell (In [7])
# verbatim; running it only rebinds `tokenizer` and `model` to freshly loaded objects.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")



In [10]:
# Compute one pooled embedding vector per chunk in `code_chunks`.
# Starts as a list of per-chunk tensors; rebound to a single stacked tensor at the end.
all_embeddings = []

for chunk in code_chunks:
    # Tokenize the chunk
    # truncation=True lets the tokenizer cut the chunk down to its maximum input
    # length (presumably 512 tokens for this checkpoint — TODO confirm), so long
    # chunks may be only partially embedded.
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
    # Forward pass without gradient tracking (inference only)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings from the last hidden state
    # NOTE: `embeddings` is a notebook-level name; after the loop it still holds
    # the hidden states of the LAST chunk only.
    embeddings = outputs.last_hidden_state

    # Average the embeddings to get a fixed-size vector
    # (mean over dim=1, then squeeze() drops every size-1 axis)
    mean_embeddings = torch.mean(embeddings, dim=1).squeeze()

    # Store the mean embeddings
    all_embeddings.append(mean_embeddings)

# Convert to tensor or numpy array if needed
# torch.stack turns the list of per-chunk vectors into one tensor with a leading chunk axis.
all_embeddings = torch.stack(all_embeddings)


In [11]:
# NOTE(review): `embeddings` is the loop variable left over from the embedding loop above,
# i.e. the last_hidden_state of the final chunk only; the stacked per-chunk vectors are in
# `all_embeddings`.
embeddings

tensor([[[-7.9890e-02, -7.9689e-02, -5.2989e-02,  ..., -5.9282e-02,
          -5.1422e-01,  5.0983e-01],
         [-6.0591e-01,  1.4039e-01, -1.1477e-03,  ..., -8.0542e-01,
          -4.9629e-01,  1.1576e+00],
         [-4.5256e-01,  5.9092e-01,  5.5482e-01,  ..., -1.0409e+00,
          -5.1213e-01,  1.1935e+00],
         ...,
         [-2.2689e-01, -3.9840e-01,  1.0522e-01,  ..., -3.4415e-01,
          -4.7885e-01,  5.6899e-01],
         [ 7.3044e-02, -1.9225e-01,  8.3394e-02,  ..., -5.1262e-01,
          -5.4995e-01,  6.2564e-01],
         [-8.1032e-02, -7.9339e-02, -5.2885e-02,  ..., -6.0924e-02,
          -5.1473e-01,  5.1066e-01]]])

In [12]:
import faiss  # Make sure to import faiss
import numpy as np

def get_code_vectorstore(chunks):
    """Embed text chunks with the module-level model and index them in FAISS.

    Each chunk is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` without gradient tracking, and reduced to a
    single vector by averaging the last hidden state over the token axis.
    The vectors are added, in order, to a ``faiss.IndexFlatL2`` index, so row
    ``i`` of the index corresponds to the ``i``-th chunk.

    Args:
        chunks: Iterable of text chunks (strings) to embed.

    Returns:
        tuple: ``(index, chunks)`` — the populated FAISS index and the same
        ``chunks`` object that was passed in, for mapping hits back to text.

    Raises:
        ValueError: If ``chunks`` yields no items (there is nothing to index
            and the embedding dimension cannot be determined).
    """
    pooled_vectors = []

    for chunk in chunks:
        # Tokenize the chunk. truncation=True lets the tokenizer cut the text to
        # its maximum input length, so very long chunks may be embedded partially.
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean over the token axis (dim=1) leaves a (1, hidden) tensor; squeeze(0)
        # removes only that batch axis instead of every size-1 axis.
        pooled = torch.mean(outputs.last_hidden_state, dim=1).squeeze(0)
        pooled_vectors.append(pooled)

    if not pooled_vectors:
        # torch.stack([]) would fail with an opaque error; report the real cause.
        raise ValueError("chunks must contain at least one text chunk to build an index")

    # FAISS expects a C-contiguous float32 matrix living in host memory.
    embeddings = np.ascontiguousarray(
        torch.stack(pooled_vectors).detach().cpu().numpy(), dtype=np.float32
    )

    # One row per chunk; the second axis is the embedding dimension.
    embedding_dim = embeddings.shape[1]

    # Flat (exact) index using L2 distance for similarity search
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(embeddings)

    # Return the index together with the original chunks for reference
    return index, chunks


In [13]:
# Despite its name, `vectors` is the 2-tuple (FAISS index, original chunks) returned by
# get_code_vectorstore, not an array of vectors.
vectors = get_code_vectorstore(code_chunks)

In [14]:
# Display the (index, chunks) tuple held in `vectors`.
vectors

(<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000234164CA0A0> >,
 ["import streamlit as st\r\nimport numpy as np\r\nimport tensorflow as tf\r\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\r\nimport pandas as pd\r\nimport pickle\r\n\r\n# Load the trained model\r\nmodel = tf.keras.models.load_model('model.h5')\r\n\r\n# Load the encoders and scaler\r\nwith open('label_encoder_gender.pkl', 'rb') as file:\r\n    label_encoder_gender = pickle.load(file)\r\n\r\nwith open('onehot_encoder_geo.pkl', 'rb') as file:\r\n    onehot_encoder_geo = pickle.load(file)\r\n\r\nwith open('scaler.pkl', 'rb') as file:\r\n    scaler = pickle.load(file)\r\n\r\n\r\n## streamlit app\r\nst.title('Customer Churn PRediction')\r\n\r\n# User input\r\ngeography = st.selectbox('Geography', onehot_encoder_geo.categories_[0])\r\ngender = st.selectbox('Gender', label_encoder_gender.classes_)\r\nage = st.slider('Age', 18, 92)\r\nbalance = st.