In [1]:
import os
import ast
from dotenv import load_dotenv
from langchain_qdrant import QdrantVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings,ChatGoogleGenerativeAI
from langchain_community.document_loaders import GithubFileLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter,Language

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message. The previous
# `os.environ[k] = os.getenv(k)` round-trip was a no-op when the key was set
# and raised "TypeError: str expected, not NoneType" when it was not.
_missing_vars = [
    name
    for name in ("GOOGLE_API_KEY", "GITHUB_PERSONAL_ACCESS_TOKEN")
    if not os.getenv(name)
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# Token handed to the GitHub loader in a later cell.
access_token = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")

In [4]:
# Loader for the project repository on GitHub; only files whose path passes
# `file_filter` are fetched.
loader = GithubFileLoader(
    repo = "rishabhpancholi/fake-job-detection-mlops-project",
    branch = "main",
    access_token = access_token,
    github_api_url = "https://api.github.com",
    # ".md" (with the dot) so only Markdown files match; a bare "md" suffix
    # would also accept any path that merely ends in those two letters.
    file_filter=lambda file_path: file_path.endswith(
        (".py",".yml",".yaml","Dockerfile",".md")
    )
)

In [5]:
# Fetch every repository file accepted by the loader's file filter.
documents = loader.load()

In [6]:
# Preview the raw text of the first loaded document.
documents[0].page_content

'# Github Actions Workflow\nname: MLOPS CICD\n\n# When this workflow should run\non:\n  push:\n    branches:\n      - main # Runs the workflow whenever code is pushed to the main branch\n\njobs:\n  mlops_workflow:\n    runs-on: ubuntu-latest\n    environment: production\n\n    env:\n      MODEL_NAME: ${{ secrets.MODEL_NAME }}\n      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}\n      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n      MLFLOW_TRACKING_USERNAME: ${{ secrets.MLFLOW_TRACKING_USERNAME }}\n      MLFLOW_TRACKING_PASSWORD: ${{ secrets.MLFLOW_TRACKING_PASSWORD }}\n      MODEL_TRAINER_REPO_OWNER: ${{ secrets.MODEL_TRAINER_REPO_OWNER }}\n      MODEL_TRAINER_REPO_NAME: ${{ secrets.MODEL_TRAINER_REPO_NAME }}\n      DVC_REMOTE_ACCESS_KEY_ID: ${{ secrets.DVC_REMOTE_ACCESS_KEY_ID }}\n      DVC_REMOTE_SECRET_ACCESS_KEY: ${{ secrets.DVC_REMOTE_SECRET_ACCESS_KEY }}\n\n    steps: # Steps that run inside this job\n      # Step 1: Checkout the repository to the runner\

In [7]:
def file_type_tagger(documents):
    """Annotate each document's metadata with ``file_type`` and ``file_name``.

    Both values are derived from the last component of ``metadata["path"]``:

    * ``file_type`` is the text after the final dot, with ``yml`` normalised
      to ``yaml``. Files without any dot in their name (e.g. ``Dockerfile``)
      use the file name itself as their type.
    * ``file_name`` is the part of the file name before its first dot.

    Args:
        documents: Iterable of objects exposing a ``metadata`` dict that
            contains a ``"path"`` key.

    Returns:
        The same ``documents`` object; metadata is updated in place.
    """
    for doc in documents:
        # Work on the final path component only, so that dots or slashes in
        # directory names (".github/", "api/") never leak into the result.
        basename = doc.metadata["path"].split("/")[-1]
        _, dot, extension = basename.rpartition(".")
        if not dot:
            # Extensionless file: the name is the most useful type label.
            file_type = basename
        elif extension == "yml":
            file_type = "yaml"
        else:
            file_type = extension
        doc.metadata["file_type"] = file_type
        doc.metadata["file_name"] = basename.split(".")[0]
    return documents

# file_type_tagger updates each document's metadata in place and returns the same collection.
documents = file_type_tagger(documents)

In [8]:
def python_code_parser(code):
    """Collect the names of all functions and classes defined in Python source.

    Args:
        code: Python source code as a string.

    Returns:
        A ``(functions, classes)`` tuple of name lists in ``ast.walk`` order.
        Methods and nested definitions are included, and ``async def``
        functions are reported alongside regular ones. Both lists are empty
        when ``code`` is not parsable Python.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        # One unparsable file should not abort metadata extraction for the
        # whole repository; it simply contributes no names.
        return [], []

    functions = []
    classes = []

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            functions.append(node.name)
        elif isinstance(node, ast.ClassDef):
            classes.append(node.name)

    return functions, classes

In [9]:
# Attach the names of defined functions and classes to every document's
# metadata; only Python files are parsed, everything else gets empty lists.
for doc in documents:
    if doc.metadata["file_type"] != "py":
        doc.metadata.update(functions=[], classes=[])
        continue
    functions, classes = python_code_parser(doc.page_content)
    doc.metadata.update(functions=functions, classes=classes)

In [10]:
# Inspect the enriched metadata of one document (32 is a hard-coded position in the loaded list).
documents[32].metadata

{'path': 'src/stages/data_transformation.py',
 'sha': 'b08731404b1cb326434356064b02c9c30ee4dd13',
 'source': 'https://api.github.com/rishabhpancholi/fake-job-detection-mlops-project/blob/main/src/stages/data_transformation.py',
 'file_type': 'py',
 'file_name': 'data_transformation',
 'functions': ['__init__',
  'lemmatize_text',
  'clean_data',
  'impute_data',
  'transform'],
 'classes': ['DataTransformation']}

In [11]:
# Chunking parameters shared by both splitters.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}

# Splitter configured for Python source.
python_text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, **splitter_settings
)

# Splitter configured for Markdown text.
markdown_text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, **splitter_settings
)

In [12]:
# Python files go through the Python splitter ...
python_documents = [doc for doc in documents if doc.metadata["file_type"] == "py"]
python_chunks = python_text_splitter.split_documents(python_documents)

# ... and every other file type goes through the Markdown splitter.
non_python_documents = [doc for doc in documents if doc.metadata["file_type"] != "py"]
markdown_chunks = markdown_text_splitter.split_documents(non_python_documents)

In [13]:
# Spot-check one chunk produced by the Python splitter.
python_chunks[4].page_content

'# General Imports\nfrom fastapi import FastAPI\nfrom fastapi.middleware.cors import CORSMiddleware\n\n# Package Imports\nfrom api.routes import home_router,prediction_router\nfrom api.exception import register_exception_handlers\n\n# Initialize FastAPI\napp = FastAPI(\n    title="Real/Fake Job Detection API",\n    description="API for detecting real or fake job postings using NLP and ML models",\n)\n\n# Adding Middlewares\napp.add_middleware(\n    CORSMiddleware,\n    allow_origins=["*"],\n    allow_credentials=True,\n    allow_methods=["*"],\n    allow_headers=["*"],\n)\n\n# Register Exception Handlers\nregister_exception_handlers(app)\n\n# Include Routers\napp.include_router(home_router)\napp.include_router(prediction_router)'

In [14]:
# Spot-check one chunk produced by the Markdown splitter (this list holds all non-Python files).
markdown_chunks[4].page_content

'# Step 8:\n      - name: Deploy to EC2 instance\n        uses: appleboy/ssh-action@v1.0.3\n        env:\n          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}\n          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n          AWS_REGION: ap-south-1\n        with:\n          host: ${{ secrets.EC2_HOST }}\n          username: ubuntu\n          key: ${{ secrets.EC2_SSH_KEY }}\n          script: |\n            echo "Logging to ECR"\n            aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/j4r9x7x2\n            echo "Stopping old container"\n            docker-compose down\n            echo "Removing old image"\n            docker image prune -af\n            echo "Running new container"\n            docker-compose up -d'

In [15]:
# Local endpoint on port 6333 — presumably the Qdrant server address (TODO confirm).
url = "http://localhost:6333/"

In [16]:
# Embedding model handed to the vector store when the chunks are indexed.
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [17]:
# Build a new list instead of aliasing `python_chunks` and extending it:
# the alias also grew `python_chunks` itself, and every re-run of this cell
# appended the markdown chunks once more.
total_chunks = python_chunks + markdown_chunks

In [18]:
# Embed every chunk and store it in the "codebase_embeddings" collection.
# The endpoint defined in `url` is passed explicitly so the target server is
# visible and configurable here instead of depending on client defaults; the
# trailing slash is dropped so the client receives a bare base URL.
vectorstore = QdrantVectorStore.from_documents(
    total_chunks,
    embeddings,
    url=url.rstrip("/"),
    collection_name="codebase_embeddings",
)

In [19]:
# Retriever over the vector store, configured with k=5 results per query.
retriever = vectorstore.as_retriever(search_kwargs = {"k":5})

In [20]:
# Smoke-test the retriever with a question about the model trainer class.
query = "Which methods are implemented in my model trainer class?"
docs = retriever.invoke(query)

In [21]:
# Display the documents returned for the sample query.
docs

[Document(metadata={'path': 'src/stages/model_trainer.py', 'sha': '57ce1bd7e47f68ad94d96d9469657aa2d8d5280d', 'source': 'https://api.github.com/rishabhpancholi/fake-job-detection-mlops-project/blob/main/src/stages/model_trainer.py', 'file_type': 'py', 'file_name': 'model_trainer', 'functions': ['__init__', 'build_model_pipeline', 'train_model', 'log_experiment', 'train'], 'classes': ['ModelTrainer'], '_id': '1512e0d8-4c8d-49ae-be5e-e6d0f6d0a376', '_collection_name': 'codebase_embeddings'}, page_content='class ModelTrainer:\n    """Class for model trainer stage"""\n    def __init__(self, logger = get_logger(\'model_trainer\'), utils = Utility()):\n        """Initializes the model trainer class"""\n        try:\n            self.transformed_train_file_path = Path(ARTIFACTS_DIR)/DATA_TRANSFORMATION_DIR_NAME/DATA_TRANSFORMATION_TRANSFORMED_TRAIN_FILE_NAME\n            self.transformed_test_file_path = Path(ARTIFACTS_DIR)/DATA_TRANSFORMATION_DIR_NAME/DATA_TRANSFORMATION_TRANSFORMED_TEST_FIL

In [122]:
# Chat model; used below as the compressor of the compression retriever.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [None]:
class ContextualCompressionRetriever:
    """Retriever wrapper that asks a model to condense the retrieved documents.

    The wrapped retriever fetches candidate documents; their content and
    metadata are embedded in a prompt that instructs the compressor to keep
    only the ``k`` most relevant ones and cite their sources.
    """

    def __init__(self,base_retriever,compressor,k):
        """Store the wrapped retriever, the compressing model and the selection size."""
        self.base_retriever, self.compressor, self.k = base_retriever, compressor, k

    def invoke(self,query):
        """Retrieve documents for ``query`` and return the compressor's reply.

        The return value is whatever ``compressor.invoke`` returns for the
        assembled prompt.
        """
        retrieved = self.base_retriever.invoke(query)
        # One text section per retrieved document: page content plus metadata.
        context = "".join(
            f"""
             Document page content: {item.page_content}
             Document metadata: {item.metadata}
            """
            for item in retrieved
        )
        prompt = f"""
        You are a specialised contextual compression retriever.
        Here is the user's query: {query}
        For this query this is the context of every retrieved document along with its metadata:
        {context}

        I want you to select only {self.k} of them (apply it very strictly) which are most relevant to the users query and return and combine them into one string and return it.

        Also mention the source of the files from the metadata(do it strictly).
        """

        return self.compressor.invoke(prompt)
        

In [127]:
# Wrap the vector-store retriever: the LLM condenses its results down to 3 documents.
compression_retriever = ContextualCompressionRetriever(retriever,llm,3)

In [129]:
# NOTE(review): repeats the earlier `llm` assignment with identical arguments;
# `compression_retriever` above still holds the previously created instance.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [None]:
# End-to-end question answering: retrieve and compress context, then ask the LLM.
query = "How is the continous deployment working in github actions?"

# The compression retriever returns the compressor's reply; `.content` is its text.
context = compression_retriever.invoke(query).content

# Final answering prompt: compressed context first, then the user's question and answer-formatting rules.
prompt = f""" You are a codebase agent who answers queries regarding certain piece of code. For Eg, How it works and if someone asks about a particular class or function in a file you skillfully handle it and tell it accordingly like an
              Excellent teacher.
           Here is the entire code context for your reference:
           {context}
           Now based on this code context this is the user's query:
           {query}
           Please generate a very good answer for the user.

           Please just tell your answer along with the code snippets without telling which document you are referring to (Apply this rule very strictly).
           After each answer tell the source provided in the context (again very strict rule).
           """

# Text of the model's answer to the final prompt.
response = llm.invoke(prompt).content

# Display the answer as the cell output.
response

"The continuous deployment process in GitHub Actions is a streamlined, automated workflow designed to ensure that the latest code changes are continuously integrated and deployed. Here's a breakdown of how it works:\n\n1.  **Triggering the Workflow**: The entire process kicks off automatically whenever new code is pushed or merged into the `main` branch. This ensures that every significant update to the main development line initiates a deployment cycle.\n\n2.  **Artifact Preparation**: Once triggered, the system first logs into Amazon ECR (Elastic Container Registry). It then proceeds to build a Docker image based on the project's codebase. This newly built image, which encapsulates the application and its dependencies, is subsequently pushed to the ECR repository. It's specifically tagged as `public.ecr.aws/j4r9x7x2/rishabhpancholi/real-fake-job-detection-api:latest`, making it readily available for deployment.\n\n3.  **Deployment to EC2**: With the Docker image prepared and stored i