In [1]:
import json
import pandas as pd
import openai
import os
from loguru import logger
import psycopg2
from psycopg2.extras import execute_values
import numpy as np
from pgvector.psycopg2 import register_vector
from dotenv import load_dotenv

# Load environment variables via python-dotenv (by default it reads a .env file, if one is found)
# so that the os.getenv lookups in DB_CONFIG below can see them.
load_dotenv()

# PostgreSQL connection parameters. Each value is read from the environment and
# falls back to the literal default given as the second argument to os.getenv.
DB_CONFIG = dict(
    dbname=os.getenv("DB_NAME", "postgres"),
    user=os.getenv("DB_USER", "postgres"),
    password=os.getenv("DB_PASSWORD", "mypass"),
    host=os.getenv("DB_HOST", "localhost"),
    port=os.getenv("DB_PORT", "5432"),
)

# Name of the OpenAI embedding model requested by get_embeddings
OPENAI_MODEL = "text-embedding-ada-002"

def read_json_file(file_path):
    """Load a UTF-8 JSON file and return the decoded object.

    Returns None (after logging an error) when the file is missing, is not
    valid JSON, or any other exception occurs while reading it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        logger.error(f"The file {file_path} was not found.")
    except json.JSONDecodeError:
        logger.error(f"The file {file_path} contains invalid JSON.")
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
    else:
        # Only reached when opening, parsing and closing all succeeded
        return parsed
    return None

def get_embeddings(text):
    """Return the embedding vector OpenAI produces for ``text``.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The model is taken from ``OPENAI_MODEL`` and the embedding of the first
    item in the response is returned.
    """
    response = openai.Embedding.create(
        model=OPENAI_MODEL,
        input=text.replace("\n", " "),
    )
    first_item = response['data'][0]
    return first_item['embedding']

def transform_data(df):
    """Return a copy of ``df`` with list-valued text columns joined into strings.

    For the ``table_of_content`` and ``practice_exercises`` columns, every cell
    that is a list is replaced by its items joined with single spaces; cells of
    any other type are left unchanged.

    The input frame is not modified: the work is done on a copy, so re-running
    the calling cell (or inspecting the raw frame afterwards) stays safe.

    :param df: DataFrame containing both columns.
    :return: A new, transformed DataFrame.
    :raises KeyError: If either column is missing from ``df``.
    """
    transformed = df.copy()
    for column in ['table_of_content', 'practice_exercises']:
        transformed[column] = transformed[column].apply(
            lambda value: " ".join(value) if isinstance(value, list) else value
        )
    return transformed

def create_embeddings_table(cursor):
    """Ensure the pgvector extension and the ``embeddings`` table exist.

    Both statements use ``IF NOT EXISTS``, so calling this repeatedly is safe.
    Nothing is committed here; the caller owns the transaction.

    :param cursor: An open database cursor exposing ``execute``.
    """
    # The ``vector`` column type used below is provided by the pgvector extension,
    # so it has to be enabled before the table can be created on a fresh database.
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
    # NOTE(review): 1536 presumably matches the output size of the embedding model in use.
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS embeddings (
        id bigserial primary key,
        local_path text,
        name text,
        table_of_content text,
        practice_exercises text,
        github_link text,
        embedding vector(1536)
    );
    """)

def batch_insert_embeddings(connection, df):
    """Embed every row of ``df`` and insert the rows into ``embeddings`` in one batch.

    The embedding of each row is computed from its ``table_of_content`` value
    via ``get_embeddings``. The transaction is committed before returning.
    """
    with connection.cursor() as cursor:
        # Register the pgvector adapter on this connection before inserting vectors
        register_vector(connection)

        records = []
        for _, record in df.iterrows():
            records.append((
                record['local_path'],
                record['name'],
                record['table_of_content'],
                record['practice_exercises'],
                record['github_link'],
                np.array(get_embeddings(record['table_of_content'])),
            ))

        insert_sql = """
        INSERT INTO embeddings (local_path, name, table_of_content, practice_exercises, github_link, embedding)
        VALUES %s
        """
        execute_values(cursor, insert_sql, records)
        connection.commit()

def etl(file_path):
    """Extract, Transform, Load process for embedding data.

    Reads the JSON file, flattens the list-valued columns, then creates the
    ``embeddings`` table if needed and inserts one embedded row per record.

    :param file_path: Path to the JSON file to ingest.
    :return: None. Returns early (without touching the database) when the
        file could not be read. Database errors are logged, not raised.
    """
    # Extract
    data = read_json_file(file_path)
    if data is None:
        return

    df = pd.DataFrame(data)
    logger.info(f"DataFrame shape: {df.shape}")

    # Transform
    df = transform_data(df)
    logger.info(f"Shape Transformed Data: {df.shape}")

    # Load
    connection = None
    try:
        connection = psycopg2.connect(**DB_CONFIG)
        # ``with connection`` only commits/rolls back the transaction; it does not
        # close the connection, which is why the ``finally`` below does so explicitly.
        with connection:
            with connection.cursor() as cursor:
                create_embeddings_table(cursor)
            batch_insert_embeddings(connection, df)
        # Logged after the transaction block so it is only reported once committed
        logger.info("Data loaded successfully into the database.")
    except psycopg2.Error as e:
        logger.error(f"Database error: {e}")
    finally:
        if connection is not None:
            connection.close()




In [2]:
# Run the ETL pipeline on the notebooks summary JSON file.
# NOTE(review): the path below is absolute and specific to one machine (/mnt/c/Users/user/...);
# adjust it before running this cell anywhere else.
etl("/mnt/c/Users/user/OneDrive/Desktop/github-projects/advanced-github-search/notebooks/notebooks_summary.json")

[32m2024-10-19 18:01:41.787[0m | [1mINFO    [0m | [36m__main__[0m:[36metl[0m:[36m87[0m - [1mDataFrame shape: (95, 5)[0m
[32m2024-10-19 18:01:41.790[0m | [1mINFO    [0m | [36m__main__[0m:[36metl[0m:[36m91[0m - [1mShape Transformed Data: (95, 5)[0m
[32m2024-10-19 18:02:31.077[0m | [1mINFO    [0m | [36m__main__[0m:[36metl[0m:[36m98[0m - [1mData loaded successfully into the database.[0m


## Imports

In [3]:
import json
import pandas as pd

import openai
import os
from loguru import logger


import psycopg2
from pgvector.psycopg2 import register_vector
import numpy as np
from psycopg2.extras import execute_values
import numpy as np

from dotenv import load_dotenv
# Load environment variables via python-dotenv; the call's return value (True) is what this cell displays.
load_dotenv()

True

## Read Data

In [3]:
def read_json_file(file_path):
    """
    Load a UTF-8 encoded JSON file and return its decoded content.

    :param file_path: Path to the JSON file.
    :return: The parsed JSON data (dictionary or list), or None when the file is
             missing, holds invalid JSON, or another error occurs (a message is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except json.JSONDecodeError:
        print(f"Error: The file {file_path} contains invalid JSON.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    else:
        # Only reached when opening, parsing and closing all succeeded
        return parsed
    return None

In [3]:
# Relative path of the JSON summary file (resolved against the kernel's working directory);
# presumably the input intended for etl / read_json_file — confirm against the calling cell.
file_path = "notebooks_summary.json"

In [4]:
# Helper function: get embeddings for a text
def get_embeddings(text):
    """Embed ``text`` with OpenAI's ``text-embedding-ada-002`` model.

    Newlines are replaced by spaces before the request; the embedding of the
    first item in the response is returned.
    """
    return openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text.replace("\n", " "),
    )['data'][0]['embedding']

In [None]:
def etl(file_fath):
    """Extract notebook summaries from JSON, embed them and load them into PostgreSQL.

    :param file_fath: Path to the JSON file to ingest (the parameter name is kept
        unchanged so existing callers keep working).
    :return: None. Returns early, without touching the database, when the file
        could not be read.
    """
    # Extract
    data = read_json_file(file_fath)
    if data is None:
        # read_json_file yields None when the file is missing or invalid; nothing to load.
        return
    df = pd.DataFrame(data)
    logger.info(f"{df.shape}")

    # Transform: both columns hold lists of strings that are flattened to one string per row
    df['table_of_content'] = df['table_of_content'].apply(lambda x: " ".join(x))
    df['practice_exercises'] = df['practice_exercises'].apply(lambda x: " ".join(x))

    # Load: connection settings come from the environment, with the former literals as defaults
    connection = psycopg2.connect(
        dbname=os.getenv("DB_NAME", "postgres"),
        user=os.getenv("DB_USER", "postgres"),
        password=os.getenv("DB_PASSWORD", "mypass"),
        host=os.getenv("DB_HOST", "localhost"),  # or the database server IP address
        port=os.getenv("DB_PORT", "5432"),       # default port for PostgreSQL
    )
    try:
        # Install pgvector; it must exist before register_vector and before the table is created
        with connection.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
        connection.commit()

        register_vector(connection)

        # Create the table to store embeddings and metadata (idempotent, so re-runs do not fail)
        with connection.cursor() as cur:
            cur.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                id bigserial primary key,
                local_path text,
                name text,
                table_of_content text,
                practice_exercises text,
                github_link text,
                embedding vector(1536)
            );
            """)
        connection.commit()

        # Prepare the tuples to insert. The embedding is computed here from the
        # table of contents because the DataFrame has no precomputed 'embeddings' column.
        data_list = [
            (
                row['local_path'],
                row['name'],
                row['table_of_content'],
                row['practice_exercises'],
                row['github_link'],
                np.array(get_embeddings(row['table_of_content'])),
            )
            for _, row in df.iterrows()
        ]

        # Use execute_values to perform batch insertion, then commit once everything is in
        with connection.cursor() as cur:
            execute_values(
                cur,
                "INSERT INTO embeddings (local_path, name, table_of_content, practice_exercises, github_link, embedding) VALUES %s",
                data_list,
            )
        connection.commit()
    finally:
        # Always release the connection; uncommitted work is discarded on failure
        connection.close()
        
    


## User Query Similarity Search

In [24]:
import numpy as np

In [6]:
# Helper function: Get top 3 most similar documents from the database
def get_top3_similar_docs(query_embedding, connection):
    """Return the three stored rows whose embedding is closest to ``query_embedding``.

    :param query_embedding: Embedding vector of the query (anything ``np.array`` accepts).
    :param connection: Open psycopg2 connection to the database holding the ``embeddings`` table.
    :return: List of up to three ``(table_of_content, name, practice_exercises, github_link)``
        tuples, nearest first.
    """
    embedding_array = np.array(query_embedding)
    # Register the pgvector adapter so the numpy array can be passed as a query parameter
    register_vector(connection)
    # The context manager closes the cursor even when the query raises
    with connection.cursor() as cur:
        # <=> is pgvector's cosine-distance operator: smaller distance means more similar
        cur.execute("SELECT table_of_content, name, practice_exercises, github_link FROM embeddings ORDER BY embedding <=> %s LIMIT 3", (embedding_array,))
        return cur.fetchall()

In [7]:
# Helper function: get a chat completion from the OpenAI API
# Note: the default model is "gpt-4o"; pass `model` to use a different one
def get_completion_from_messages(messages, model="gpt-4o", temperature=0, max_tokens=1000):
    """Send ``messages`` to the OpenAI chat completion endpoint and return the reply text.

    :param messages: Chat messages forwarded unchanged to the API.
    :param model: Name of the chat model to use.
    :param temperature: Sampling temperature forwarded to the API.
    :param max_tokens: Upper bound on generated tokens forwarded to the API.
    :return: The ``content`` of the first choice's message.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    response = openai.ChatCompletion.create(**request_options)
    first_choice = response.choices[0]
    return first_choice.message["content"]

In [8]:
# Helper function: get embeddings for a text
def get_embeddings(text):
    """Return the ``text-embedding-ada-002`` embedding of ``text``.

    Newlines are replaced by spaces before the request is sent; the embedding
    of the first item in the response is returned.
    """
    request = {
        "model": "text-embedding-ada-002",
        "input": text.replace("\n", " "),
    }
    result = openai.Embedding.create(**request)
    return result['data'][0]['embedding']

In [9]:
import os
import openai
import psycopg2

In [10]:
# PostgreSQL connection parameters: (psycopg2 keyword, environment variable, default value).
# Each entry is read from the environment, falling back to the listed default.
DB_CONFIG = {
    key: os.getenv(env_name, fallback)
    for key, env_name, fallback in (
        ('dbname', "DB_NAME", "postgres"),
        ('user', "DB_USER", "postgres"),
        ('password', "DB_PASSWORD", "mypass"),
        ('host', "DB_HOST", "localhost"),
        ('port', "DB_PORT", "5432"),
    )
}

# Open a module-level PostgreSQL connection that the similarity-search cell below passes around.
# NOTE(review): none of the cells visible here closes this connection.
connection = psycopg2.connect(**DB_CONFIG)

In [11]:
# Embed a sample query and fetch the three closest rows from the embeddings table
related_docs = get_top3_similar_docs(get_embeddings("Best Practices for Function Arguments in Python"), connection)

In [12]:
# Display the retrieved rows (each row is a tuple of table_of_content, name, practice_exercises, github_link)
related_docs

[('Positional Arguments Keyword Arguments Default Parameters Best Practices for Function Arguments in Python Positional Arguments Keyword Arguments Default Parameters Practice Exercise: A Day at the Zoo Solution',
  '03 Argument Passing',
  '## <a id=\'toc5_\'></a> ## <a id=\'toc5_\'></a> You are planning a visit to the local zoo with a group of students. The zoo has several sections for different kinds of animals, and each section has feeding times, special shows, and educational talks. To maximize the visit, you decide to write a Python program that helps organize the day\'s activities based on the group\'s preferences. *Tasks:** 1. Write a function named `schedule_visit` that takes three parameters: `section` (the section of the zoo to visit, e.g., "Reptiles", "Birds"), `time` (the time you plan to visit that section), and `activity` with a default value of "Feeding". The function should print a message summarizing the visit plan for that section. 2. Call the `schedule_visit` functi

In [31]:
# Presumably answers the query using retrieved documents as context.
# NOTE(review): process_input_with_retrieval is not defined in any cell visible here, so this
# cell fails on a fresh kernel unless its definition exists elsewhere — confirm and restore it.
result = process_input_with_retrieval("approach for Function Arguments in Python")

In [32]:
from pprint import pprint

In [33]:
# Print the result as plain text (the pprint import from the previous cell is not used here)
print(result)

In Python, function arguments can be categorized into several types, each serving a specific purpose. Here's a concise overview of the different types of function arguments and best practices for using them:

1. **Positional Arguments**: These are the most common type of arguments. They are passed to a function in the order in which they are defined. For example:
   ```python
   def greet(name, age):
       print(f"Hello, {name}! You are {age} years old.")
   greet("Alice", 30)
   ```

2. **Keyword Arguments**: These arguments are passed to a function by explicitly specifying the parameter name, allowing you to skip the order. For example:
   ```python
   greet(age=30, name="Alice")
   ```

3. **Default Parameters**: These parameters have default values specified in the function definition. If no argument is provided for a default parameter, the default value is used. For example:
   ```python
   def greet(name, age=25):
       print(f"Hello, {name}! You are {age} years old.")
   greet