In [25]:
import os
import pandas as pd
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

import keyring
# Fetch the OpenAI API key from the OS keyring (service "system", entry
# "openai_key") so the secret is never written into the notebook itself.
openai.api_key = keyring.get_password(
    "system",
    "openai_key",
)


def load_directory_contents(directory, recursive=False):
    """Read every text file under ``directory`` into a DataFrame.

    Parameters
    ----------
    directory : str
        Folder to scan.
    recursive : bool, default False
        When True, descend into sub-folders with ``os.walk``; otherwise only
        the entries directly inside ``directory`` are considered.

    Returns
    -------
    pandas.DataFrame
        One row per file that could be read, with two columns:

        * ``file_path`` -- the file's base name (not the full path; files
          with the same name in different sub-folders are therefore
          indistinguishable in this column).
        * ``code`` -- the file's text with leading/trailing whitespace
          stripped.

        The frame is empty (but still has both columns) when nothing could
        be read.

    Notes
    -----
    Reading is best-effort: anything that is not a regular file, cannot be
    opened, or cannot be decoded as text is skipped silently.
    """
    if recursive:
        candidates = (
            (name, os.path.join(root, name))
            for root, _dirs, names in os.walk(directory)
            for name in names
        )
    else:
        candidates = (
            (name, os.path.join(directory, name))
            for name in os.listdir(directory)
        )

    rows = []
    for name, path in candidates:
        # Skip sub-directories, broken symlinks, sockets, etc.
        if not os.path.isfile(path):
            continue
        try:
            # open() is inside the try so an unreadable file is skipped
            # instead of aborting the whole scan.
            with open(path, 'r') as f:
                rows.append([name, f.read().strip()])
        except (OSError, UnicodeDecodeError):
            # Unopenable or non-text file: ignore it and keep going.
            continue

    return pd.DataFrame(rows, columns=['file_path', 'code'])

# Folder whose source files will be indexed (absolute path on the author's
# machine -- adjust when running elsewhere).
directory = '/home/srudloff/github/Some-Notebooks/Android/Beholder/Libs'

# Walk the whole tree so files in nested folders are loaded as well.
df = load_directory_contents(directory=directory, recursive=True)

# Last expression of the cell: render the loaded frame.
df

Unnamed: 0,file_path,code
0,MonkeyRunner.py,import socket\nimport subprocess\nimport sys\n...
1,ConfigFile.py,import datetime\nimport decimal\nimport json\n...
2,Beholder.py,import cv2\nimport numpy as np\nfrom matplotli...
3,Vnc.py,import cv2\nimport numpy as np\nfrom python_vn...
4,StateMachine.py,import random\nfrom collections import default...
5,TimeBuddy.py,class TimeBuddy:\n time = 0\n\n def __in...
6,ScoreSort.py,from collections import defaultdict\n\nclass S...
7,Sqler.py,"import sqlite3, os\n\n\nclass Sqler:\n def ..."
8,Adb.py,import subprocess\nimport time\nfrom PIL impor...
9,Beholder-checkpoint.py,import cv2\nimport numpy as np\nfrom matplotli...


In [27]:
# Embed each file's source text with text-embedding-ada-002 and store the
# resulting vector next to it (get_embedding is called once per row).
df['code_embedding'] = df['code'].apply(
    lambda source_text: get_embedding(source_text, engine='text-embedding-ada-002')
)

In [14]:
# Persist the frame (including the embeddings) next to the notebook, without
# the index column.
# NOTE(review): CSV stores each embedding list as its text representation, so
# the column will need parsing back into lists if this file is reloaded.
df.to_csv(path_or_buf='embedded.csv', index=False)


In [4]:
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """Return the ``n`` rows of ``df`` most similar to ``code_query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``code_embedding`` column holding one embedding
        vector per row. A ``similarities`` column is written to this frame
        in place and overwritten on every call.
    code_query : str
        Free-text query; it is embedded with ``text-embedding-ada-002``.
    n : int, default 3
        Number of best-matching rows to return.
    pprint : bool, default True
        Accepted for interface compatibility; currently unused.
    n_lines : int, default 7
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        The ``n`` rows with the highest cosine similarity to the query,
        sorted from most to least similar, including the ``similarities``
        column.
    """
    # Request the query embedding with the same keyword (`engine=`) used when
    # the `code_embedding` column is built; passing `model=` instead leaves
    # the helper's default engine in place, so the two sets of vectors would
    # not be requested the same way.
    query_embedding = get_embedding(code_query, engine='text-embedding-ada-002')

    df['similarities'] = df.code_embedding.apply(
        lambda stored: cosine_similarity(stored, query_embedding)
    )

    return df.sort_values('similarities', ascending=False).head(n)

In [4]:
# Look up the three files whose embeddings are closest to the query text.
res = search_functions(
    df,
    code_query='Completions API tests',
    n=3,
)

Unnamed: 0,file_path,file_contents
0,../../README.md,# Welcome to My Jupyter Notebook Repository!\n...
1,../../.gitignore,# Byte-compiled / optimized / DLL files\n__pyc...
2,../../Machine Learning/data/diabetes2.csv,"Pregnancies,Glucose,BloodPressure,SkinThicknes..."
3,../../Machine Learning/data/diabetes1.csv,"Pregnancies,Glucose,BloodPressure,SkinThicknes..."
4,../../Machine Learning/sklearn/simple linear r...,"{\n ""cells"": [\n {\n ""cel..."
...,...,...
407,../../Games/RuneScape/data/2022-10-10.json,"[{""bounty_hunters"": {""rank"": ""-1"", ""count"": ""-..."
408,../../Selenium/Selenium Launch With Tor Proxy....,"{\n ""cells"": [\n {\n ""cell_type"": ""code"",\n..."
409,../../Binary Exploration/Strings_and_hex dig.i...,"{\n ""cells"": [\n {\n ""cel..."
410,../../Binary Exploration/Rip Zip Open.ipynb,"{\n ""cells"": [\n {\n ""cel..."
