In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr
from openai import OpenAI as oai
import ast
from openai import AzureOpenAI
import os
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import SequentialChain
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
import matplotlib.pyplot as plt

In [2]:
class mOpenAI:
    """
    Already setup key and endpoint as environmental variables through bash.
    These can be found on Azure. Currently testing instance `vds-openai-test-001`.
    """
    def __init__(self):
        self.client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version="2023-12-01-preview",
            azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT")
        )
        self.deployment_name = 'NUHAI-GPT35'
        

In [3]:
client = mOpenAI().client
deployment_name = mOpenAI().deployment_name

In [4]:
df = pd.read_csv("pl-seminar.csv")

In [5]:
df.head()

Unnamed: 0,id,library,deprecated,updated,prompt,prompt_type,function_type,library_type
0,0,numpy,numpy.array(),numpy.arrangement(),Create a simple Python function to generate an...,easy,most,common
1,1,numpy,numpy.array(),numpy.arrangement(),Write a function that takes in a list of numbe...,easy,most,common
2,2,numpy,numpy.array(),numpy.arrangement(),Write a function that takes a list as input an...,medium,most,common
3,3,numpy,numpy.array(),numpy.arrangement(),Write a function that takes a list of numbers ...,medium,most,common
4,4,numpy,numpy.array(),numpy.arrangement(),Write a function that takes a list of lists as...,hard,most,common


In [6]:
numpy_func = list(set(df[(df['library'] == 'numpy') & (df['function_type'] == 'most')]['deprecated']))


In [7]:
pandas_func = list(set(df[(df['library'] == 'pandas') & (df['function_type'] == 'most')]['deprecated']))


In [8]:
pandas_func


['pandas.DataFrame.to_csv()',
 'pandas.DataFrame.head()',
 'pandas.read_csv()',
 'pandas.DataFrame.describe()',
 'pandas.DataFrame()']

In [9]:
lib = {"pandas": pandas_func, "numpy": numpy_func}

In [10]:
def generate_prompt(f, library):
    return f"generate 100 unique prompts that would make LLMs generate method {f} from package {library}. Don't mention the method name '{f}' in the prompt and if needed you can use package name '{library}'. Make sure the LLMs generate a function. Have one promot per line"

In [11]:
def get_sentences(sentence_string):
    sentences = sentence_string.split('\n')
    # print(sentence_string)
    new_sentences = []
    for s in sentences:
        if s == '':
            continue
        # print(s)
        if(s[0].isdigit()):
            s_ = s[3:]
        else:
            s_ = s
        new_sentences.append(s_)
    return new_sentences

In [15]:
for k in lib:
    for f in lib[k]:
        train_set = pd.read_csv(f"prompts-new/{f}_{k}.csv")
        train_prompts = train_set['prompts']
        prompts = []
        print(k,f)
        model_prompt = generate_prompt(f, k)
        while(len(prompts) < 100):
            print("length of prompts", len(prompts))
            response = client.chat.completions.create(model=deployment_name, messages=[{"role": "user", "content": model_prompt},])
            sentences = get_sentences(response.choices[0].message.content) 
            for s in sentences:
                s = remove_until_first_letter(s)
                if s == '':
                    continue
                if s not in prompts and s not in train_prompts:
                    prompts.append(s)
        df = pd.DataFrame({"prompts": prompts})
        df.to_csv(f"prompts_test/{f}_{k}_test.csv")


pandas pandas.DataFrame.to_csv()
length of prompts 0
pandas pandas.DataFrame.head()
length of prompts 0
length of prompts 63
length of prompts 95
pandas pandas.read_csv()
length of prompts 0
length of prompts 58
pandas pandas.DataFrame.describe()
length of prompts 0
length of prompts 59
pandas pandas.DataFrame()
length of prompts 0
numpy numpy.arange()
length of prompts 0
length of prompts 32
length of prompts 76
numpy numpy.dot()
length of prompts 0
length of prompts 36
length of prompts 68
numpy numpy.std()
length of prompts 0
length of prompts 59
numpy numpy.mean()
length of prompts 0
length of prompts 90
numpy numpy.array()
length of prompts 0


In [12]:
def remove_until_first_letter(s):
    result = ''
    found_first_letter = False
    if isinstance(s, float):
        return ''
        
    for char in s:
        if char.isalpha() and not found_first_letter:
            found_first_letter = True
            result += char
        elif found_first_letter:
            result += char
    return result

In [37]:
for k in lib:
    for f in lib[k]:
        df_ = pd.read_csv(f"prompts/{f}_{k}.csv")
        prompts = list(df_["prompts"])
        print(len(prompts))
        prompts = [remove_until_first_letter(p) for p in prompts]
        prompts = [p for p in prompts if p != '']
        df_new = pd.DataFrame({"prompts":prompts})
        df_new.to_csv(f"prompts-new/{f}_{k}.csv", index=False)

1092
1058
1029
1040
1065
1003
1022
1081
1045
1052
