In [2]:
## Import Library
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
import os
import openai
from dotenv import load_dotenv, find_dotenv
from langchain_experimental.agents.agent_toolkits import create_csv_agent
import glob
import json
import ast
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.llms.openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
import spacy


In [3]:
## load the API key
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key  = os.getenv('OPENAI_API_KEY')

## Pass the dataset to agent to get a summary about the dataset
def info_about_dataset(db_id):
    csv_files = glob.glob(f"{db_id}/*.csv")
    csv_file_path = csv_files[0]
    df = pd.read_csv(csv_file_path)
    df_columns = df.columns
    df_columns = df_columns.tolist()

    prompt = """Given a csv file. I want to understand the dataset and provide a brief and consise summary of the data in atleast 100 words"""
    agent = create_csv_agent(
        OpenAI(temperature=0.5, max_tokens=1500), csv_file_path, verbose=False)

    r_summary = agent.run(prompt)
    print("Dataset summary given by agent---------->", r_summary)
    return r_summary,df_columns

## Generate question with Chatgpt API
def question_generator(r_summary, df_columns):
    model ="gpt-3.5-turbo-0613"

    prstr1 = "Create maximum number of UNIQUE suggestion questions that you can generate from the information provided by " + r_summary + "having columns " + str(df_columns) + "make questions short, concise and compact with 3-5 words"
    tmp = """
    Please meet the following conditions.
    ・questions should include all aspects of the dataset
    ・does not include any interrogative words and only main keywords of question
    ・strictly do not include serial numbers in questions
    ・each question is comma seperated
     * Make sure that each question should be related to data and should generate data based meaningful responses
    """
    prstr1 += tmp
    response = openai.ChatCompletion.create(
        model = model,
        temperature= 0.5,
        messages=[
            {"role": "user", "content": prstr1},
        ],
    )
    res1 = response.choices[0]["message"]["content"].strip()
    # res1
    output = res1.split("\n")[1:]
    pattern = r'^\d+\.\s'
    cleaned_list = [re.sub(pattern, '', item) for item in output]
    return cleaned_list

## Use cosine similarity to  reommend similar question
def Recommendation_question(question_list):
    nlp = spacy.load("en_core_web_sm")
    questions = question_list
    processed_questions = [nlp(question) for question in questions]

    similar_questions = {}

    for i in range(len(questions)):
        question = questions[i]
        vector1 = processed_questions[i].vector

        similar_questions[i] = {
            'main_suggestion': question,
            'sub_suggestions': []
        }

        for j in range(len(questions)):
            if i != j:
                vector2 = processed_questions[j].vector
                similarity = cosine_similarity([vector1], [vector2])[0][0]

                if similarity > 0.6:
                    similar_questions[i]['sub_suggestions'].append(questions[j])

        # Ensure there are at least 3 sub-suggestions
        while len(similar_questions[i]['sub_suggestions']) < 3:
            dissimilar_questions = [
                q for q in questions if q != question and q not in similar_questions[i]['sub_suggestions']
            ]
            if dissimilar_questions:
                similar_questions[i]['sub_suggestions'].append(dissimilar_questions.pop())
            else:
                break

    return similar_questions

## Final suggestion and recmmendation function
## Work on CSV and DB file
## For now recommendation works on only CSV dataset
def generate_suggestions(folder_path):
    all_files = glob.glob(os.path.join(folder_path, '*'))

    if not all_files:
        raise FileNotFoundError(f"No files found in directory: {folder_path}")

    # Initialize variables to store results
    results = {}

    # Loop through each file in the folder
    for file_path in all_files:
        # Check if the file is a CSV file
        if file_path.lower().endswith('.csv'):
            try:
                r_summary,df_columns = info_about_dataset(folder_path)
                output = question_generator(r_summary, df_columns)
                similar_question_list = Recommendation_question(output)
                results = {"question_suggestion": similar_question_list}
            except Exception as e:
                print(f"Error processing CSV file {file_path}: {str(e)}")
                

        # Check if the file is a DB file
        elif file_path.lower().endswith('.db'):
            try:
                db = SQLDatabase.from_uri(f"sqlite:///{file_path}")
                toolkit = SQLDatabaseToolkit(db=db, llm=OpenAI(temperature=0.5))
                agent_executor = create_sql_agent(
                    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
                    toolkit=toolkit,
                    verbose=False,
                    agent_type=AgentType.OPENAI_FUNCTIONS
                )
                response = agent_executor.run("You are provided with database schema,read the complete database and suggest top 10-15 unique questions to ask from this schema. Return the output")  # Replace with your SQL query
                questions = response.split('\n')[2:]
                questions = [q.strip(' 1234567890. ') for q in questions if q.strip()]
                similar_question_list = Recommendation_question(questions[:-1])
                results = {"question_suggestion": similar_question_list}
            except Exception as e:
                print(f"Error processing DB file {file_path}: {str(e)}")


    return results

In [4]:
folder_path = r"C:\Users\DELL LATITUDE E5470\Desktop\Suggesations"
generate_suggestions(folder_path)

Error processing CSV file C:\Users\DELL LATITUDE E5470\Desktop\Suggesations\Housing.csv: The model `text-davinci-003` has been deprecated, learn more here: https://platform.openai.com/docs/deprecations


{}