In [2]:
import json
import pandas as pd
from master_nl2sql import *
from helper import extract_json_content

In [3]:
# Load the loan dataset from Loan.csv (path is relative to the working directory).
df = pd.read_csv('Loan.csv')

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Loan_ID,Loan_Amount,Credit_Score,Income,Employment_Type,Loan_Term,Past_Defaults,Loan_Status
0,1,200000,750,40000,Salaried,10,0,1
1,2,500000,680,60000,Self-employed,15,1,0
2,3,250000,720,45000,Salaried,10,0,1
3,4,300000,650,30000,Salaried,20,1,0
4,5,100000,600,25000,Self-employed,5,2,0


In [5]:
# Column names present in the loaded data.
df.columns

Index(['Loan_ID', 'Loan_Amount', 'Credit_Score', 'Income', 'Employment_Type',
       'Loan_Term', 'Past_Defaults', 'Loan_Status'],
      dtype='object')

In [6]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Loan_ID          200 non-null    int64 
 1   Loan_Amount      200 non-null    int64 
 2   Credit_Score     200 non-null    int64 
 3   Income           200 non-null    int64 
 4   Employment_Type  200 non-null    object
 5   Loan_Term        200 non-null    int64 
 6   Past_Defaults    200 non-null    int64 
 7   Loan_Status      200 non-null    int64 
dtypes: int64(7), object(1)
memory usage: 12.6+ KB


In [7]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Loan_ID,Loan_Amount,Credit_Score,Income,Loan_Term,Past_Defaults,Loan_Status
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,100.5,258900.0,691.85,53575.0,13.46,0.25,0.79
std,57.879185,83724.782448,29.992922,13068.98492,5.309364,0.518502,0.40833
min,1.0,100000.0,600.0,25000.0,5.0,0.0,0.0
25%,50.75,200000.0,677.5,45000.0,10.0,0.0,1.0
50%,100.5,250000.0,690.0,55000.0,15.0,0.0,1.0
75%,150.25,300000.0,710.0,60000.0,20.0,0.0,1.0
max,200.0,500000.0,780.0,90000.0,25.0,2.0,1.0


In [8]:
# Distinct Employment_Type values; the same two values appear in the schema's CHECK constraint.
df['Employment_Type'].unique()

array(['Salaried', 'Self-employed'], dtype=object)

In [9]:
# SQL dialect the generated queries should target.
dialect="MySQL" # (MySQL/PostgreSQL/SQLite)

# Schema of Tables for LLM context
# The DDL must be well-formed (comma after every column definition except the
# last, and a closing ");") because it is passed verbatim into the system prompt.
# NOTE(review): the table is named "Employee" although the rows describe loans;
# the name is kept because the prompts in later cells refer to it.
schema_context="""
CREATE TABLE Employee (
    Loan_ID INTEGER PRIMARY KEY,
    Loan_Amount INTEGER NOT NULL,
    Credit_Score INTEGER NOT NULL,
    Income INTEGER NOT NULL,
    Employment_Type VARCHAR(30) CHECK (Employment_Type IN ('Salaried', 'Self-employed')),
    Loan_Term INTEGER NOT NULL,
    Past_Defaults INTEGER NOT NULL,
    Loan_Status TINYINT(1) NOT NULL CHECK (Loan_Status IN (0, 1))
);
"""

In [10]:
# Build the system prompt from the chosen dialect and the table schema.
system_prompt = get_system_prompt(dialect, schema_context)

# Two-slot chat template: the system prompt, followed by an empty user slot
# that serves as the placeholder for the question of each request.
messages = [
    dict(role="system", content=system_prompt),  # System role (optional)
    dict(role="user", content=""),  # User prompt
]

def get_me_sql_query(user_query: str, print_results: bool = False) -> dict:
    """Ask the LLM to translate a natural-language question into SQL.

    The question is sent as the final (user) entry of the module-level
    ``messages`` template. The template itself is not modified, so repeated or
    out-of-order cell runs never observe a previous question.

    Args:
        user_query: Natural-language question to convert.
        print_results: When True, print the raw LLM reply and the parsed
            ``sql_query`` value.

    Returns:
        The object decoded from the JSON payload of the LLM reply. Callers in
        this notebook read its ``"sql_query"`` key.

    Raises:
        json.JSONDecodeError: If the extracted payload is not valid JSON.
        KeyError: If ``print_results`` is True and the payload has no
            ``"sql_query"`` key.
    """
    # Build a per-call request instead of editing the shared template in place.
    request = [*messages[:-1], {**messages[-1], "content": user_query}]
    raw_reply = get_me_llm_response(request)
    if print_results:
        print(f"LLM response : {raw_reply}")
    # extract_json_content presumably isolates the JSON part of the reply
    # (TODO confirm against helper.py). strict=False lets the decoder accept
    # control characters, such as raw newlines, inside JSON strings.
    response = json.loads(extract_json_content(raw_reply), strict=False)
    if print_results:
        print("="*20,"\n")
        print("User Query: ",user_query,"\n")
        print("SQL Query :\n",response["sql_query"])
        print("="*20,"\n")
    return response

In [11]:
# Smoke test: ask for a plain row count and show the generated SQL.
ans = get_me_sql_query("how many data is present in table?")

print(ans["sql_query"])

SELECT COUNT(*) AS Total_Rows FROM Employee


In [12]:
# Filtered count: the question names one Employment_Type value.
ans = get_me_sql_query("how many employees are salaried?")
print(ans["sql_query"])

SELECT COUNT(*) AS Total_Salaried_Employees FROM Employee WHERE Employment_Type = 'Salaried'


In [13]:
# Multiple aggregates over a single column.
ans = get_me_sql_query("Give me minimum, average and maximum credit score")
print(ans["sql_query"])

SELECT MIN(Credit_Score) AS Min_Credit_Score, AVG(Credit_Score) AS Avg_Credit_Score, MAX(Credit_Score) AS Max_Credit_Score FROM Employee


In [14]:
# Comparison against an aggregate: the answer needs a subquery or CTE.
ans = get_me_sql_query("Give me total number of employee who's credit score is more than average")
print(ans["sql_query"])

WITH AverageCreditScore AS (SELECT AVG(Credit_Score) AS AvgScore FROM Employee) SELECT COUNT(*) AS TotalEmployees FROM Employee, AverageCreditScore WHERE Employee.Credit_Score > AverageCreditScore.AvgScore
