# Importing libraries

In [75]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [76]:
# Show every column when a DataFrame is displayed (None means no column limit).
pd.options.display.max_columns = None

# Loading data

In [77]:
# Read the raw dataset from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='Masterfile.csv')

# First look at the data

In [78]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,FILENAME,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,SORT,SORT COUNT,FILTER,FILTER COUNT,HS JOIN,HS JOIN COUNT,TEMP,TEMP COUNT,GRPBY,GRPBY COUNT,UNIQUE,UNIQUE COUNT,UNION,UNION COUNT,NHJOIN,NHJOIN COUNT,Queries,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",130320.0,62824.0
1,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62824.0,62824.0
2,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.005288,1,13134630.0,9,,,,,71.829424,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62816.0,62812.0
3,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62812.0,62813.0
4,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,1.6e-05,1,13050370.0,9,,,,,96.419862,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",42875.0,42829.0


In [79]:
# Summarise column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2335 non-null   int64  
 1   FILENAME          2335 non-null   object 
 2   TQ                2335 non-null   float64
 3   TQ COUNT          2335 non-null   int64  
 4   TBSCAN            2335 non-null   float64
 5   TBSCAN COUNT      2335 non-null   int64  
 6   SORT              0 non-null      float64
 7   SORT COUNT        0 non-null      float64
 8   FILTER            0 non-null      float64
 9   FILTER COUNT      0 non-null      float64
 10  HS JOIN           2335 non-null   float64
 11  HS JOIN COUNT     2335 non-null   int64  
 12  TEMP              0 non-null      float64
 13  TEMP COUNT        0 non-null      float64
 14  GRPBY             0 non-null      float64
 15  GRPBY COUNT       0 non-null      float64
 16  UNIQUE            0 non-null      float64


## Dropping non-feature columns

In [80]:
# List all column names before choosing which ones to keep.
df.columns

Index(['Unnamed: 0', 'FILENAME', 'TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT',
       'SORT', 'SORT COUNT', 'FILTER', 'FILTER COUNT', 'HS JOIN',
       'HS JOIN COUNT', 'TEMP', 'TEMP COUNT', 'GRPBY', 'GRPBY COUNT', 'UNIQUE',
       'UNIQUE COUNT', 'UNION', 'UNION COUNT', 'NHJOIN', 'NHJOIN COUNT',
       'Queries', 'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'],
      dtype='object')

In [81]:
# Keep only the SQL text and the two memory columns.
# .copy() makes the selection an independent frame, so the later in-place
# dropna/rename calls and column assignments act on `df` itself rather than
# on a selection of the original frame (which can raise SettingWithCopyWarning).
cols_keep = ['Queries', 'ESTIMATED_MEMORY', 'ACTUAL_MEMORY']
df = df[cols_keep].copy()

In [82]:
# Confirm that only the retained columns remain and check their non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Queries           2335 non-null   object 
 1   ESTIMATED_MEMORY  2332 non-null   float64
 2   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(2), object(1)
memory usage: 54.9+ KB


# Checking Missing Values

In [83]:
# Count missing values per column.
df.isna().sum()

Queries             0
ESTIMATED_MEMORY    3
ACTUAL_MEMORY       3
dtype: int64

## Dropping columns with all missing values

In [84]:
# Drop any column that is entirely null. The result is reassigned (instead of
# using inplace=True) so a frame produced by column selection is not mutated.
# NOTE(review): in the recorded run the df.info() output is the same before and
# after this cell, i.e. none of the retained columns is entirely null.
df = df.dropna(axis=1, how='all')

In [85]:
# Re-check the frame after dropping all-null columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Queries           2335 non-null   object 
 1   ESTIMATED_MEMORY  2332 non-null   float64
 2   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(2), object(1)
memory usage: 54.9+ KB


In [86]:
# Count the missing values that remain in each column.
df.isna().sum()

Queries             0
ESTIMATED_MEMORY    3
ACTUAL_MEMORY       3
dtype: int64

## Dropping rows with missing values

In [87]:
# Remove every row that has at least one missing value. The result is
# reassigned (instead of using inplace=True) so a frame produced by column
# selection is not mutated. The index is not reset, so the surviving rows keep
# their original labels.
df = df.dropna(axis=0)

In [88]:
# Verify that no null values remain after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2332 entries, 0 to 2334
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Queries           2332 non-null   object 
 1   ESTIMATED_MEMORY  2332 non-null   float64
 2   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(2), object(1)
memory usage: 72.9+ KB


In [89]:
# Show the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Queries,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,"SELECT cn.name AS company_name, lt.link AS li...",130320.0,62824.0
1,"SELECT cn.name AS company_name, lt.link AS li...",62824.0,62824.0
2,"SELECT cn.name AS company_name, lt.link AS li...",62816.0,62812.0
3,"SELECT cn.name AS company_name, lt.link AS li...",62812.0,62813.0
4,"SELECT cn.name AS company_name, lt.link AS li...",42875.0,42829.0


In [90]:
# (rows, columns) of the frame after removing rows with missing values.
df.shape

(2332, 3)

In [91]:
df.columns
# Note: in this dataset the memory columns are named 'ESTIMATED_MEMORY' and
# 'ACTUAL_MEMORY' rather than 'ESTIMATED_SORT_SHRHEAP_TOP' and 'SORT_SHRHEAP_TOP'.

Index(['Queries', 'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'], dtype='object')

# Converting the estimated and actual memory values to MB

In [92]:
# Scale both memory columns by 4000 / 1000000 in one multi-column assignment.
# NOTE(review): this presumably treats each raw unit as 4000 bytes and 1 MB as
# 10**6 bytes — confirm the unit of the raw values (a 4 KiB page would be 4096 bytes).
# Re-running this cell rescales the already converted values again.
memory_cols = ['ESTIMATED_MEMORY', 'ACTUAL_MEMORY']
df[memory_cols] = df[memory_cols] * 4000 / 1000000

In [93]:
# Inspect the first five rows after the unit conversion.
df.head(n=5)

Unnamed: 0,Queries,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,"SELECT cn.name AS company_name, lt.link AS li...",521.28,251.296
1,"SELECT cn.name AS company_name, lt.link AS li...",251.296,251.296
2,"SELECT cn.name AS company_name, lt.link AS li...",251.264,251.248
3,"SELECT cn.name AS company_name, lt.link AS li...",251.248,251.252
4,"SELECT cn.name AS company_name, lt.link AS li...",171.5,171.316


In [94]:
# Column names before they are renamed in the next cell.
df.columns

Index(['Queries', 'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'], dtype='object')

In [95]:
# Shorten the column names: 'Queries' -> 'sql', 'ESTIMATED_MEMORY' -> 'db2',
# 'ACTUAL_MEMORY' -> 'actual'.
df = df.rename(
    columns={
        'Queries': 'sql',
        'ESTIMATED_MEMORY': 'db2',
        'ACTUAL_MEMORY': 'actual',
    }
)

In [96]:
# Persist the cleaned frame as CSV, leaving out the index column.
df.to_csv(path_or_buf='job2-clean.csv', index=False)

In [99]:
# First five rows with the renamed columns.
df.head(n=5)

Unnamed: 0,sql,db2,actual
0,"SELECT cn.name AS company_name, lt.link AS li...",521.28,251.296
1,"SELECT cn.name AS company_name, lt.link AS li...",251.296,251.296
2,"SELECT cn.name AS company_name, lt.link AS li...",251.264,251.248
3,"SELECT cn.name AS company_name, lt.link AS li...",251.248,251.252
4,"SELECT cn.name AS company_name, lt.link AS li...",171.5,171.316


# Generating embeddings of the SQL text

In [103]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model; its files are cached
# under ./custom_cache instead of the default cache location.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
    cache_folder='./custom_cache',
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [104]:
# Show the first five rows that will be embedded.
df.head(n=5)

Unnamed: 0,sql,db2,actual
0,"SELECT cn.name AS company_name, lt.link AS li...",521.28,251.296
1,"SELECT cn.name AS company_name, lt.link AS li...",251.296,251.296
2,"SELECT cn.name AS company_name, lt.link AS li...",251.264,251.248
3,"SELECT cn.name AS company_name, lt.link AS li...",251.248,251.252
4,"SELECT cn.name AS company_name, lt.link AS li...",171.5,171.316


In [105]:
batch_size = 64

# Encode every SQL string. `encode` splits its input into batches of
# `batch_size` itself, so no manual slicing loop is needed; with its default
# settings it returns a 2-D NumPy array with one row per input string.
# list(...) turns that array into one 1-D vector per row.
embeddings = list(model.encode(df['sql'].tolist(), batch_size=batch_size))

# Add embeddings to the DataFrame. A plain list is assigned positionally, so
# the gaps left in the index labels by the earlier dropna do not matter.
df['sql_embedding'] = embeddings

In [106]:
# Confirm that the 'sql_embedding' column was added.
df.columns

Index(['sql', 'db2', 'actual', 'sql_embedding'], dtype='object')

In [107]:
# Check dtypes and non-null counts now that the embedding column is present.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2332 entries, 0 to 2334
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sql            2332 non-null   object 
 1   db2            2332 non-null   float64
 2   actual         2332 non-null   float64
 3   sql_embedding  2332 non-null   object 
dtypes: float64(2), object(2)
memory usage: 91.1+ KB


In [109]:
# Access the embedding of the first row by position. A label-based lookup
# (df.loc[0, ...]) would raise KeyError if the row labelled 0 had been removed
# when rows with missing values were dropped.
embedding = df['sql_embedding'].iloc[0]
print(embedding)  # Prints the numpy array


[ 7.47370794e-02 -4.23083119e-02  3.44582312e-02  5.27856313e-02
 -9.65094008e-03  3.59796695e-02  9.71529335e-02 -7.36669172e-03
 -8.44681934e-02  1.72663573e-02  1.09555021e-01 -2.89458525e-03
  1.07089967e-01 -7.46156424e-02 -6.08413592e-02  1.16378158e-01
  6.57423735e-02  8.20209365e-03 -3.63384187e-02 -5.32566011e-02
 -2.76523978e-02  2.08804943e-02  2.79304367e-02 -1.57242250e-02
  4.49437946e-02  2.40794662e-02  1.56186735e-02  4.21851091e-02
 -3.75329680e-03 -5.57355024e-02 -1.22741982e-01  7.12954625e-02
  2.31467746e-03  3.85144837e-02  1.15833171e-01 -7.07320198e-02
 -7.96265230e-02 -2.61578280e-02  4.41403985e-02  1.92825701e-02
 -5.20544418e-04 -4.89756130e-02  7.76908398e-02  1.95369404e-02
  1.50788231e-02  2.82378681e-02 -6.14246633e-03  4.40542549e-02
  2.36075856e-02  2.87716202e-02 -1.28592581e-01 -1.54983737e-02
 -2.45579309e-03 -1.90576084e-03  3.20792682e-02 -6.43602312e-02
 -7.50623494e-02  4.54589091e-02 -1.07965484e-01 -7.68895969e-02
 -1.77532211e-02 -5.04951

In [110]:
# Access the embedding of the first row by position (robust even if the row
# labelled 0 was removed during cleaning) and report its dimensionality.
embedding = df['sql_embedding'].iloc[0]
print(len(embedding))  # Prints the number of elements in the embedding vector


384


# Partitioning the dataset into training and test sets

In [132]:
# Columns available for building the feature matrix and the target.
print(df.columns)

Index(['sql', 'db2', 'actual', 'sql_embedding'], dtype='object')


In [133]:
# Feature columns (SQL embedding plus the 'db2' value) and the target column.
# Both are lists, so indexing a DataFrame with either one yields a DataFrame.
X_cols, target = ['sql_embedding', 'db2'], ['actual']

In [134]:
# (rows, columns) of the frame that is about to be split.
df.shape

(2332, 4)

In [135]:
# Separate features from the target, then hold out 20% of the rows for testing.
# random_state is fixed so the split is reproducible.
X = df[X_cols]
y = df[target]  # `target` is a list, so y is a one-column DataFrame

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [136]:
# Display the training features; the row labels are the original index values.
X_train

Unnamed: 0,sql_embedding,db2
812,"[0.059358865, -0.020964436, 0.018205285, 0.063...",526.412
233,"[0.09310951, -0.051648285, 0.04889493, 0.05467...",251.248
1383,"[0.058846015, -0.027138071, 0.016208364, 0.047...",532.976
1755,"[0.060842, -0.03190487, 0.039343238, 0.0623296...",251.248
1337,"[0.053605214, -0.022713669, 0.017548123, 0.065...",526.412
...,...,...
1034,"[0.050058026, -0.018140694, 0.019325035, 0.065...",526.412
1734,"[0.060488556, -0.032311015, 0.041900106, 0.051...",171.500
763,"[0.050186478, -0.020764904, 0.021335857, 0.067...",524.208
835,"[0.08297298, -0.04441836, 0.028338268, 0.05766...",251.252


# Converting y into NumPy arrays

In [137]:
# Replace the target DataFrames with their underlying NumPy arrays.
# Not re-runnable as-is: after the first run these names hold ndarrays,
# which have no `.values` attribute.
y_train, y_test = y_train.values, y_test.values

In [138]:
# Build a new frame from the training features, then attach the training
# target values as an 'actual' column.
X_train_clean = pd.DataFrame(data=X_train, columns=X_train.columns)
X_train_clean['actual'] = y_train

In [139]:
# First five rows of the combined training frame (five is the default for head()).
X_train_clean.head()

Unnamed: 0,sql_embedding,db2,actual
812,"[0.059358865, -0.020964436, 0.018205285, 0.063...",526.412,39.804
233,"[0.09310951, -0.051648285, 0.04889493, 0.05467...",251.248,251.248
1383,"[0.058846015, -0.027138071, 0.016208364, 0.047...",532.976,62.98
1755,"[0.060842, -0.03190487, 0.039343238, 0.0623296...",251.248,251.248
1337,"[0.053605214, -0.022713669, 0.017548123, 0.065...",526.412,56.992


In [144]:
# Write the training split as JSON Lines (one record per line).
# NOTE(review): JSON replaces an earlier CSV export ('job2_train_clean.csv'),
# presumably so the embedding arrays are stored as lists rather than as
# strings — confirm.
X_train_clean.to_json(path_or_buf='job2_train_clean.json', lines=True, orient='records')

In [141]:
# Build a new frame from the test features, then attach the test target
# values as an 'actual' column.
X_test_clean = pd.DataFrame(data=X_test, columns=X_test.columns)
X_test_clean['actual'] = y_test

In [142]:
# First five rows of the combined test frame (five is the default for head()).
X_test_clean.head()

Unnamed: 0,sql_embedding,db2,actual
1057,"[0.055807058, -0.029653227, 0.02713677, 0.0566...",251.248,251.248
2326,"[0.07027462, -0.027501259, 0.051419735, 0.0595...",251.248,251.252
655,"[0.06710697, -0.027536655, 0.018405523, 0.0605...",526.412,57.124
2104,"[0.074952595, -0.034597382, 0.051836893, 0.063...",251.248,251.248
1404,"[0.06102468, -0.019188361, 0.02127042, 0.06361...",526.412,39.804


In [145]:
# Write the test split as JSON Lines (one record per line).
# NOTE(review): JSON replaces an earlier CSV export ('job2_test_clean.csv'),
# presumably so the embedding arrays are stored as lists rather than as
# strings — confirm.
X_test_clean.to_json(path_or_buf='job2_test_clean.json', lines=True, orient='records')