# Importing libraries

In [36]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [37]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Loading data

In [38]:
# Read the master file; the relative path is resolved against the working directory.
masterfile_csv = 'Masterfile.csv'
df = pd.read_csv(masterfile_csv)

# First look at the data

In [39]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,FILENAME,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,SORT,SORT COUNT,FILTER,FILTER COUNT,HS JOIN,HS JOIN COUNT,TEMP,TEMP COUNT,GRPBY,GRPBY COUNT,UNIQUE,UNIQUE COUNT,UNION,UNION COUNT,NHJOIN,NHJOIN COUNT,Queries,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",130320.0,62824.0
1,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62824.0,62824.0
2,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.005288,1,13134630.0,9,,,,,71.829424,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62816.0,62812.0
3,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62812.0,62813.0
4,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,1.6e-05,1,13050370.0,9,,,,,96.419862,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",42875.0,42829.0


In [40]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2335 non-null   int64  
 1   FILENAME          2335 non-null   object 
 2   TQ                2335 non-null   float64
 3   TQ COUNT          2335 non-null   int64  
 4   TBSCAN            2335 non-null   float64
 5   TBSCAN COUNT      2335 non-null   int64  
 6   SORT              0 non-null      float64
 7   SORT COUNT        0 non-null      float64
 8   FILTER            0 non-null      float64
 9   FILTER COUNT      0 non-null      float64
 10  HS JOIN           2335 non-null   float64
 11  HS JOIN COUNT     2335 non-null   int64  
 12  TEMP              0 non-null      float64
 13  TEMP COUNT        0 non-null      float64
 14  GRPBY             0 non-null      float64
 15  GRPBY COUNT       0 non-null      float64
 16  UNIQUE            0 non-null      float64


## Dropping non-feature columns

In [41]:
# Columns that are not used as model inputs (e.g. the source file path and
# the raw query text).
cols_non_features = ['Unnamed: 0', 'FILENAME', 'Queries']
# Reassigning with errors='ignore' keeps this cell re-runnable: the original
# in-place drop raised KeyError on a second run because the columns were
# already gone. The result of the first run is unchanged.
df = df.drop(columns=cols_non_features, errors='ignore')

In [42]:
# Re-inspect the frame after the non-feature columns were removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2335 non-null   float64
 1   TQ COUNT          2335 non-null   int64  
 2   TBSCAN            2335 non-null   float64
 3   TBSCAN COUNT      2335 non-null   int64  
 4   SORT              0 non-null      float64
 5   SORT COUNT        0 non-null      float64
 6   FILTER            0 non-null      float64
 7   FILTER COUNT      0 non-null      float64
 8   HS JOIN           2335 non-null   float64
 9   HS JOIN COUNT     2335 non-null   int64  
 10  TEMP              0 non-null      float64
 11  TEMP COUNT        0 non-null      float64
 12  GRPBY             0 non-null      float64
 13  GRPBY COUNT       0 non-null      float64
 14  UNIQUE            0 non-null      float64
 15  UNIQUE COUNT      0 non-null      float64
 16  UNION             0 non-null      float64


# Checking Missing Values

In [43]:
# Number of missing entries in each column.
df.isna().sum()

TQ                     0
TQ COUNT               0
TBSCAN                 0
TBSCAN COUNT           0
SORT                2335
SORT COUNT          2335
FILTER              2335
FILTER COUNT        2335
HS JOIN                0
HS JOIN COUNT          0
TEMP                2335
TEMP COUNT          2335
GRPBY               2335
GRPBY COUNT         2335
UNIQUE              2335
UNIQUE COUNT        2335
UNION               2335
UNION COUNT         2335
NHJOIN              2335
NHJOIN COUNT        2335
ESTIMATED_MEMORY       3
ACTUAL_MEMORY          3
dtype: int64

## Dropping columns with all missing values

In [44]:
# Remove, in place, every column that contains no values at all.
df.dropna(axis='columns', how='all', inplace=True)

In [45]:
# Check which columns remain once the all-missing columns are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2335 non-null   float64
 1   TQ COUNT          2335 non-null   int64  
 2   TBSCAN            2335 non-null   float64
 3   TBSCAN COUNT      2335 non-null   int64  
 4   HS JOIN           2335 non-null   float64
 5   HS JOIN COUNT     2335 non-null   int64  
 6   ESTIMATED_MEMORY  2332 non-null   float64
 7   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 146.1 KB


## Dropping rows with missing values

In [46]:
# Remove, in place, every row that has at least one missing value.
# The surviving rows keep their original index labels.
df.dropna(axis='index', how='any', inplace=True)

In [47]:
# Confirm that no column has missing values any more.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2332 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2332 non-null   float64
 1   TQ COUNT          2332 non-null   int64  
 2   TBSCAN            2332 non-null   float64
 3   TBSCAN COUNT      2332 non-null   int64  
 4   HS JOIN           2332 non-null   float64
 5   HS JOIN COUNT     2332 non-null   int64  
 6   ESTIMATED_MEMORY  2332 non-null   float64
 7   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 164.0 KB


In [48]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0.0047,1,13134630.0,9,71.773664,8,130320.0,62824.0
1,0.0047,1,13134630.0,9,71.773664,8,62824.0,62824.0
2,0.005288,1,13134630.0,9,71.829424,8,62816.0,62812.0
3,0.0047,1,13134630.0,9,71.773664,8,62812.0,62813.0
4,1.6e-05,1,13050370.0,9,96.419862,8,42875.0,42829.0


In [49]:
# (rows, columns) left after cleaning.
df.shape

(2332, 8)

In [50]:
df.columns
# Note: instead of 'ESTIMATED_SORT_SHRHEAP_TOP' and 'SORT_SHRHEAP_TOP',
# this dataset has the columns 'ESTIMATED_MEMORY' and 'ACTUAL_MEMORY'.

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'],
      dtype='object')

# Converting the estimated and actual memory values to MB

In [51]:
# Apply the same scaling (value * 4000 / 1000000) to both memory columns.
# NOTE(review): presumably the raw values are counts of 4 KB pages being
# turned into megabytes; confirm, since an exact 4 KiB page is 4096 bytes.
# Re-running this cell rescales values that were already converted.
for memory_col in ('ESTIMATED_MEMORY', 'ACTUAL_MEMORY'):
    df[memory_col] = df[memory_col] * 4000 / 1000000

In [52]:
# Preview the first five rows after the memory columns were rescaled.
df.head(n=5)

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0.0047,1,13134630.0,9,71.773664,8,521.28,251.296
1,0.0047,1,13134630.0,9,71.773664,8,251.296,251.296
2,0.005288,1,13134630.0,9,71.829424,8,251.264,251.248
3,0.0047,1,13134630.0,9,71.773664,8,251.248,251.252
4,1.6e-05,1,13050370.0,9,96.419862,8,171.5,171.316


In [53]:
# List the column labels that the next cell selects explicitly.
df.columns

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'],
      dtype='object')

In [54]:
# Work on an independent copy restricted to the eight retained columns,
# listed in their existing order.
retained_columns = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
    'ESTIMATED_MEMORY', 'ACTUAL_MEMORY',
]
df = df.copy()[retained_columns]

In [55]:
# Give the two memory columns shorter labels: 'db2' for the estimate and
# 'actual' for the measured value.
memory_column_labels = {'ESTIMATED_MEMORY': 'db2', 'ACTUAL_MEMORY': 'actual'}
df = df.rename(columns=memory_column_labels)

# Separating the dataset into features (X) and target (y)

In [56]:
# Confirm the renamed column labels before choosing features and target.
print(df.columns)

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'db2', 'actual'],
      dtype='object')


In [57]:
# Model inputs: the six operator columns followed by the 'db2' estimate.
X_cols = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
    'db2',
]
# Kept as a one-element list so that df[target] yields a DataFrame.
target = ['actual']

In [58]:
# Feature frame and single-column target frame taken from the cleaned data.
X = df[X_cols]
y = df[target]

In [59]:
# Columns that will be standardized; 'db2' is deliberately not in this list.
features = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
]

# Bringing a subset of the features onto the same scale - using ColumnTransformer

In [60]:
# Standardize only the columns named in `features`; every other column of X
# is passed through unscaled. The scaler is fitted on all rows of X.
scaling_steps = [('stdscaler', StandardScaler(), features)]
c_transf = ColumnTransformer(transformers=scaling_steps, remainder='passthrough')

# The transformed data is returned as a float array.
X_std = c_transf.fit_transform(X).astype(float)

In [61]:
# (rows, columns) of the transformed array.
X_std.shape

(2332, 7)

In [62]:
# First row of the unscaled features, for comparison with the scaled row below.
X.iloc[:1]

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,db2
0,0.0047,1,13134630.0,9,71.773664,8,521.28


In [63]:
# First row of the transformed array.
X_std[0, :]

array([ 5.37064699e-02,  0.00000000e+00,  1.02172860e+00,  8.32788493e-01,
       -5.59525618e-02,  8.32788493e-01,  5.21280000e+02])

# Converting y into a NumPy array

In [64]:
# Convert the target to a NumPy array. For a DataFrame, np.asarray gives the
# same array as `.values`, but it also accepts an ndarray, so re-running this
# cell no longer fails with AttributeError.
y = np.asarray(y)

# Saving the scaled dataset

In [65]:
# Shape of the unscaled feature frame.
X.shape

(2332, 7)

In [66]:
# Shape of the scaled array; it should match the shape of X shown above.
X_std.shape

(2332, 7)

In [67]:
# The transformer output is an array, not a DataFrame, so column labels are lost.
type(X_std)

numpy.ndarray

In [68]:
# Rebuild a labelled frame from the scaled array and append the target.
# Reusing X's labels relies on the transformer output keeping X's column
# order (scaled columns first, passed-through 'db2' last).
X_clean = pd.DataFrame(data=X_std, columns=X.columns).assign(actual=y)

In [69]:
# Preview the first five rows of the scaled features together with the target.
X_clean.head()

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,db2,actual
0,0.053706,0.0,1.021729,0.832788,-0.055953,0.832788,521.28,251.296
1,0.053706,0.0,1.021729,0.832788,-0.055953,0.832788,251.296,251.296
2,0.081201,0.0,1.021729,0.832788,-0.055947,0.832788,251.264,251.248
3,0.053706,0.0,1.021729,0.832788,-0.055953,0.832788,251.248,251.252
4,-0.165491,0.0,0.623686,0.832788,-0.053342,0.832788,171.5,171.316


In [70]:
# Write the scaled features plus target to disk, omitting the row index.
scaled_output_csv = 'job2_scaled_clean.csv'
X_clean.to_csv(scaled_output_csv, index=False)