# Importing libraries

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed (wide frames are not truncated).
pd.options.display.max_columns = None

# Loading data

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file carries an 'Unnamed: 0' column (presumably an index written by
# an earlier to_csv call — confirm); it is dropped with the other non-feature columns below.
df = pd.read_csv('Masterfile.csv')

# First look at the data

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,FILENAME,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,SORT,SORT COUNT,FILTER,FILTER COUNT,HS JOIN,HS JOIN COUNT,TEMP,TEMP COUNT,GRPBY,GRPBY COUNT,UNIQUE,UNIQUE COUNT,UNION,UNION COUNT,NHJOIN,NHJOIN COUNT,Queries,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",130320.0,62824.0
1,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62824.0,62824.0
2,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.005288,1,13134630.0,9,,,,,71.829424,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62816.0,62812.0
3,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,0.0047,1,13134630.0,9,,,,,71.773664,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",62812.0,62813.0
4,0,/Users/yonisabokar/IdeaProjects/Database_Parse...,1.6e-05,1,13050370.0,9,,,,,96.419862,8,,,,,,,,,,,"SELECT cn.name AS company_name, lt.link AS li...",42875.0,42829.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2335 non-null   int64  
 1   FILENAME          2335 non-null   object 
 2   TQ                2335 non-null   float64
 3   TQ COUNT          2335 non-null   int64  
 4   TBSCAN            2335 non-null   float64
 5   TBSCAN COUNT      2335 non-null   int64  
 6   SORT              0 non-null      float64
 7   SORT COUNT        0 non-null      float64
 8   FILTER            0 non-null      float64
 9   FILTER COUNT      0 non-null      float64
 10  HS JOIN           2335 non-null   float64
 11  HS JOIN COUNT     2335 non-null   int64  
 12  TEMP              0 non-null      float64
 13  TEMP COUNT        0 non-null      float64
 14  GRPBY             0 non-null      float64
 15  GRPBY COUNT       0 non-null      float64
 16  UNIQUE            0 non-null      float64


## Dropping non-feature columns

In [6]:
# Bookkeeping columns that are not used as model inputs.
cols_non_features = ['Unnamed: 0', 'FILENAME', 'Queries']

# Rebind instead of mutating in place, and tolerate columns that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=cols_non_features, errors='ignore')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2335 non-null   float64
 1   TQ COUNT          2335 non-null   int64  
 2   TBSCAN            2335 non-null   float64
 3   TBSCAN COUNT      2335 non-null   int64  
 4   SORT              0 non-null      float64
 5   SORT COUNT        0 non-null      float64
 6   FILTER            0 non-null      float64
 7   FILTER COUNT      0 non-null      float64
 8   HS JOIN           2335 non-null   float64
 9   HS JOIN COUNT     2335 non-null   int64  
 10  TEMP              0 non-null      float64
 11  TEMP COUNT        0 non-null      float64
 12  GRPBY             0 non-null      float64
 13  GRPBY COUNT       0 non-null      float64
 14  UNIQUE            0 non-null      float64
 15  UNIQUE COUNT      0 non-null      float64
 16  UNION             0 non-null      float64


# Checking Missing Values

In [8]:
df.isnull().sum()

TQ                     0
TQ COUNT               0
TBSCAN                 0
TBSCAN COUNT           0
SORT                2335
SORT COUNT          2335
FILTER              2335
FILTER COUNT        2335
HS JOIN                0
HS JOIN COUNT          0
TEMP                2335
TEMP COUNT          2335
GRPBY               2335
GRPBY COUNT         2335
UNIQUE              2335
UNIQUE COUNT        2335
UNION               2335
UNION COUNT         2335
NHJOIN              2335
NHJOIN COUNT        2335
ESTIMATED_MEMORY       3
ACTUAL_MEMORY          3
dtype: int64

## Dropping columns with all missing values

In [9]:
# Keep only the columns that contain at least one non-null value.
df = df.dropna(axis='columns', how='all')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2335 non-null   float64
 1   TQ COUNT          2335 non-null   int64  
 2   TBSCAN            2335 non-null   float64
 3   TBSCAN COUNT      2335 non-null   int64  
 4   HS JOIN           2335 non-null   float64
 5   HS JOIN COUNT     2335 non-null   int64  
 6   ESTIMATED_MEMORY  2332 non-null   float64
 7   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 146.1 KB


## Dropping rows with missing values

In [11]:
# Discard every row that still has a missing value. The original index labels are
# kept, so the index has gaps afterwards.
df = df.dropna(axis='index')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2332 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TQ                2332 non-null   float64
 1   TQ COUNT          2332 non-null   int64  
 2   TBSCAN            2332 non-null   float64
 3   TBSCAN COUNT      2332 non-null   int64  
 4   HS JOIN           2332 non-null   float64
 5   HS JOIN COUNT     2332 non-null   int64  
 6   ESTIMATED_MEMORY  2332 non-null   float64
 7   ACTUAL_MEMORY     2332 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 164.0 KB


In [13]:
df.head()

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0.0047,1,13134630.0,9,71.773664,8,130320.0,62824.0
1,0.0047,1,13134630.0,9,71.773664,8,62824.0,62824.0
2,0.005288,1,13134630.0,9,71.829424,8,62816.0,62812.0
3,0.0047,1,13134630.0,9,71.773664,8,62812.0,62813.0
4,1.6e-05,1,13050370.0,9,96.419862,8,42875.0,42829.0


In [14]:
df.shape

(2332, 8)

In [15]:
df.columns
# Instead of 'ESTIMATED_SORT_SHRHEAP_TOP' and 'SORT_SHRHEAP_TOP',
# this dataset has 'ESTIMATED_MEMORY' and 'ACTUAL_MEMORY'.

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'],
      dtype='object')

# Converting ESTIMATED_MEMORY and ACTUAL_MEMORY to MB

In [16]:
# Scale both memory columns by 4000 / 1000000 (multiply first, then divide).
# NOTE(review): presumably the raw values count ~4 KB units and the result is MB — confirm
# the unit. This cell is not idempotent: re-running it scales the columns again.
for mem_col in ('ESTIMATED_MEMORY', 'ACTUAL_MEMORY'):
    df[mem_col] = df[mem_col] * 4000 / 1000000

In [17]:
df.head()

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,ESTIMATED_MEMORY,ACTUAL_MEMORY
0,0.0047,1,13134630.0,9,71.773664,8,521.28,251.296
1,0.0047,1,13134630.0,9,71.773664,8,251.296,251.296
2,0.005288,1,13134630.0,9,71.829424,8,251.264,251.248
3,0.0047,1,13134630.0,9,71.773664,8,251.248,251.252
4,1.6e-05,1,13050370.0,9,96.419862,8,171.5,171.316


In [18]:
df.columns

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'ESTIMATED_MEMORY', 'ACTUAL_MEMORY'],
      dtype='object')

In [19]:
# Keep the modelling columns in a fixed order and detach the result from the
# frame it was selected from.
kept_columns = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
    'ESTIMATED_MEMORY', 'ACTUAL_MEMORY',
]
df = df[kept_columns].copy()

In [20]:
# Shorter names for the two memory columns: the estimate becomes 'db2',
# the measured value becomes 'actual'.
memory_column_names = {'ESTIMATED_MEMORY': 'db2', 'ACTUAL_MEMORY': 'actual'}
df = df.rename(columns=memory_column_names)

In [21]:
# Persist the cleaned frame (memory columns now named 'db2' / 'actual'); the index is not written.
df.to_csv('job2-clean.csv', index=False)

# Partitioning the dataset into training and test sets

In [22]:
print(df.columns)

Index(['TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'HS JOIN', 'HS JOIN COUNT',
       'db2', 'actual'],
      dtype='object')


In [23]:
# Column to predict (kept as a list so that df[target] stays a DataFrame).
target = ['actual']

# Model inputs: every other column of the cleaned frame, including 'db2'.
X_cols = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
    'db2',
]

In [24]:
# Inputs and target as DataFrames (`target` is a list, so y is two-dimensional).
X = df[X_cols]
y = df[target]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [25]:
# Columns to standardise; 'db2' is deliberately absent and therefore left unscaled.
# NOTE(review): 'TQ COUNT' scales to 0.0 in every displayed row, which suggests it is
# constant — consider whether it should be a feature at all.
features = [
    'TQ', 'TQ COUNT',
    'TBSCAN', 'TBSCAN COUNT',
    'HS JOIN', 'HS JOIN COUNT',
]

# Bringing a subset of the features onto the same scale - using ColumnTransformer

In [26]:
# Standardise only the columns listed in `features`; every other column is passed
# through unchanged (remainder='passthrough').
scaling_steps = [('stdscaler', StandardScaler(), features)]
c_transf = ColumnTransformer(scaling_steps, remainder='passthrough')

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_std = c_transf.fit_transform(X_train).astype(float)
X_test_std = c_transf.transform(X_test).astype(float)

In [27]:
X_train_std.shape

(1865, 7)

In [28]:
X_train.head(1)

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,db2
812,2.3e-05,1,12666540.0,8,17.359695,7,526.412


In [29]:
# Compare with X_train.head(1): the first six values are standardised, while the
# last one ('db2') is passed through with its original value.
X_train_std[0]

array([-1.61583082e-01,  0.00000000e+00, -1.20888799e+00, -1.22065009e+00,
       -6.34712502e-02, -1.22065009e+00,  5.26412000e+02])

In [30]:
X_test_std.shape

(467, 7)

# Converting y into NumPy arrays

In [31]:
# Convert the targets to NumPy arrays. np.asarray accepts both a DataFrame and an
# ndarray, so re-running this cell is a no-op instead of raising AttributeError
# (an ndarray has no `.values`). The (n_rows, 1) shape is preserved.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Saving training and test datasets

In [32]:
X_train.shape

(1865, 7)

In [33]:
X_train_std.shape

(1865, 7)

In [34]:
type(X_train_std)

numpy.ndarray

In [35]:
# ColumnTransformer emits the transformed `features` columns first, followed by the
# passthrough columns in their original order. Label the array in that order instead
# of assuming it matches X_train.columns.
train_out_cols = list(features) + [c for c in X_train.columns if c not in features]
X_train_clean = pd.DataFrame(X_train_std, columns=train_out_cols)

# Flatten so the target is assigned positionally as a 1-D array, whether y_train is
# still a DataFrame or already an ndarray. A DataFrame would instead be aligned on its
# shuffled index labels and silently mismatch rows.
X_train_clean['actual'] = np.asarray(y_train).ravel()

In [36]:
X_train_clean.head(5)

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,db2,actual
0,-0.161583,0.0,-1.208888,-1.22065,-0.063471,-1.22065,526.412,39.804
1,0.082087,0.0,1.008924,0.819236,-0.057916,0.819236,251.248,251.248
2,-0.118526,0.0,-1.208808,-1.22065,-0.037267,-1.22065,532.976,62.98
3,0.073651,0.0,1.008924,0.819236,-0.057918,0.819236,251.248,251.248
4,-0.162488,0.0,-1.20889,-1.22065,-0.06512,-1.22065,526.412,56.992


In [None]:
# Write the training split (transformed inputs plus the 'actual' column); the index is not written.
X_train_clean.to_csv('job2_train_clean.csv', index=False)

In [38]:
# ColumnTransformer emits the transformed `features` columns first, followed by the
# passthrough columns in their original order. Label the array in that order instead
# of assuming it matches X_test.columns.
test_out_cols = list(features) + [c for c in X_test.columns if c not in features]
X_test_clean = pd.DataFrame(X_test_std, columns=test_out_cols)

# Flatten so the target is assigned positionally as a 1-D array, whether y_test is
# still a DataFrame or already an ndarray. A DataFrame would instead be aligned on its
# shuffled index labels and silently mismatch rows.
X_test_clean['actual'] = np.asarray(y_test).ravel()

In [39]:
X_test_clean.head(5)

Unnamed: 0,TQ,TQ COUNT,TBSCAN,TBSCAN COUNT,HS JOIN,HS JOIN COUNT,db2,actual
0,0.065215,0.0,1.008923,0.819236,-0.05792,0.819236,251.248,251.248
1,0.05678,0.0,1.008923,0.819236,-0.057922,0.819236,251.248,251.252
2,-0.162551,0.0,-1.20889,-1.22065,-0.065235,-1.22065,526.412,57.124
3,-0.145175,0.0,0.635903,0.819236,-0.057915,0.819236,251.248,251.248
4,-0.161825,0.0,-1.208888,-1.22065,-0.063914,-1.22065,526.412,39.804


In [None]:
# Write the test split (transformed inputs plus the 'actual' column); the index is not written.
X_test_clean.to_csv('job2_test_clean.csv', index=False)