# Imports and Setup

In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)

# Function Definitions

In [11]:
# loading data
def load_data():
    df_train = pd.read_csv('tpcds_train_clean.csv')
    df_test = pd.read_csv('tpcds_test_clean.csv')
    
    return df_train, df_test

# clustering
def get_clusters(k, data, km):
    X = data.drop(columns=['db2','actual']);
    if km != None:
        print('clustering test dataset')
        y_kmeans = km.predict(X)
        
    else:
        print('clustering train dataset')
        km = KMeans(n_clusters = k, 
                        init='k-means++', 
                        n_init=10, 
                        max_iter=300, 
                        random_state=0)
        km.fit(X)
        y_kmeans = km.predict(X)
        
    print('Distortion: %.2f' % km.inertia_)
    df_train = data.copy();
    df_train['cluster'] = np.nan
    for i,e in enumerate(y_kmeans):
        df_train['cluster'].loc[i] = e;
    return km, df_train

# creating workloads
def create_workload(batch_size, data, k):
    # keeping only the columns that I need
    df_data = data[['db2','actual','cluster']]
    df_data = df_data.astype({'cluster': 'int32'})

    # shuffle the dataframe
    df_data = df_data.sample(frac=1, ignore_index=True)

    # 1-hot encode the cluster column
    df_data = pd.get_dummies(df_data, columns=['cluster'])
    
    # create a dataframe for all workloads
    df_workloads = pd.DataFrame(columns=df_data.columns)
    
    num_batches = int(np.floor(df_data.shape[0] / batch_size))
    #num_batches = 2

    # creating this pair of variables to store the beginning and ending indices for each batch
    first_index = 0
    last_index = 0 
        
    for i in range(num_batches):
        if i > 0:
            first_index = last_index + 1
        
        last_index = first_index + batch_size - 1

        # selecting the rows between first_index and last_index (inclusive) for the present workload
        df_workload = df_data.loc[first_index:last_index, :]

        #print('workload queries:\n', df_workload)

        # summing the column values will create a series. coverting it to a dataframe using to_frame()
        # Transposing the dataframe will return to the orginal dataframe structure, however, all the 
        # column values will be aggregated
        df_workload = df_workload.sum(axis=0).to_frame().T
        #print('workload aggegated:\n', df_workload)

        # adding the new workload to the dataframe of all workloads
        df_workloads = pd.concat([df_workloads, df_workload], ignore_index=True)

    #print(df_workloads)
    return df_workloads

### TEST - create_workload function

In [12]:
# test create_workload() function
'''
df_train, df_test = load_data()
k = 5
km, df_train_clusters = get_clusters(k, df_train.loc[:100, :], None)
create_workload(3, df_train_clusters, k)
'''

'\ndf_train, df_test = load_data()\nk = 5\nkm, df_train_clusters = get_clusters(k, df_train.loc[:100, :], None)\ncreate_workload(3, df_train_clusters, k)\n'

# Execution

In [13]:
df_train, df_test = load_data()
k = 110
batch_sizes = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

#batch_sizes = [2, 3, 5]
#batch_sizes = [2]

km, df_train_clusters = get_clusters(k, df_train, None)
km, df_test_clusters  = get_clusters(k, df_test, km)

for batch_size in batch_sizes:
    workload_train = create_workload(batch_size,df_train_clusters,k)
    workload_test = create_workload(batch_size,df_test_clusters,k)
    file_name_train = 'train_workloads_final_%s_clusters_%s_batch.csv' % (k, batch_size)
    file_name_test  = 'test_workloads_final_%s_clusters_%s_batch.csv' % (k, batch_size)
    workload_train.to_csv(file_name_train ,index=False)
    workload_test.to_csv(file_name_test ,index=False)
    print("batch = %s is done" % (batch_size))

clustering train dataset
Distortion: 10.22
clustering test dataset
Distortion: 10.22
batch = 1 is done
batch = 2 is done
batch = 3 is done
batch = 5 is done
batch = 10 is done
batch = 15 is done
batch = 20 is done
batch = 25 is done
batch = 30 is done
batch = 35 is done
batch = 40 is done
batch = 45 is done
batch = 50 is done
