In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import neural_network
import seaborn as sns
import matplotlib.cm as cm
import os

In [3]:
pd.set_option('display.max_columns', None)
k = 110

# 1. Loading training and test datasets

In [4]:
def load_data():
    df_train = pd.read_csv('tpcds_train_clean.csv')
    df_test = pd.read_csv('tpcds_test_clean.csv')
    
    return df_train, df_test

In [5]:
df_train, df_test = load_data()

# Clustering 

In [6]:
def get_clusters(k, data, km):
    X = data.drop(columns=['db2','actual']);
    if km != None:
        print('clustering test dataset')
        y_kmeans = km.predict(X)
        
    else:
        print('clustering train dataset')
        km = KMeans(n_clusters = k, 
                        init='k-means++', 
                        n_init=10, 
                        max_iter=300, 
                        random_state=0)
        km.fit(X)
        y_kmeans = km.predict(X)
        
    print('Distortion: %.2f' % km.inertia_)
    df_train = data.copy();
    df_train['cluster'] = np.nan
    for i,e in enumerate(y_kmeans):
        df_train['cluster'].loc[i] = e;
    
    return km, df_train

# Workload

In [7]:
def create_workload(batch_size, data, 
                    k):
    df_data = data[['db2','actual','cluster']]
    df_data = pd.get_dummies(df_data, columns=['cluster'])
    for i in range(0,k):
        c_name = 'cluster_%d.0' % i
        if c_name not in df_data.columns:
            df_data[c_name] = 0
            
    df_batches = pd.DataFrame(columns=df_data.columns)
    
    indices = np.linspace(0, data.shape[0]-1, data.shape[0], dtype=int)
    num_batches = int(np.floor(df_data.shape[0] / batch_size))
        
    for ibat in range(num_batches):
        start = (ibat * batch_size)
        end = (ibat * batch_size + batch_size) - 1
        
        ibat_Y = df_data.loc[indices[start:end], :]
        
        df_batches = df_batches.append(ibat_Y.sum(), ignore_index=True)
        
    return df_batches


# Create Workloads

In [9]:
#batch_sizes = [2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

batch_sizes = [2, 3, 5]

km, df_train_clusters = get_clusters(k, df_train, None)
km, df_test_clusters  = get_clusters(k, df_test, km)

for batch_size in batch_sizes:
    workload_train = create_workload(batch_size,df_train_clusters,k)
    workload_test = create_workload(batch_size,df_test_clusters,k)
    file_name_train = 'train_workloads_final_%s_clusters_%s_batch.csv' % (k, batch_size)
    file_name_test  = 'test_workloads_final_%s_clusters_%s_batch.csv' % (k, batch_size)
    workload_train.to_csv(file_name_train ,index=False)
    workload_test.to_csv(file_name_test ,index=False)
    print("batch = %s is done" % (batch_size))
    

clustering train dataset
Distortion: 10.22
clustering test dataset
Distortion: 10.22
batch = 35 is done
batch = 45 is done
