In [1]:
from figure_utils import save_table
import pandas as pd
# https://docs.google.com/spreadsheets/d/1irZHhFP2NfIXyPqcMDqKStY_DY6nldomHGwXuCSBl20/edit#gid=1226257673

In [26]:
# Read the tab-separated feature table and order its rows alphabetically by dimension.
df = pd.read_csv("tables/feature-engineering/table.tsv", sep="\t")
df = df.sort_values(by="Dimension")

# Turn 'Dimension' into an ordered categorical so that sorting on it follows
# this explicit order instead of the alphabetical one.
dimension_order = ['Data', 'Data \\& Model', 'Hardware', 'Model', 'Dependent']
df['Dimension'] = pd.Categorical(
    df['Dimension'],
    categories=dimension_order,
    ordered=True,
)

def col_to_math(col):
    """Wrap every present entry of ``col`` in LaTeX inline-math delimiters.

    Parameters
    ----------
    col : pandas.Series
        Column whose entries are math snippets; entries may be missing.

    Returns
    -------
    pandas.Series
        Series with the same index as ``col``. A present value ``x`` becomes
        the string ``"$x$"``; a missing value (NaN/None) becomes ``""``.
    """
    # Missing cells map to an empty string instead of being formatted as the
    # literal text "$nan$", which would otherwise leak into the rendered table.
    return col.apply(lambda x: "" if pd.isna(x) else f"${x}$")
# Fill missing 'Type' entries with "N" and wrap the symbol and formula cells
# in inline-math delimiters, all in one step.
df = df.assign(
    Type=df['Type'].fillna("N"),
    Symbol=col_to_math(df['Symbol']),
    Formula=col_to_math(df['Formula']),
)

def to_latex(df, column_format=None):
    if column_format is None:
        column_format = "lp{0.35\linewidth}p{0.10\linewidth}>{\\footnotesize}p{0.2\linewidth}p{0.08\linewidth}p{0.15\linewidth}"
    tab = df.sort_values(["Dimension", "Feature"]).set_index(list(df.columns)).to_latex(
        index=True, escape=False, column_format=column_format, na_rep=""
    )
    tab = (
        tab.replace("$$nan$$", "")
        .replace("$nan$", "")
        .replace("\midrule", "\midrule\midrule")
        .replace("\cline{1-6} \cline{2-6} \cline{3-6} \cline{4-6} \cline{5-6}", "")
        .replace("\cline{2-6} \cline{3-6} \cline{4-6} \cline{5-6}\n", "")
        .replace("\cline{2-4} \cline{3-4}", "")
        .replace("\cline{1-4} \cline{2-4} \cline{3-4}", "")
        .replace("Dependent", "\\textbf{Dependent}")
        .replace("\cline{1-4}", "")
    )
    return tab

# Render the complete feature table with the default column layout and pass
# the LaTeX source to save_table together with its target path.
full_table_tex = to_latex(df)
save_table(full_table_tex, "chapters/05_cost_estimation/auto-generated/feature-table.tex")

\begin{tabular}{lp{0.35\linewidth}p{0.10\linewidth}>{\footnotesize}p{0.2\linewidth}p{0.08\linewidth}p{0.15\linewidth}}
\toprule
Dimension & Feature & Symbol & Formula & Type & Notes \\
\midrule\midrule
\multirow[t]{10}{*}{Data} & Dataset size (rows, columns) & $r_T, c_T$ &  & N &  \\
 & Feature ratio & $\rho$ & $\frac{n_S}{\sum_{k=1}^p n_k} $ & N &  \\
 & Join type & $j_t$ &  & C &  \\
 & Selectivity & $\sigma$ & $\frac{\sum_{k=1}^{n}r_{S_k}}{r_T}$ & N &  \\
 & Sparsity & $e_T$ & $\frac{nnz(T)}{r_T\times c_T}$ & N &  \\
 & Sparsity ratio &  & $\frac{e_T}{e_S}$ & N &  \\
 & Tuple ratio & $\tau$ & $\frac{\sum_{k=1}^p d_k}{d_S}$ & N &  \\
 & \# Base tables & $n$ &  & N &  \\
 & \# Non-zero values & $nnz(T)$ & $nnz(S) = \sum_{k=1}^{n}nnz(S_k)$ & N &  \\
 & \# Sparse base tables ($e < 0.05$) & $q$ & $|\{S_k \in S| e_{S_k} < 0.05\}|$ & N & From \cite{MorpheusFI} \\

\multirow[t]{4}{*}{Data \& Model} & Complexity  & $M_{FLOP}$, $F_{FLOP}$ &  & N & For each operator \\
 & Complexity ratio &  &

In [28]:
# Features shown in the condensed table. Names must match the 'Feature'
# column exactly, including the LaTeX-escaped '\#'.
to_keep = ['\\# Base tables', 'Sparsity ratio',
       'Tuple ratio', 'Dataset size (rows, columns)',
       'Sparsity','Complexity ratio', 'Memory ratio',
       'Execution Time', 'Performance ratio', 'Time saved',
       'Compute type', 'GPU memory bandwidth', 'GPU processing power',
        '\\# Iterations', 'Operator']

# Report requested names that match no row instead of dropping them silently.
# NOTE(review): the saved output lists only two Hardware rows although three
# Hardware features are requested above - confirm the spelling in the table.
unmatched = sorted(set(to_keep) - set(df["Feature"]))
if unmatched:
    print(f"Warning: no table row matches these features: {unmatched}")

# Select the rows with a boolean mask rather than dropping by index label, so
# the selection stays correct even if the index contains duplicate labels.
subset_df = df.loc[df["Feature"].isin(to_keep)].drop(columns=["Type", "Notes"])

# Four left-aligned columns: Dimension, Feature, Symbol, Formula.
col_format = "llll"
save_table(to_latex(subset_df, column_format=col_format), "chapters/05_cost_estimation/auto-generated/feature-table-subset.tex")

\begin{tabular}{llll}
\toprule
Dimension & Feature & Symbol & Formula \\
\midrule\midrule
\multirow[t]{5}{*}{Data} & Dataset size (rows, columns) & $r_T, c_T$ &  \\

 & Sparsity & $e_T$ & $\frac{nnz(T)}{r_T\times c_T}$ \\

 & Sparsity ratio &  & $\frac{e_T}{e_S}$ \\

 & Tuple ratio & $\tau$ & $\frac{\sum_{k=1}^p d_k}{d_S}$ \\

 & \# Base tables & $n$ &  \\
 
\multirow[t]{2}{*}{Data \& Model} & Complexity ratio &  & $\frac{FLOP_M}{FLOP_F}$ \\

 & Memory ratio &  & $\frac{\text{bytes}_M}{\text{bytes}_F}$ \\
 
\multirow[t]{2}{*}{Hardware} & Compute type &  &  \\

 & GPU memory bandwidth &  &  \\
 
\multirow[t]{2}{*}{Model} & Operator &  &  \\

 & \# Iterations & $iter$ &  \\
 
\multirow[t]{3}{*}{\textbf{Dependent}} & Execution Time & $\text{Time}_M$, $\text{Time}_F$ &  \\

 & Performance ratio &  & $\frac{\text{Time}_M}{\text{Time}_F}$ \\

 & Time saved &  & $\text{Time}_M - \text{Time}_F$ \\
 
\bottomrule
\end{tabular}

