In [11]:
import pandas as pd
import urllib.parse
import os

In [17]:
# Input CSV with the load-test statistics history and the directory that will
# receive the parquet parts written from it.
csv_path = 'loadtest_stats_history.csv'
parquet_dir_path = 'loadtest_stats_history'

# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate isdir() test; makedirs also creates
# missing parent directories should the path ever become nested.
os.makedirs(parquet_dir_path, exist_ok=True)

def df_to_parquet(df, target_dir, chunk_size=1000000, **parquet_wargs):
    """Write a pandas DataFrame to a directory of gzip-compressed parquet parts.

    The frame is cut into consecutive row slices of at most ``chunk_size`` rows.
    Each slice is written with the pyarrow engine to
    ``<target_dir>/part_NNNN.parquet.gzip``, where ``NNNN`` is the zero-padded,
    zero-based chunk number. The starting row offset of every chunk is printed
    as a simple progress indicator. An empty frame produces no files.

    Args:
        df: DataFrame to write.
        target_dir: local directory where parquet files are written to; it is
            created (including missing parents) if it does not exist yet.
        chunk_size: maximum number of rows stored in one parquet file.
            Defaults to 1000000.
        **parquet_wargs: extra keyword arguments forwarded unchanged to
            ``DataFrame.to_parquet``.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently write nothing;
    # reject it (and zero) up front with an explicit message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    os.makedirs(target_dir, exist_ok=True)

    # enumerate() yields the chunk number directly as an int, so no float
    # division is needed to derive it from the row offset.
    for chunk, start in enumerate(range(0, len(df), chunk_size)):
        print(start)
        slc = df.iloc[start : start + chunk_size]
        fname = os.path.join(target_dir, f"part_{chunk:04d}.parquet.gzip")
        slc.to_parquet(fname, engine="pyarrow", compression='gzip', **parquet_wargs)

In [3]:
# Read the whole load-test statistics history CSV into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [4]:
# Flag every row by the prefix of its 'Name' value, one boolean column per
# prefix. na=False makes a missing 'Name' count as "no match" instead of
# propagating NaN, so each flag column is strictly boolean and can be used
# directly as a row mask.
name_prefixes = {
    'is_agg': 'Aggregated',
    'is_user_get': '/user/get/',
    'is_user_search': '/user/search',
}
for flag_col, prefix in name_prefixes.items():
    df[flag_col] = df['Name'].str.startswith(prefix, na=False)

In [5]:
# Collapse the three flag columns into a single 'op_type' label per row using
# vectorised masked assignments instead of a Python-level call for every row.
# Precedence is agg > get > search, so labels are assigned from lowest to
# highest priority and a later assignment overrides an earlier one. Rows
# matching no flag keep None. .eq(True) treats a missing flag as "not set"
# rather than as truthy.
df['op_type'] = None
for label, flag_col in (('search', 'is_user_search'), ('get', 'is_user_get'), ('agg', 'is_agg')):
    df.loc[df[flag_col].eq(True), 'op_type'] = label

In [6]:
# Number of rows per operation type; dropna=False also reports rows that have
# no op_type at all.
df.op_type.value_counts(dropna=False)

get       4532846
search    4208899
agg           350
Name: op_type, dtype: int64

In [7]:
# For rows flagged as user-get requests, store in 'get_id' whatever follows the
# first 10 characters of 'Name' (the length of the '/user/get/' prefix). Only
# the flagged rows are assigned.
get_rows = df.is_user_get
df.loc[get_rows, 'get_id'] = df.loc[get_rows, 'Name'].str[10:]

In [8]:
# For rows flagged as user-search requests: drop the first 13 characters of
# 'Name' (the 12-character '/user/search' prefix plus one more character,
# presumably '?'), take the first '&'-separated field, and store the
# URL-decoded text after its '=' in 'search_fname'.
fname_rows = df.is_user_search
fname_field = df.loc[fname_rows, 'Name'].str.slice(13).str.split('&').map(lambda fields: fields[0])
df.loc[fname_rows, 'search_fname'] = fname_field.str.split('=').map(
    lambda pair: urllib.parse.unquote(pair[1])
)

In [9]:
# For rows flagged as user-search requests: drop the first 13 characters of
# 'Name' (the 12-character '/user/search' prefix plus one more character,
# presumably '?'), take the second '&'-separated field, and store the
# URL-decoded text after its '=' in 'search_lname'.
lname_rows = df.is_user_search
lname_field = df.loc[lname_rows, 'Name'].str.slice(13).str.split('&').map(lambda fields: fields[1])
df.loc[lname_rows, 'search_lname'] = lname_field.str.split('=').map(
    lambda pair: urllib.parse.unquote(pair[1])
)

In [10]:
# Remove the raw 'Name' column. Rebinding instead of inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
df = df.drop(columns=['Name'], errors='ignore')

In [15]:
# Count the non-null 'Timestamp' values for every ('User Count', 'op_type')
# pair; dropna=False keeps groups whose key is missing.
by_load_and_op = df.groupby(['User Count', 'op_type'], dropna=False)
by_load_and_op[['Timestamp']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Timestamp
User Count,op_type,Unnamed: 2_level_1
0,agg,1
0,get,36435
0,search,32470
1,agg,153
1,get,326390
1,search,315804
10,agg,91
10,get,1159075
10,search,1111547
100,agg,60


In [18]:
# Write the prepared frame to the parquet directory using the default chunk size.
df_to_parquet(df=df, target_dir=parquet_dir_path)

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
