In [13]:
import pandas as pd
import urllib.parse
import os

In [14]:
# Source CSV that is read into the frame, and the directory the parquet chunks go to.
csv_path = 'loadtest_stats_history.csv'
parquet_dir_path = 'loadtest_stats_history'

# exist_ok=True makes this safe to re-run and removes the check-then-create race of
# "if not isdir: mkdir"; makedirs also creates missing parents if the path becomes nested.
os.makedirs(parquet_dir_path, exist_ok=True)

def df_to_parquet(df, target_dir, chunk_size=1000000, **parquet_wargs):
    """Writes pandas DataFrame to parquet format with pyarrow.

    The frame is cut into consecutive row slices of at most ``chunk_size`` rows and
    each slice is written to ``<target_dir>/part_NNNN.parquet.gzip`` (NNNN is the
    zero-based chunk number). The starting row offset of every chunk is printed as
    a simple progress indicator. An empty DataFrame produces no files.

    Args:
        df: DataFrame
        target_dir: local directory where parquet files are written to. It is
            created (including missing parents) if it does not exist yet.
        chunk_size: number of rows stored in one chunk of parquet file. Defaults to 1000000.
        **parquet_wargs: extra keyword arguments forwarded to ``DataFrame.to_parquet``.
            ``engine`` and ``compression`` default to ``"pyarrow"`` and ``"gzip"``
            and may be overridden here; the file name suffix stays ``.parquet.gzip``.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated message and a negative one
    # silently writes nothing, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    # Do not rely on the caller having created the directory beforehand.
    os.makedirs(target_dir, exist_ok=True)

    # Caller-supplied options win over the defaults instead of colliding with them
    # (passing compression=... used to raise a duplicate-keyword TypeError).
    write_options = {"engine": "pyarrow", "compression": "gzip", **parquet_wargs}

    for i in range(0, len(df), chunk_size):
        print(i)
        slc = df.iloc[i : i + chunk_size]
        chunk = i // chunk_size  # exact integer chunk number, no float round-trip
        fname = os.path.join(target_dir, f"part_{chunk:04d}.parquet.gzip")
        slc.to_parquet(fname, **write_options)

In [15]:
# Read the whole CSV at csv_path into a single in-memory DataFrame.
df = pd.read_csv(csv_path)

In [16]:
# Flag each row by the prefix of its 'Name' value.
# na=False keeps the flags strictly boolean: with the default, a missing Name yields NaN,
# which cannot be used as a boolean mask in .loc and counts as truthy in plain Python checks.
df['is_agg'] = df['Name'].str.startswith('Aggregated', na=False)
df['is_user_get'] = df['Name'].str.startswith('/user/get/', na=False)
df['is_user_search'] = df['Name'].str.startswith('/user/search', na=False)

In [17]:
# Collapse the three flags into one label per row using vectorized mask assignment
# instead of a row-wise df.apply, which calls a Python lambda once per row.
# Labels are written from lowest to highest priority so later writes win, which
# reproduces the precedence agg > get > search; rows matching no flag stay None.
df['op_type'] = None
for label, flag_column in (('search', 'is_user_search'), ('get', 'is_user_get'), ('agg', 'is_agg')):
    # astype(bool) follows Python truthiness, so the mask is valid even if a flag holds NaN.
    df.loc[df[flag_column].astype(bool), 'op_type'] = label

In [18]:
# Row count per op_type label; dropna=False also lists rows whose op_type is missing.
df['op_type'].value_counts(dropna=False)

get       4607705
search    4303756
agg           414
Name: op_type, dtype: int64

In [19]:
# For rows flagged is_user_get, store everything after the first 10 characters of 'Name'
# in 'get_id' (10 is the length of the '/user/get/' prefix); other rows are left unset.
get_rows = df['is_user_get']
df.loc[get_rows, 'get_id'] = df.loc[get_rows, 'Name'].str[10:]

In [20]:
# For rows flagged is_user_search, decode the value of the FIRST '&'-separated parameter
# into 'search_fname'.
# Offset 13 is one past the 12-character '/user/search' prefix - presumably it skips the
# '?' separator as well; TODO confirm against the raw names.
search_rows = df['is_user_search']
fname_param = df.loc[search_rows, 'Name'].str.slice(13).str.split('&').apply(lambda params: params[0])
# Each parameter looks like 'key=value'; keep the percent-decoded value part.
df.loc[search_rows, 'search_fname'] = fname_param.str.split('=').apply(lambda pair: urllib.parse.unquote(pair[1]))

In [21]:
# For rows flagged is_user_search, decode the value of the SECOND '&'-separated parameter
# into 'search_lname'.
# Offset 13 is one past the 12-character '/user/search' prefix - presumably it skips the
# '?' separator as well; TODO confirm against the raw names.
search_rows = df['is_user_search']
lname_param = df.loc[search_rows, 'Name'].str.slice(13).str.split('&').apply(lambda params: params[1])
# Each parameter looks like 'key=value'; keep the percent-decoded value part.
df.loc[search_rows, 'search_lname'] = lname_param.str.split('=').apply(lambda pair: urllib.parse.unquote(pair[1]))

In [22]:
# Remove the raw 'Name' column from the frame.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=['Name'], errors='ignore')

In [23]:
# Number of non-null Timestamp values per (User Count, op_type) pair;
# dropna=False keeps groups whose key contains a missing value.
df.groupby(['User Count', 'op_type'], dropna=False)[['Timestamp']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Timestamp
User Count,op_type,Unnamed: 2_level_1
0,agg,14
0,get,215313
0,search,187252
1,agg,159
1,get,299570
1,search,292202
10,agg,107
10,get,985938
10,search,962974
100,agg,73


In [36]:
# df

Unnamed: 0,Timestamp,User Count,Type,Requests/s,Failures/s,50%,66%,75%,80%,90%,...,Total Min Response Time,Total Max Response Time,Total Average Content Size,is_agg,is_user_get,is_user_search,op_type,get_id,search_fname,search_lname
0,1688988589,0,,0.0,0.0,,,,,,...,0.000000,0.000000,0.0,True,False,False,agg,,,
1,1688988590,0,,0.0,0.0,,,,,,...,0.000000,0.000000,0.0,True,False,False,agg,,,
2,1688988591,0,,0.0,0.0,,,,,,...,0.000000,0.000000,0.0,True,False,False,agg,,,
3,1688988592,0,,0.0,0.0,,,,,,...,0.000000,0.000000,0.0,True,False,False,agg,,,
4,1688988593,0,,0.0,0.0,,,,,,...,0.000000,0.000000,0.0,True,False,False,agg,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8911870,1688989342,0,GET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1933.252333,1933.252333,7289.0,False,False,True,search,,Полина,Большакова
8911871,1688989342,0,GET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19.109541,19.109541,7551.0,False,False,True,search,,Полина,Бородина
8911872,1688989342,0,GET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.534667,27.534667,6927.0,False,False,True,search,,Полина,Бочарова
8911873,1688989342,0,GET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.501792,34.501792,5683.0,False,False,True,search,,Полина,Булгакова


In [35]:
# Write the frame to parquet_dir_path with df_to_parquet's default chunk size;
# the numbers printed below are the starting row offsets of each chunk.
df_to_parquet(df, parquet_dir_path)

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
