In [1]:
import os
import sys
import re
import pandas as pd
import numpy as np
import argparse
import datetime
import time
from sklearn import metrics
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
# Raw input: one sub-directory per application, each holding numbered CSV traces.
dataset_dir = '../data/raw'
# Per-application summary CSVs are written here.
meta_dir = '../data/metadata'
# CDF tables: written by get_size_cdf / get_interval_cdf, then read back.
size_cdf_file = '../data/cdf/size_cdf.csv'
interval_cdf_file = '../data/cdf/interval_cdf.csv'
# Applications whose traces are converted to bin-index sequences below.
apps = ['app_11', 'app_49', 'app_131', 'app_147', 'app_182', 'app_199', 'app_207', 'app_212', 'app_265', 'app_275']
# Per-application bin-index sequences (one text file per app) are written here.
size_dir = '../data/size'
interval_dir = '../data/interval'

# Create every output directory up front. The parents of the two CDF files are
# included so that writing the CDF tables does not fail on a fresh checkout.
for _out_dir in (size_dir, meta_dir, interval_dir,
                 os.path.dirname(size_cdf_file), os.path.dirname(interval_cdf_file)):
    os.makedirs(_out_dir, exist_ok=True)

In [3]:
def get_size_cdf(dataset_dir, out_file='../data/cdf/size_cdf.csv'):
    """Pool the ``size`` column of every trace and save its binned CDF as CSV.

    Every ``*.csv`` file found in each sub-directory of ``dataset_dir`` is read
    and its ``size`` column is appended to one pooled list. The pooled values
    are histogrammed over a fixed list of bin edges and the cumulative,
    normalised counts are written to ``out_file`` with two columns:

    * ``size`` - the bin edge;
    * ``cdf``  - fraction of the histogrammed values that fall in bins ending
      at or before that edge (``0`` for the first edge).

    Values outside the span of the bin edges are not counted by
    ``np.histogram`` and therefore do not contribute to the CDF.

    Args:
        dataset_dir: Directory containing one sub-directory per application.
        out_file: Destination CSV path. Missing parent directories are created.

    Returns:
        None. The table is written to ``out_file``.
    """
    sizes = []
    for app in os.listdir(dataset_dir):
        app_dir = os.path.join(dataset_dir, app)
        # Stray files next to the application folders would make the inner
        # os.listdir() raise, so only descend into real directories.
        if not os.path.isdir(app_dir):
            continue
        for file in os.listdir(app_dir):
            # Only CSV traces are parsed; anything else in the folder is ignored.
            if not file.endswith('.csv'):
                continue
            data = pd.read_csv(os.path.join(app_dir, file))
            sizes.extend(data['size'].values)

    # Bin edges start at 40 and end just below 2.2e6; the step grows with the
    # magnitude of the value (100 below 2500, up to 25000 above 500000).
    bins = np.concatenate(([40, 50, 75], np.arange(100, 2500, 100), np.arange(2500, 5000, 250), np.arange(5000, 10000, 500), np.arange(10000, 25000, 1000), np.arange(25000, 50000, 2500), np.arange(50000, 100000, 5000), np.arange(100000, 500000, 10000), np.arange(500000, 2200000, 25000)))
    counts, bins = np.histogram(sizes, bins=bins)
    cdf = np.cumsum(counts)
    cdf = cdf / cdf[-1]
    # There is one more edge than there are bins, so the CDF is left-padded
    # with 0 to pair every edge with a value.
    size_cdf = pd.DataFrame({'size': bins, 'cdf': [0] + list(cdf)})

    # Make sure the destination folder exists before writing.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    size_cdf.to_csv(out_file, index=False, float_format='%.6f')


def get_interval_cdf(dataset_dir, out_file='../data/cdf/interval_cdf.csv'):
    """Pool the gaps between consecutive ``time`` values and save their binned CDF.

    Every ``*.csv`` file found in each sub-directory of ``dataset_dir`` is
    read; the differences between consecutive entries of its ``time`` column
    are appended to one pooled list. The pooled gaps are histogrammed over a
    fixed list of bin edges and the cumulative, normalised counts are written
    to ``out_file`` with two columns:

    * ``interval`` - the bin edge;
    * ``cdf``      - fraction of the histogrammed gaps that fall in bins
      ending at or before that edge (``0`` for the first edge).

    Gaps outside the span of the bin edges (including negative gaps) are not
    counted by ``np.histogram`` and therefore do not contribute to the CDF.

    Args:
        dataset_dir: Directory containing one sub-directory per application.
        out_file: Destination CSV path. Missing parent directories are created.

    Returns:
        None. The table is written to ``out_file``.
    """
    intervals = []
    for app in os.listdir(dataset_dir):
        app_dir = os.path.join(dataset_dir, app)
        # Stray files next to the application folders would make the inner
        # os.listdir() raise, so only descend into real directories.
        if not os.path.isdir(app_dir):
            continue
        for file in os.listdir(app_dir):
            # Only CSV traces are parsed; anything else in the folder is ignored.
            if not file.endswith('.csv'):
                continue
            data = pd.read_csv(os.path.join(app_dir, file))
            timestamps = data['time'].values
            intervals.extend(np.diff(timestamps))

    # Bin edges run from 0 to 70 with a step that grows with the magnitude.
    # NOTE(review): the first range stops at 1.9e-5 while the next one starts
    # at 2e-5, so the edge spacing is irregular there - confirm this is intended.
    bins = np.concatenate((np.arange(0, 1.9e-5, 1e-6), np.arange(2e-5, 2e-4, 1e-5), np.arange(2e-4, 5e-4, 2.5e-5), np.arange(5e-4, 1e-3, 5e-5), np.arange(1e-3, 2e-3, 1e-4), np.arange(2e-3, 5e-3, 2.5e-4), np.arange(5e-3, 1e-2, 5e-4), np.arange(1e-2, 2e-2, 1e-3), np.arange(2e-2, 5e-2, 2.5e-3), np.arange(5e-2, 1e-1, 5e-3), np.arange(1e-1, 2e-1, 1e-2), np.arange(2e-1, 5e-1, 2.5e-2), np.arange(5e-1, 1, 5e-2), np.arange(1, 2, 1e-1), np.arange(2, 5, 2.5e-1), np.arange(5, 10, 5e-1), np.arange(10, 25, 1), np.arange(25, 50, 2.5), np.arange(50, 71, 5)))
    counts, bins = np.histogram(intervals, bins=bins)
    cdf = np.cumsum(counts)
    cdf = cdf / cdf[-1]
    # There is one more edge than there are bins, so the CDF is left-padded
    # with 0 to pair every edge with a value.
    interval_cdf = pd.DataFrame({'interval': bins, 'cdf': [0] + list(cdf)})

    # Make sure the destination folder exists before writing.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    interval_cdf.to_csv(out_file, index=False, float_format='%.6f')
    
def get_cdf(x, cdf):
    """Count the entries of ``cdf`` that are ``<= x`` and subtract one.

    When ``cdf`` is sorted in ascending order the result is the position of
    the last entry that does not exceed ``x``. If no entry is ``<= x`` the
    result is ``-1``.

    Args:
        x: Scalar value to locate.
        cdf: Array of values to compare against; the callers in this notebook
            pass the bin-edge column of a CDF table.

    Returns:
        The count of entries ``<= x`` minus one, as a NumPy integer.
    """
    not_above = cdf <= x
    return np.sum(not_above) - 1

def extract_number(file_path):
    """Return the integer N from a path whose file name is exactly ``N.csv``.

    The file name may be preceded by a directory part that uses either ``/``
    or ``\\`` as separator, or by nothing at all, so the function also works
    for bare file names and for paths built by ``os.path.join`` on Windows.

    Args:
        file_path: Path or file name to inspect.

    Returns:
        The number as ``int``, or ``None`` when the file name is not made of
        digits followed by ``.csv``.
    """
    # The digits must start right after a separator or at the very beginning,
    # so names such as "x12.csv" are still rejected.
    match = re.search(r'(?:^|[\\/])(\d+)\.csv$', file_path)
    if match:
        return int(match.group(1))
    return None

In [4]:
# Regenerate both CDF tables on disk from the raw traces ...
get_size_cdf(dataset_dir, out_file=size_cdf_file)
get_interval_cdf(dataset_dir, out_file=interval_cdf_file)
# ... then load them back; later cells look values up against their edge columns.
size_cdf, interval_cdf = (pd.read_csv(path) for path in (size_cdf_file, interval_cdf_file))

In [10]:
# Convert every trace of every selected application into two sequences of bin
# indices (sizes and inter-arrival gaps) and record per-trace summary figures.
# For each application three files are written:
#   <size_dir>/<app>.txt      - one line per trace, comma-separated size-bin indices
#   <interval_dir>/<app>.txt  - one line per trace, comma-separated gap-bin indices
#   <meta_dir>/<app>.csv      - one row per trace with the summary figures

# The bin edges do not change inside the loops, so fetch them once.
size_edges = size_cdf['size'].values
interval_edges = interval_cdf['interval'].values

for app in apps:
    size_file = os.path.join(size_dir, '{}.txt'.format(app))
    interval_file = os.path.join(interval_dir, '{}.txt'.format(app))
    meta_file = os.path.join(meta_dir, '{}.csv'.format(app))

    app_dir = os.path.join(dataset_dir, app)
    # Order traces by the number in their file name (0.csv, 3.csv, 10.csv, ...)
    # so that line i of every output file refers to the same trace.
    files = sorted((os.path.join(app_dir, f) for f in os.listdir(app_dir)), key=extract_number)

    # Size-bin indices of the current application, keyed by trace path.
    # NOTE(review): nothing in the visible notebook reads this dict - confirm
    # whether it is still needed.
    indices = {}

    # Context managers guarantee the three outputs are flushed and closed even
    # if one of the traces fails to parse.
    with open(size_file, 'w') as size_f, \
            open(interval_file, 'w') as interval_f, \
            open(meta_file, 'w') as meta_f:
        meta_f.write('load,mean_size,mean_interval,start_time,end_time,flow_num\n')

        for file in files:
            data = pd.read_csv(file)
            sizes = data['size'].values
            times = data['time'].values
            intervals = np.diff(times)

            # Map each value to the index of the last bin edge that is <= it.
            size_index = data['size'].apply(lambda x: get_cdf(x, size_edges))
            interval_index = np.array([get_cdf(x, interval_edges) for x in intervals])
            indices[file] = size_index
            size_f.write(','.join(size_index.values.astype(str)) + '\n')
            interval_f.write(','.join(interval_index.astype(str)) + '\n')

            # Summary row: total size divided by the covered time span, mean
            # size, mean gap, first and last timestamp, number of rows.
            time_scale = times[-1] - times[0]
            stats = (np.sum(sizes) / time_scale, np.mean(sizes), np.mean(intervals), times[0], times[-1], len(data))
            meta_f.write("{},{},{},{},{},{}\n".format(*stats))
            print(app, file, "{:.2f},{:.2f},{:.2f},{:.2f},{:.2f},{:.2f}".format(*stats))

app_11 ../data/raw/app_11/0.csv 212529.01,28384.49,0.13,0.07,119.87,897.00
app_11 ../data/raw/app_11/3.csv 22535.60,3277.88,0.15,0.11,119.67,822.00
app_11 ../data/raw/app_11/4.csv 175564.60,27314.75,0.16,0.04,119.84,770.00
app_11 ../data/raw/app_11/10.csv 21457.75,3091.24,0.14,0.01,119.87,832.00
app_11 ../data/raw/app_11/11.csv 185689.53,26150.64,0.14,0.06,119.90,851.00
app_11 ../data/raw/app_11/15.csv 223965.13,34672.51,0.16,0.00,119.83,774.00
app_11 ../data/raw/app_11/20.csv 185955.40,30326.89,0.16,0.15,119.86,734.00
app_11 ../data/raw/app_11/21.csv 237694.09,30593.09,0.13,0.27,119.71,928.00
app_11 ../data/raw/app_11/23.csv 21741.14,3164.39,0.15,0.03,119.96,824.00
app_11 ../data/raw/app_11/25.csv 208172.97,31418.13,0.15,0.00,119.68,793.00
app_11 ../data/raw/app_11/27.csv 2908.76,1045.55,0.36,0.09,119.42,332.00
app_11 ../data/raw/app_11/28.csv 19450.82,3050.65,0.16,0.00,119.83,764.00
app_11 ../data/raw/app_11/30.csv 154009.49,25100.49,0.16,0.04,119.99,736.00
app_11 ../data/raw/app_11/

In [9]:
import pandas as pd

# Example dict: each key maps to the values that will form one row.
data = {'key1': [1, 2, 3], 'key2': [4, 5, 6], 'key3': [7, 8, 9]}

# Build the frame, transpose it so the dict keys become the rows, then turn
# the index (the keys) into an ordinary column.
df = pd.DataFrame(data).T.reset_index()
# Rename the columns.
df.columns = ['key', 'value1', 'value2', 'value3']

print(df)

    key  value1  value2  value3
0  key1       1       2       3
1  key2       4       5       6
2  key3       7       8       9
