In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Base name from which every output file name in this notebook is derived.
dataset_name = "computer_activity"

In [3]:
# Directory holding the raw input file and directory receiving all outputs.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front: neither `DataFrame.to_csv` nor
# `open(..., 'w')` creates missing parent directories, so without this every
# write performed later in the notebook fails on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Name of the raw data file inside `input_dir`.
inp_fname = 'cpu_act.data'

# Output files, all named after `dataset_name`:
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')                          # full dataset
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')              # training split
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')                # test split without the target
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')        # id + target of the test split
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')     # sample inference request

# Read Data

In [4]:
# Column names for the raw file, which is read without a header row.
# Kept as whitespace-separated text and split into a list of 22 names,
# in the order the columns appear in the file.
col_names = """
    lread lwrite scall sread swrite fork exec rchar wchar
    pgout ppgout pgfree pgscan atch pgin ppgin pflt vflt
    runqsz freemem freeswap usr
""".split()

In [5]:
# Load the raw file. It has no header row, so column names come from `col_names`.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=col_names, header=None)
data.head()

Unnamed: 0,lread,lwrite,scall,sread,swrite,fork,exec,rchar,wchar,pgout,...,pgscan,atch,pgin,ppgin,pflt,vflt,runqsz,freemem,freeswap,usr
0,6,2,1036,103,114,1.0,1.0,172076,355965,0.0,...,0.0,0.0,2.0,4.0,73.6,89.0,2.0,6527,1851864,90
1,1,0,2165,205,101,0.4,1.2,43107,44139,4.8,...,181.4,0.2,85.4,88.2,19.4,161.8,3.0,130,1131931,88
2,62,77,3806,258,166,1.4,1.4,492142,268706,4.8,...,79.2,2.2,7.6,12.2,68.0,218.8,5.2,256,1314590,85
3,5,0,4721,256,177,0.99,2.58,524787,174964,14.51,...,189.86,1.99,4.17,24.85,95.63,248.91,1.0,233,972606,81
4,42,55,3949,249,244,2.6,4.6,197289,529200,4.2,...,0.0,1.4,1.8,2.2,219.6,297.2,3.4,331,1013805,79


In [6]:
# Names of the identifier column and of the prediction target column.
id_col, target_col = "id", "usr"

In [7]:
# Number of rows per distinct target value.
data.loc[:, target_col].value_counts()

90    459
91    448
92    426
94    421
93    411
97    410
96    410
95    405
88    384
98    378
89    376
87    338
86    283
0     283
85    254
84    252
83    230
81    201
82    187
80    166
79    150
77    144
78    126
76    119
75    104
74     96
72     77
73     73
99     60
69     51
71     49
68     46
70     42
67     39
66     36
63     32
64     27
62     27
65     25
59     23
60     20
58     17
61     16
57     14
56     11
55     10
1      10
54      7
53      5
51      4
50      4
52      2
46      1
49      1
48      1
2       1
Name: usr, dtype: int64

# Shuffle Data

In [8]:
# Randomly reorder all rows (frac=1 keeps every row). The fixed seed makes the
# order reproducible; the original index labels are kept on the rows.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,lread,lwrite,scall,sread,swrite,fork,exec,rchar,wchar,pgout,...,pgscan,atch,pgin,ppgin,pflt,vflt,runqsz,freemem,freeswap,usr
5670,1,0,400,45,49,1.2,1.2,40302,24635,0.0,...,0.0,0.0,0.4,0.6,86.2,106.0,2.0,4062,1779392,91
5369,14,0,806,101,60,0.4,0.4,175267,8453,3.39,...,91.42,0.6,9.58,27.15,23.15,99.2,2.4,137,1735008,84
2111,13,1,4137,197,96,10.2,33.6,209720,39281,6.6,...,9.4,3.8,3.8,3.8,542.6,838.0,1.0,226,1032648,62
6659,0,0,1498,172,43,0.2,0.2,13410,23242,0.0,...,0.0,0.0,0.0,0.0,15.6,16.8,1.6,5805,1818256,94
5227,19,9,3744,393,329,6.8,4.4,209666,21515,1.0,...,37.2,0.6,6.2,6.2,356.4,572.2,1.3,235,1073950,80


# Insert Id Column

In [9]:
# Add a sequential identifier (0..N-1, in the current row order) as the first
# column, unless an id column is already present; then store ids as strings.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())
data[id_col] = data[id_col].astype(str)

      id  lread  lwrite  scall  sread  swrite  fork  exec   rchar  wchar  ...  \
5670   0      1       0    400     45      49   1.2   1.2   40302  24635  ...   
5369   1     14       0    806    101      60   0.4   0.4  175267   8453  ...   
2111   2     13       1   4137    197      96  10.2  33.6  209720  39281  ...   
6659   3      0       0   1498    172      43   0.2   0.2   13410  23242  ...   
5227   4     19       9   3744    393     329   6.8   4.4  209666  21515  ...   

      pgscan  atch  pgin  ppgin    pflt   vflt  runqsz  freemem  freeswap  usr  
5670    0.00   0.0  0.40   0.60   86.20  106.0     2.0     4062   1779392   91  
5369   91.42   0.6  9.58  27.15   23.15   99.2     2.4      137   1735008   84  
2111    9.40   3.8  3.80   3.80  542.60  838.0     1.0      226   1032648   62  
6659    0.00   0.0  0.00   0.00   15.60   16.8     1.6     5805   1818256   94  
5227   37.20   0.6  6.20   6.20  356.40  572.2     1.3      235   1073950   80  

[5 rows x 23 columns]


# Save Main Data File

In [10]:
# Write the complete dataset (id column included) without the DataFrame index.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [11]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split
# reproducible. `train_test_split` is imported in the notebook's import cell.
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)

# Sanity check: the two parts together must account for every row.
assert len(data_train) + len(data_test) == len(data), "train/test split lost or duplicated rows"

print(data_train.shape, data_test.shape)

# The training file keeps the target. The test file omits it, and the expected
# answers are written separately as an (id, target) key file.
data_train.to_csv(outp_train_fname, index=False)
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(7372, 23) (820, 23)


# JSON inference request instance

In [12]:
# Build a sample inference request from the first test row: NaN values are
# mapped to None (serialised as JSON null) and the target column is removed.
test_features = data_test.replace({np.nan: None}).drop(columns=[target_col])
instance = test_features.iloc[0].to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as pretty-printed UTF-8 JSON.
with open(outp_infer_instances, mode='w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'atch': 0.0,
                'exec': 0.2,
                'fork': 0.2,
                'freemem': 1333,
                'freeswap': 1713242,
                'id': '5670',
                'lread': 3,
                'lwrite': 1,
                'pflt': 22.6,
                'pgfree': 0.0,
                'pgin': 0.2,
                'pgout': 0.0,
                'pgscan': 0.0,
                'ppgin': 0.2,
                'ppgout': 0.0,
                'rchar': 45128,
                'runqsz': 2.0,
                'scall': 540,
                'sread': 96,
                'swrite': 88,
                'vflt': 17.4,
                'wchar': 56460}]}
