In [1]:
import os
import pandas as pd
import numpy as np
import json
import uuid
import random
from time import time

In [2]:
# random task_id
cur_id = -1
def gen_task_id():
    global cur_id
    cur_id += 1
    return cur_id

def gen_work_id():
    return random.randint(0,int(2**32))

# random start time in num milliseconds
def gen_start_time(mu):
    return int(time()*1e3 + random.gauss(mu, 400))

def gen_end_time(start, mu):
    return int(start + random.gauss(mu, 400))

def gen_music(mu):
    roll = random.gauss(mu, 0.5)
    if roll <= 0.5:
        return 0
    else:
        return 1
    
def gen_interruptions(mu):
    return np.abs(int(random.gauss(mu, 3)))

def gen_workspace_volume(mu):
    roll = random.gauss(mu, 0.5)
    if roll > 1:
        return 1
    if roll < 0:
        return 0
    return roll
    
def gen_meetings(mu):
    return np.abs(int(random.gauss(mu, 3)))

def gen_breaks(mu):
    return np.abs(int(random.gauss(mu, 5)))

def gen_progress(mu):
    val = np.abs(random.gauss(mu, 0.2))
    if val > 1:
        val = 1
    return val

def gen_row(time_mu, 
            music_mu, 
            interruptions_mu, 
            volume_mu, 
            meetings_mu, 
            breaks_mu, 
            progress_mu):
    start = gen_start_time(time_mu)
    return np.array([
        gen_task_id(),
        gen_work_id(),
        start,
        gen_end_time(start, time_mu),
        gen_music(music_mu),
        0,
        gen_interruptions(interruptions_mu),
        gen_workspace_volume(volume_mu),
        gen_meetings(meetings_mu),
        gen_breaks(breaks_mu),
        gen_progress(progress_mu),
    ])

def gen_quiet_row():
    return gen_row(time_mu=500,
                    music_mu=0.2,
                    interruptions_mu=3,
                    volume_mu=0.1,
                    meetings_mu=2,
                    breaks_mu=6,
                    progress_mu=0.7
                   )

def gen_loud_row():
    return gen_row(time_mu=500,
                    music_mu=0.7,
                    interruptions_mu=7,
                    volume_mu=0.7,
                    meetings_mu=7,
                    breaks_mu=1,
                    progress_mu=0.2
                   )

In [5]:
columns = ['task_id',
           'work_id',
           'start_time',
           'end_time',
           'music',
           'finished',
           'interruptions',
           'noise', 
           'meetings',
           'breaks',
           'progress']

In [6]:
len(columns)

11

In [7]:
loud_dataset = np.array([gen_loud_row() for x in range(1000)])
loud_df = pd.DataFrame(data=loud_dataset, columns=columns)

quiet_dataset = np.array([gen_quiet_row() for x in range(1000)])
quiet_df = pd.DataFrame(data=quiet_dataset, columns=columns)

In [8]:
quiet_df.head()

Unnamed: 0,task_id,work_id,start_time,end_time,music,finished,interruptions,noise,meetings,breaks,progress
0,1000.0,3784285000.0,1541303000000.0,1541303000000.0,0.0,0.0,6.0,0.493477,3.0,0.0,0.452092
1,1001.0,26327420.0,1541303000000.0,1541303000000.0,0.0,0.0,3.0,0.513683,3.0,3.0,0.575144
2,1002.0,2826882000.0,1541303000000.0,1541303000000.0,0.0,0.0,7.0,0.0,1.0,4.0,0.769796
3,1003.0,496667600.0,1541303000000.0,1541303000000.0,0.0,0.0,3.0,0.692333,0.0,9.0,0.777183
4,1004.0,1817932000.0,1541303000000.0,1541303000000.0,0.0,0.0,3.0,0.077958,5.0,11.0,1.0


In [9]:
DATA_DIR = "data"

In [10]:
loud_df.to_json(os.path.join(DATA_DIR, "loud.json"))
loud_df.to_csv(os.path.join(DATA_DIR, "loud.csv"), index=False)
quiet_df.to_json(os.path.join(DATA_DIR, "quiet.json"))
quiet_df.to_csv(os.path.join(DATA_DIR, "quiet.csv"), index=False)

In [15]:
# need to disable date conversion to keep using unix time
test_load_df = pd.read_json(os.path.join(DATA_DIR, "quiet.json"), convert_dates=False)

In [16]:
test_load_df.head()

Unnamed: 0,breaks,end_time,finished,interruptions,meetings,music,noise,progress,start_time,task_id,work_id
0,0,1541303061918,0,6,3,0,0.493477,0.452092,1541303061283,1000,3784284505
1,3,1541303061792,0,3,3,0,0.513683,0.575144,1541303061704,1001,26327424
10,9,1541303062415,0,0,3,1,0.001718,0.862475,1541303061960,1010,1047167674
100,8,1541303061656,0,2,2,0,0.147226,0.650836,1541303061683,1100,2996770684
101,10,1541303061608,0,2,3,1,0.0,1.0,1541303060502,1101,684580325


In [17]:
x = test_load_df.iloc[0]

In [18]:
x.to_json()

'{"breaks":0.0,"end_time":1541303061918.0,"finished":0.0,"interruptions":6.0,"meetings":3.0,"music":0.0,"noise":0.4934773453,"progress":0.4520924592,"start_time":1541303061283.0,"task_id":1000.0,"work_id":3784284505.0}'