In [1]:
import pandas as pd
import numpy as np
import os
# Load the saved file-name arrays used throughout the notebook.
# (Presumably: train/test splits of the event CSV names, the full list of event CSV
# names, and the matching label CSV names — confirm against how the .npy files were built.)
train_file_names = np.load('train_file_names.npy')
test_file_names = np.load('test_file_names.npy')
file_names = np.load('file_names.npy')
labels_file_names = np.load('labels_names.npy')
# Directory of the raw event CSV files; file names are appended to this prefix
prefix_path = '../events_csv/'

# 数据清洗
> 作者给出的事件文件时间戳意义不明，经过如下清洗后，我们能够顺利将标签中的时间戳与动作真正时间戳对应上

In [2]:
# Shift timestamps so the first event is at time 0 and drop the trailing noise events
def data_clean(file_path,new_file_path,start_time):
    """Re-base the timestamps of one event CSV and write the cleaned copy.

    The input is read in chunks of 10000 rows (the row at index 1 of the file is
    used as the header). ``start_time`` is subtracted from every timestamp. As soon
    as a chunk ends with a negative shifted timestamp, only the rows before the
    first negative timestamp in that chunk are kept and the rest of the file is
    discarded as noise.

    Parameters
    ----------
    file_path : str
        Path of the raw event CSV; must contain 'timestamp', 'x' and 'y' columns.
    new_file_path : str
        Path of the cleaned CSV to create. It is overwritten and starts with the
        header row ``timestamp,x,y``.
    start_time : number
        Timestamp of the first event; subtracted from every timestamp.

    Returns
    -------
    None
    """
    out_columns = ['timestamp', 'x', 'y']
    reader = pd.read_csv(file_path,header=1,usecols=out_columns,chunksize=10000)
    try:
        # Write the column names first; the data chunks are appended below
        pd.DataFrame(data=[out_columns]).to_csv(new_file_path,header=False,index=False)
        for chunk in reader:
            shifted = chunk['timestamp'] - start_time
            # read_csv(usecols=...) returns columns in file order, not list order,
            # so build the output by column name to match the header written above.
            cleaned = pd.DataFrame(
                {'timestamp': shifted, 'x': chunk['x'], 'y': chunk['y']},
                columns=out_columns,
            )
            if shifted.iloc[-1] < 0:
                # The chunk ends below zero: everything from the first negative
                # timestamp onwards (and all later chunks) is noise.
                first_negative = int((shifted < 0).values.argmax())
                cleaned.iloc[:first_negative,:].to_csv(new_file_path,mode='a',header=False,index=False)
                break
            # Every event in this chunk is valid
            cleaned.to_csv(new_file_path,mode='a',header=False,index=False)
    finally:
        # Release the input file handle even when we stop early or an error occurs
        reader.close()

In [3]:
# Clean every training-set recording
for name in train_file_names:
    file_path = prefix_path + name
    # Only the first data row is needed here: its timestamp is the zero point
    start_time = pd.read_csv(
        file_path,
        header=1,
        usecols=['timestamp'],
        nrows=1,
    ).iat[0, 0]
    # Clean the recording and write it to the new location
    data_clean(
        file_path=file_path,
        new_file_path='../events_csv_clean/' + name,
        start_time=start_time,
    )

In [4]:
# Clean every test-set recording
for name in test_file_names:
    file_path = prefix_path + name
    # Only the first data row is needed here: its timestamp is the zero point
    start_time = pd.read_csv(
        file_path,
        header=1,
        usecols=['timestamp'],
        nrows=1,
    ).iat[0, 0]
    # Clean the recording and write it to the new location
    data_clean(
        file_path=file_path,
        new_file_path='../events_csv_clean/' + name,
        start_time=start_time,
    )

In [7]:
# Clean the label files: re-base both time columns on the first label's start time
for name in labels_file_names:
    # Write the column names first; the re-based rows are appended afterwards
    cols = pd.DataFrame(data=[['class','startTime_usec','endTime_usec']])
    cols.to_csv('../labels_clean/'+name, header=False, index=False)
    label = pd.read_csv('../labels/'+name)
    # Start time of the first label row becomes time 0
    start_time = label.iat[0, 1]
    # Columns 1 and 2 hold the start and end times
    for time_col in (1, 2):
        label.iloc[:, time_col] = label.iloc[:, time_col] - start_time
    label.to_csv('../labels_clean/'+name, mode='a', header=False, index=False)

# 按照标签对对应事件进行切割

In [13]:
# Pre-create one CSV per recording in every class folder, containing only the
# column names, so the splitting step can simply append event rows.
cols = pd.DataFrame(data=[['timestamp','x','y']])
# 11 classes in total: class1 .. class11
for i in range(1, 12):
    for name in file_names:
        cols.to_csv(
            f'../split_data/class{i}/{name}',
            mode='w',
            header=False,
            index=False,
        )

In [2]:
def update_class_info(labels,row_index):
    """Return (class, start time, end time) for one row of the label table.

    The three values are read from the first three columns of ``labels`` at
    position ``row_index`` and returned as a tuple in that order.
    """
    # One scalar lookup per column, in column order 0, 1, 2
    return tuple(labels.iloc[row_index, column] for column in range(3))

In [47]:
def _segment_path(class_num, data_name, flag_count):
    """Return the output CSV path for one labelled segment of a recording."""
    if flag_count == 0:
        return f'../split_data/class{class_num}/{data_name}'
    # data_name[:-4] drops the last four characters (the '.csv' suffix) so the
    # repeat counter can be inserted before the extension.
    return f'../split_data/class{class_num}/{data_name[:-4]}{flag_count}.csv'


# labels is the label DataFrame; data_path is the full path of the event CSV to split
def split_by_timestamp(labels,data_path,data_name,start_row=0):
    """Split one cleaned event CSV into per-class files according to its labels.

    The label rows are walked in order, starting at ``start_row``. Every event whose
    timestamp lies in the closed interval [start time, end time] of the current
    label is appended to ``../split_data/class<class>/<data_name>``. When a label
    has the same class as the label directly before it, its events go to a
    separate, freshly created file ``<data_name without .csv><k>.csv`` (k = 1, 2,
    ... for consecutive repeats) that starts with a ``timestamp,x,y`` header row.

    The function only appends to ``<data_name>`` itself and never writes its
    header; elsewhere the notebook pre-creates these files with a header row.
    The class directories must already exist.

    Parameters
    ----------
    labels : pandas.DataFrame
        Label table whose first three columns are class, start time, end time.
    data_path : str
        Path of the event CSV; its first column is used as the timestamp.
    data_name : str
        File name (ending in '.csv') used for the output files.
    start_row : int, optional
        Index of the first label row to process. Defaults to 0 (all labels).

    Returns
    -------
    None
    """
    # Read the label table once, column by column, so each value keeps its own type
    segments = list(zip(
        labels.iloc[:, 0].tolist(),
        labels.iloc[:, 1].tolist(),
        labels.iloc[:, 2].tolist(),
    ))
    row_index = start_row
    if row_index >= len(segments):
        # No label to process
        return

    # Class, start time and end time of the label currently being filled
    class_num, start_time, end_time = segments[row_index]
    # Number of consecutive repeats of the current class seen so far
    flag_count = 0
    out_path = _segment_path(class_num, data_name, flag_count)

    data_reader = pd.read_csv(data_path,chunksize=200)
    try:
        for chunk in data_reader:
            timestamps = chunk.iloc[:, 0]
            chunk_end = timestamps.max()
            # Keep matching this chunk against labels until it no longer reaches
            # the start of the current label; one chunk may touch several labels.
            while chunk_end >= start_time:
                inside = (timestamps >= start_time) & (timestamps <= end_time)
                if inside.any():
                    chunk.loc[inside].to_csv(out_path,mode='a',header=False,index=False)
                if chunk_end <= end_time:
                    # The current label may continue in the next chunk
                    break
                # The chunk extends past the end of the label: move to the next label
                row_index = row_index + 1
                if row_index >= len(segments):
                    # All labels handled; the remaining events are discarded
                    return
                previous_class = class_num
                class_num, start_time, end_time = segments[row_index]
                if class_num == previous_class:
                    # Same class twice in a row: start a new numbered file with a header
                    flag_count = flag_count + 1
                    out_path = _segment_path(class_num, data_name, flag_count)
                    cols = pd.DataFrame(data=[['timestamp','x','y']])
                    cols.to_csv(out_path,mode='w',header=False,index=False)
                else:
                    flag_count = 0
                    out_path = _segment_path(class_num, data_name, flag_count)
    finally:
        # Release the input file handle, including on the early return above
        data_reader.close()

In [15]:
# Walk the label CSVs and the event CSVs side by side.
# Each label row holds class, start and end; events whose timestamp falls inside a
# label's time range are written to that class's folder by split_by_timestamp.
# label_name is the label file name, data_name is the event file name.
for label_name, data_name in zip(labels_file_names, file_names):
    label_data = pd.read_csv('../labels_clean/' + label_name)
    split_by_timestamp(
        labels=label_data,
        data_path='../events_csv_clean/' + data_name,
        data_name=data_name,
    )

In [44]:
# Manual, one-recording-at-a-time loading of cleaned label files.
# The commented-out lines are loads for other recordings, kept for reference;
# only the last line is active.
# label_data = pd.read_csv('../labels_clean/user25_fluorescent_labels.csv')
# label_data2 = pd.read_csv('../labels_clean/user04_fluorescent_led_labels.csv')
# label_data3 = pd.read_csv('../labels_clean/user02_natural_labels.csv')
# label_data4 = pd.read_csv('../labels_clean/user14_fluorescent_led_labels.csv')
# label_data5 = pd.read_csv('../labels_clean/user01_natural_labels.csv')
# label_data6 = pd.read_csv('../labels_clean/user07_led_labels.csv')
# label_data7 = pd.read_csv('../labels_clean/user09_fluorescent_labels.csv')
# label_data8 = pd.read_csv('../labels_clean/user24_fluorescent_led_labels.csv')
# label_data9 = pd.read_csv('../labels_clean/user22_led_labels.csv')
label_data10 = pd.read_csv('../labels_clean/user12_fluorescent_led_labels.csv')

In [46]:
# Manual, one-recording-at-a-time splitting using the label frames loaded by hand.
# The commented-out lines are calls for other recordings, kept for reference;
# only the last line is active.
# NOTE(review): split_by_timestamp appends (mode='a') to its output files, so running
# this for a recording that has already been split adds the same events again — confirm
# whether this cell is still needed.
# split_by_timestamp(label_data,'../events_csv_clean/user25_fluorescent.csv','user25_fluorescent.csv')
# split_by_timestamp(label_data2,'../events_csv_clean/user04_fluorescent_led.csv','user04_fluorescent_led.csv')
# split_by_timestamp(label_data3,'../events_csv_clean/user02_natural.csv','user02_natural.csv')
# split_by_timestamp(label_data4,'../events_csv_clean/user14_fluorescent_led.csv','user14_fluorescent_led.csv')
# split_by_timestamp(label_data5,'../events_csv_clean/user01_natural.csv','user01_natural.csv')
# split_by_timestamp(label_data6,'../events_csv_clean/user07_led.csv','user07_led.csv')
# split_by_timestamp(label_data7,'../events_csv_clean/user09_fluorescent.csv','user09_fluorescent.csv')
# split_by_timestamp(label_data8,'../events_csv_clean/user24_fluorescent_led.csv','user24_fluorescent_led.csv')
# split_by_timestamp(label_data9,'../events_csv_clean/user22_led.csv','user22_led.csv')
split_by_timestamp(label_data10,'../events_csv_clean/user12_fluorescent_led.csv','user12_fluorescent_led.csv')