# 曼哈顿压缩

In [2]:
# Per-axis absolute offsets between two points
def compute_distance(x1, y1, x2, y2):
    """Return ``(|x1 - x2|, |y1 - y2|)`` for the points (x1, y1) and (x2, y2).

    The two offsets are returned separately (not summed), so callers can
    compare each axis against a threshold on its own.
    """
    offset_x = abs(x1 - x2)
    offset_y = abs(y1 - y2)
    return offset_x, offset_y
# Decide whether the current event is noise by comparing it with the next event
def is_noise(row, next_event, delta):
    """Return True when ``next_event`` is more than ``delta`` away on either axis.

    Elements 1 and 2 of both ``row`` and ``next_event`` are used as the x and
    y coordinates. An event whose successor is that far away is treated as an
    isolated (noise) point by the caller.
    """
    gap_x, gap_y = compute_distance(row[1], row[2], next_event[1], next_event[2])
    # Too far on x: noise.
    if gap_x > delta:
        return True
    # Too far on y: noise.
    if gap_y > delta:
        return True
    # The next event is close on both axes.
    return False
# Append one aggregated point to the CSV file
def write_to_csv(temp_x, temp_y, count, path):
    """Append the mean point ``(temp_x / count, temp_y / count)`` as one CSV row.

    ``temp_x`` and ``temp_y`` are coordinate sums and ``count`` is the number
    of summed events. The row is written to ``path`` in append mode, without
    a header line and without an index column.
    """
    mean_point = [temp_x / count, temp_y / count]
    single_row = pd.DataFrame(data=[mean_point])
    single_row.to_csv(path, mode='a', header=False, index=False)
# Choose the event that clustering starts from
def select_start_point(df, delta):
    """Return the first row whose successor lies within ``delta`` on both axes.

    A row with a close successor has a neighbour, so it is unlikely to be an
    isolated noise event and is a reasonable first clustering baseline.

    Columns at positions 1 and 2 are read as the x and y coordinates, which
    is the same positional convention the rest of this file uses for event
    rows.

    Args:
        df: Event table with at least three columns.
        delta: Largest per-axis offset (inclusive) for two consecutive
            events to count as neighbours.

    Returns:
        A tuple ``(index, row)``: the index label of the selected row and the
        row itself as a Series.

    Raises:
        ValueError: If no row has a successor within ``delta`` (including
            when ``df`` has fewer than two rows).
    """
    xs = df.iloc[:, 1].to_numpy()
    ys = df.iloc[:, 2].to_numpy()
    # Pair every row with its successor; the last row has none and is
    # therefore never inspected as a candidate.
    neighbours = zip(xs[:-1], ys[:-1], xs[1:], ys[1:])
    for pos, (x1, y1, x2, y2) in enumerate(neighbours):
        # Both axes use an inclusive bound, matching the clustering test.
        if abs(x1 - x2) <= delta and abs(y1 - y2) <= delta:
            return df.index[pos], df.iloc[pos]
    raise ValueError(
        f"no event has a successor within delta={delta}; "
        "cannot choose a clustering start point"
    )
# Manhattan compression
def compress_by_Manhattan(file_name,class_num,delta=5,count_margin=100,nature_flag=True):
    """Compress an event CSV by averaging runs of events that stay near a baseline.

    Events are processed in file order, starting at the row chosen by
    ``select_start_point``. An event whose x and y offsets from the current
    baseline are both at most ``delta`` is added to the current cluster, as
    long as the cluster holds fewer than ``count_margin`` events. Otherwise
    the cluster mean is appended to the output CSV and that event becomes the
    new baseline.

    Two special cases apply to an event that falls outside the cluster:

    * If the cluster holds exactly one event and the outside event is also
      far from the event after it (see ``is_noise``), the outside event is
      skipped and the cluster is left untouched.
    * If the outside event is the last row, the pending cluster is written
      only when it holds more than one event, and the last row is dropped.

    A cluster still pending when the input ends is written under the same
    "more than one event" rule, so the trailing cluster is not lost.

    Args:
        file_name: CSV file name, used for both the input and the output.
        class_num: Class id used in the directory name when ``nature_flag``
            is true.
        delta: Largest per-axis offset (inclusive) from the baseline.
        count_margin: Maximum number of events averaged into one output row.
        nature_flag: True to use the natural-data directories, False to use
            the artificial-data directories.

    The output file starts with an ``x,y`` header line followed by one row
    per cluster mean.
    """
    if nature_flag:
        # Natural data paths.
        file_path = f'../../event_csv/split_data/class{class_num}/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/class{class_num}/{file_name}'
    else:
        # Artificial data paths. The output directory is spelled 'articicial';
        # the later stages in this file read from that same spelling, so it
        # must not be "corrected" here alone.
        file_path = f'../../event_csv/split_data/artificial/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/articicial/{file_name}'

    df = pd.read_csv(file_path)
    # The very first events may be noise, so begin at a row with a close successor.
    start,row = select_start_point(df,delta)
    # Column 0 is the timestamp; columns 1 and 2 are the coordinates.
    # .iloc keeps the access explicitly positional on a string-labelled Series.
    baseline_x = row.iloc[1]
    baseline_y = row.iloc[2]

    # Running coordinate sums and size of the cluster being built.
    temp_x = 0
    temp_y = 0
    count = 0
    # Create/truncate the output and write the header line.
    pd.DataFrame(data=[['x','y']]).to_csv(to_file_path,mode='w',header=False,index=False)
    for index,row in df.iterrows():
        if index < start:
            continue
        x = row.iloc[1]
        y = row.iloc[2]
        x_dis,y_dis = compute_distance(baseline_x,baseline_y,x,y)
        # Inside the window and the cluster is not full: accumulate.
        if x_dis <= delta and y_dis <= delta and count < count_margin:
            count = count + 1
            temp_x = temp_x + x
            temp_y = temp_y + y
        else:
            # Last row: there is no next event to compare against.
            if index + 1 >= df.shape[0]:
                if count > 1:
                    write_to_csv(temp_x,temp_y,count,to_file_path)
                break
            next_event = df.iloc[index+1]
            # A lone baseline followed by an isolated event: skip that event.
            if count == 1 and is_noise(row,next_event,delta):
                continue
            write_to_csv(temp_x,temp_y,count,to_file_path)
            # The current event opens the next cluster.
            baseline_x = x
            baseline_y = y
            temp_x = x
            temp_y = y
            count = 1
    else:
        # The loop ended without `break`, i.e. the final row was absorbed
        # into the current cluster. Flush it so the trailing cluster is kept.
        if count > 1:
            write_to_csv(temp_x,temp_y,count,to_file_path)

# PCA

In [3]:
def dimensionality_reduction_PCA(file_name, class_num, nature_flag=True):
    """Run ``PCA_method`` on a Manhattan-compressed CSV and save the result.

    The input is the file written by the Manhattan compression stage; the
    output goes to the ``smooth_by_pca`` sub-directory next to it, as a CSV
    with a single ``value`` column.

    Args:
        file_name: CSV file name, used for both the input and the output.
        class_num: Class id used in the directory name when ``nature_flag``
            is true.
        nature_flag: True for the natural-data directories, False for the
            artificial-data ones.
    """
    if not nature_flag:
        source_path = f'../../event_csv/compress_event_manhattan/articicial/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/articicial/smooth_by_pca/{file_name}'
    else:
        source_path = f'../../event_csv/compress_event_manhattan/class{class_num}/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/class{class_num}/smooth_by_pca/{file_name}'

    compressed_events = pd.read_csv(source_path)
    # PCA_method is defined elsewhere; presumably it keeps only the first
    # principal component, since the result is stored as one column -- confirm.
    reduced = PCA_method(compressed_events)
    result = pd.DataFrame(reduced, columns=['value'])
    result.to_csv(to_file_path, mode='w', header=True, index=False)
    

# 均值压缩

In [3]:
def compress_by_mean(file_name, class_num, chunksize=100, nature_flag=True):
    """Down-sample the ``value`` column by replacing each chunk with its mean.

    The input CSV is read ``chunksize`` rows at a time; for every chunk one
    row holding the chunk mean is appended to the output CSV, which starts
    with a ``value`` header line.

    Args:
        file_name: CSV file name, used for both the input and the output.
        class_num: Class id used in the directory name when ``nature_flag``
            is true.
        chunksize: Number of input rows averaged into one output row.
        nature_flag: True for the natural-data directories, False for the
            artificial-data ones.
    """
    if not nature_flag:
        # Artificial data paths.
        file_path = f'../../event_csv/compress_event_manhattan/articicial/smooth_by_pca/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/articicial/smooth_by_pca/compress_by_mean/{file_name}'
    else:
        file_path = f'../../event_csv/compress_event_manhattan/class{class_num}/smooth_by_pca/{file_name}'
        to_file_path = f'../../event_csv/compress_event_manhattan/class{class_num}/smooth_by_pca/compress_by_mean/{file_name}'

    # Open the input first so a missing input fails before the output is touched.
    reader = pd.read_csv(file_path, chunksize=chunksize, usecols=['value'])
    # Create/truncate the output and write the header line.
    header_row = pd.DataFrame(data=[['value']])
    header_row.to_csv(to_file_path, mode='w', header=False, index=False)
    for block in reader:
        block_mean = pd.DataFrame([block.mean()])
        block_mean.to_csv(to_file_path, index=False, header=False, mode='a')
    

# 二者整合

In [1]:
# Waveform smoothing
def distance_mean_meanline(file_name, class_num=-1, delta=5, count=100, mean_count=100, nature_flag=True):
    """Run the three smoothing stages, in order, for one event file.

    Args:
        file_name: CSV file name shared by every stage.
        class_num: Class id forwarded to every stage.
        delta: Per-axis distance threshold for the Manhattan compression.
        count: Maximum cluster size for the Manhattan compression.
        mean_count: Chunk size for the mean compression.
        nature_flag: Forwarded to every stage to select the data directories.
    """
    # Stage 1: Manhattan-distance compression.
    compress_by_Manhattan(
        file_name,
        class_num,
        delta=delta,
        count_margin=count,
        nature_flag=nature_flag,
    )
    # Stage 2: PCA. The original note also mentions HP filtering; no such
    # step is visible in this stage's code here -- confirm in PCA_method.
    dimensionality_reduction_PCA(
        file_name,
        class_num,
        nature_flag=nature_flag,
    )
    # Stage 3: mean compression.
    compress_by_mean(
        file_name,
        class_num,
        chunksize=mean_count,
        nature_flag=nature_flag,
    )