In [42]:
%%HTML
<script>
    // Flag read by code_toggle(): when true, the next call hides the input cells.
    code_show=true; 
    // Hide or show every `div.input` element depending on code_show, then flip
    // the flag so the following call does the opposite.
    // NOTE(review): relies on `$` being provided by the hosting page (presumably jQuery).
    function code_toggle() {
        if (code_show) { $('div.input').hide(); }
        else { $('div.input').show(); }
        code_show = !code_show
    } 
// Run the toggle once when the document is ready; code_show starts as true,
// so this first call hides the inputs.
$( document ).ready(code_toggle);
</script>

<form action="javascript:code_toggle()">
    <b><input type="submit" value="单击此处切换代码显示"></b>
</form>

# 一次性剔除数据的趋势项 

## 原始数据
时长500秒

In [43]:
from os import listdir, path

folder = 'tdms_files'
# os.listdir() returns names in arbitrary order, so sort before slicing: the
# first-50 selection and the later positional picks (file_list[0], [1], [2])
# must be stable across machines and runs.
# NOTE(review): assumes the file names sort chronologically - confirm.
file_list = [path.join(folder, file_name) for file_name in sorted(listdir(folder))][0:50]

In [44]:
from read_tdms_file import read_tdms_files

# Names of the two channels to load from every file.
channel_list = [u'NLHQ-X-03-S08', u'NLHQ-X-03-S09']
# read_tdms_files is a project helper; judging by the output shown below it
# yields one timestamp-indexed frame with a column per requested channel.
raw_data = read_tdms_files(file_list, channel_list)

In [45]:
import pandas as pd

# Cap DataFrame displays at 20 rows for the rest of the notebook.
pd.set_option('display.max_rows', 20)
raw_data

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:00.000,219.117111,-828.362610
2018-03-11 10:00:00.020,219.414963,-828.970520
2018-03-11 10:00:00.040,219.210724,-828.194641
2018-03-11 10:00:00.060,218.844788,-828.614563
2018-03-11 10:00:00.080,218.921371,-827.882690
2018-03-11 10:00:00.100,218.129944,-827.966675
2018-03-11 10:00:00.120,218.942657,-828.466614
2018-03-11 10:00:00.140,218.972443,-827.926636
2018-03-11 10:00:00.160,218.712875,-828.346619
2018-03-11 10:00:00.180,219.397934,-828.506592


In [46]:
# 定义绘制Pandas DataFrame曲线的函数
from pyecharts import Line

def dataframe_line_plot(title, df):
    """Build a pyecharts ``Line`` chart with one line series per DataFrame column.

    Parameters
    ----------
    title : str
        Text shown as the chart title (centred).
    df : pandas.DataFrame
        Frame to plot. Every index entry must provide ``timestamp()``; the
        index becomes the shared time axis and each column becomes a series
        named after the column.

    Returns
    -------
    Line
        A new ``Line`` object whose ``_option`` dict has been updated with the
        title, legend, axes, a slider data zoom and the series.
    """
    # x coordinates shared by every series: timestamp() seconds divided by
    # 0.001, i.e. milliseconds for the 'time' axis.
    epoch_ms = [stamp.timestamp()/0.001 for stamp in df.index]

    # One line series per column, as (time, value) pairs without point symbols.
    series = [
        {
            'type': 'line',
            'showSymbol': False,
            'name': column,
            'data': list(zip(epoch_ms, df[column].values.tolist()))
        }
        for column in df.columns
    ]

    # Neither axis line is pinned to the zero of the other axis.
    free_axis_line = {'onZero': False}
    # Slider on the first x axis, initially showing the first half of the range.
    zoom_slider = {
        'type': 'slider',
        'xAxisIndex': 0,
        'start': 0,
        'end': 50
    }

    chart_option = {
        'title': {'text': title, 'left': 'center'},
        'legend': {
            'show': True,
            'orient': 'vertical',
            'right': '9%',
            'top': '3%'
        },
        'xAxis': {'type': 'time', 'axisLine': dict(free_axis_line)},
        'yAxis': {'axisLine': dict(free_axis_line)},
        'dataZoom': [zoom_slider],
        'series': series
    }

    # The option dict is merged straight into the chart's internal option
    # store rather than going through Line's own methods.
    chart = Line()
    chart._option.update(chart_option)
    return chart

In [47]:
# Plot both raw channels against time.
line_plot = dataframe_line_plot('原始数据绘图', raw_data)
line_plot

## 减采样（周期=5秒）并计算新周期内的众数
固定时间区间内的众数较好地反映了趋势项，但是时间区间的宽度必须合适，区间太宽不能认为该区间中趋势保持不变，区间太窄可能导致众数不能代表趋势。

In [48]:
from scipy.stats import mode

def _first_mode(values):
    """Return the smallest most-frequent value of ``values``, or NaN when there is none.

    Uses pandas' own ``mode()`` instead of ``scipy.stats.mode(x)[0][0]``:
    the double indexing only works while SciPy returns the mode as an array
    and fails once it returns a scalar. An empty window yields NaN instead of
    raising.
    """
    modes = values.mode()  # sorted ascending, so the first entry is the smallest
    return modes.iloc[0] if len(modes) else float('nan')


# Mode of each channel inside right-closed, right-labelled 5 s windows.
mode_5s = raw_data.resample('5s', closed='right', label='right').apply(_first_mode)

In [49]:
# Show the per-window modes (one row per 5 s window label).
mode_5s

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:00,219.117111,-828.362610
2018-03-11 10:00:05,218.989456,-828.602539
2018-03-11 10:00:10,218.525665,-828.254639
2018-03-11 10:00:15,219.163910,-827.570679
2018-03-11 10:00:20,218.538422,-829.802429
2018-03-11 10:00:25,218.861801,-827.746704
2018-03-11 10:00:30,219.210724,-828.586548
2018-03-11 10:00:35,219.636215,-828.614563
2018-03-11 10:00:40,219.146896,-828.482605
2018-03-11 10:00:45,218.878830,-828.894531


In [50]:
# Plot the per-window modes for both channels.
line_plot = dataframe_line_plot('5秒内众数绘图', mode_5s)
line_plot

## 计算5秒众数的滑动平均值
为进一步平滑趋势项，将每个区间及其左右相邻区间的众数的平均值作为该区间的趋势项。

In [51]:
# Smooth the per-window modes with a centred 3-window rolling mean;
# min_periods=1 lets the first and last windows average over the neighbours
# that exist instead of becoming NaN.
# NOTE(review): this rebinds mode_5s, so re-running the cell smooths the
# already-smoothed values again.
mode_5s = mode_5s.rolling(3, center=True, min_periods=1).mean()

In [52]:
# Show the smoothed modes (mode_5s now holds the rolling means).
mode_5s

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:00,219.053284,-828.482574
2018-03-11 10:00:05,218.877411,-828.406596
2018-03-11 10:00:10,218.893010,-828.142619
2018-03-11 10:00:15,218.742666,-828.542582
2018-03-11 10:00:20,218.854711,-828.373271
2018-03-11 10:00:25,218.870316,-828.711894
2018-03-11 10:00:30,219.236247,-828.315938
2018-03-11 10:00:35,219.331278,-828.561239
2018-03-11 10:00:40,219.220647,-828.663900
2018-03-11 10:00:45,218.962509,-828.822550


In [53]:
# Plot the rolling mean of the per-window modes.
line_plot = dataframe_line_plot('众数滑动平均值绘图', mode_5s)
line_plot

## 剔除趋势项后的数据
首先对上一步获取的趋势项增采样（周期=0.02秒），然后删除最后一项以保证趋势项数据与原数据长度相同。

In [54]:
# Upsample the smoothed 5 s trend to a 20 ms grid, filling the new points by
# interpolation (linear by default).
trend = mode_5s.resample('20ms').interpolate()
# Align the trend to the raw samples by timestamp instead of dropping the last
# row by position. This still removes the extra trailing trend sample, and it
# also stays correct when the raw data does not end exactly one sample before
# a 5 s boundary.
trend = trend.reindex(raw_data.index)
detrended_data = raw_data - trend

In [55]:
# Plot the raw data with the interpolated trend subtracted.
line_plot = dataframe_line_plot('剔除趋势项的数据绘图', detrended_data)
line_plot

# 剔除乱序到来数据包的趋势项

## 3个原始数据包
**注意**：最后一个数据包最先到达

In [56]:
from read_tdms_file import read_tdms_file

# Load three packets deliberately out of order: df1 comes from the third file,
# df2 from the first and df3 from the second, mimicking the last packet
# arriving first.
df1 = read_tdms_file(file_list[2], channel_list=channel_list)
df2 = read_tdms_file(file_list[0], channel_list=channel_list)
df3 = read_tdms_file(file_list[1], channel_list=channel_list)

In [57]:
# Packet read from file_list[2].
df1

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:10.000,219.712814,-827.826660
2018-03-11 10:00:10.020,219.427719,-827.842651
2018-03-11 10:00:10.040,219.036255,-827.706665
2018-03-11 10:00:10.060,219.334106,-828.246643
2018-03-11 10:00:10.080,219.466019,-828.510559
2018-03-11 10:00:10.100,219.810669,-828.986511
2018-03-11 10:00:10.120,218.768188,-829.130493
2018-03-11 10:00:10.140,218.955414,-828.614563
2018-03-11 10:00:10.160,218.751175,-828.898499
2018-03-11 10:00:10.180,219.763870,-827.890686


In [58]:
# Packet read from file_list[0].
df2

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:00.000,219.117111,-828.362610
2018-03-11 10:00:00.020,219.414963,-828.970520
2018-03-11 10:00:00.040,219.210724,-828.194641
2018-03-11 10:00:00.060,218.844788,-828.614563
2018-03-11 10:00:00.080,218.921371,-827.882690
2018-03-11 10:00:00.100,218.129944,-827.966675
2018-03-11 10:00:00.120,218.942657,-828.466614
2018-03-11 10:00:00.140,218.972443,-827.926636
2018-03-11 10:00:00.160,218.712875,-828.346619
2018-03-11 10:00:00.180,219.397934,-828.506592


In [59]:
# Packet read from file_list[1].
df3

Unnamed: 0,NLHQ-X-03-S08,NLHQ-X-03-S09
2018-03-11 10:00:05.000,219.414963,-828.202637
2018-03-11 10:00:05.020,218.844788,-828.134644
2018-03-11 10:00:05.040,220.172348,-827.182739
2018-03-11 10:00:05.060,219.512817,-827.010803
2018-03-11 10:00:05.080,219.648987,-827.882690
2018-03-11 10:00:05.100,219.759613,-828.258606
2018-03-11 10:00:05.120,219.180939,-829.014526
2018-03-11 10:00:05.140,219.066040,-829.358459
2018-03-11 10:00:05.160,218.376740,-829.222473
2018-03-11 10:00:05.180,218.495880,-829.298462


In [60]:
# 定义Pandas DataFrame重组的函数
def convert_df(df):
    """Return a copy of ``df`` re-indexed by ``(start_time, timestamp)``.

    The first level, ``start_time``, repeats the first entry of the original
    index on every row; the second level, ``timestamp``, is the original
    index. Column labels and per-column dtypes are kept, and the input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty frame to re-index.

    Returns
    -------
    pandas.DataFrame
        New frame with a two-level MultiIndex named
        ``['start_time', 'timestamp']``.

    Raises
    ------
    IndexError
        If ``df`` has no rows, so there is no first index entry to use as
        ``start_time``.
    """
    if len(df.index) == 0:
        raise IndexError('convert_df() needs at least one row to derive start_time')

    index = pd.MultiIndex.from_product(
        [[df.index[0]], df.index], names=['start_time', 'timestamp'])
    # Copy instead of rebuilding from df.values: going through a single ndarray
    # would upcast columns of different dtypes to one common dtype.
    converted = df.copy()
    converted.index = index
    return converted

## 组合为多重索引数据包
* 第一层索引为数据包的起始时间
* 第二层索引为每个数据点对应的时间

**注意**：在组合的过程中已对数据包进行升序排列

In [61]:
# Give every packet a (start_time, timestamp) MultiIndex.
# NOTE(review): df1..df3 are rebound here, so re-running this cell would feed
# already-converted frames back into convert_df.
df1 = convert_df(df1)
df2 = convert_df(df2)
df3 = convert_df(df3)

# Merge the three packets into a single frame; combine_first takes the rows
# that are missing from the left operand from the right one.
df4 = df1.combine_first(df2).combine_first(df3)
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,NLHQ-X-03-S08,NLHQ-X-03-S09
start_time,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-11 10:00:00,2018-03-11 10:00:00.000,219.117111,-828.362610
2018-03-11 10:00:00,2018-03-11 10:00:00.020,219.414963,-828.970520
2018-03-11 10:00:00,2018-03-11 10:00:00.040,219.210724,-828.194641
2018-03-11 10:00:00,2018-03-11 10:00:00.060,218.844788,-828.614563
2018-03-11 10:00:00,2018-03-11 10:00:00.080,218.921371,-827.882690
2018-03-11 10:00:00,2018-03-11 10:00:00.100,218.129944,-827.966675
2018-03-11 10:00:00,2018-03-11 10:00:00.120,218.942657,-828.466614
2018-03-11 10:00:00,2018-03-11 10:00:00.140,218.972443,-827.926636
2018-03-11 10:00:00,2018-03-11 10:00:00.160,218.712875,-828.346619
2018-03-11 10:00:00,2018-03-11 10:00:00.180,219.397934,-828.506592


### 获取第一层索引值的无重复列表

In [62]:
# First-level index value of every row, then the distinct packet start times.
start_time_index = df4.index.get_level_values(level='start_time')
unique_values = start_time_index.unique()
# Index.format() is deprecated in recent pandas; astype(str) is the
# recommended replacement for getting the formatted strings.
unique_values.astype(str).tolist()

['2018-03-11 10:00:00', '2018-03-11 10:00:05', '2018-03-11 10:00:10']

### 通过第一层索引值无重复列表的各个项可获得原始数据包

In [63]:
# Selecting by the first start_time returns that packet, indexed by timestamp only.
df4.loc[unique_values[0]]

Unnamed: 0_level_0,NLHQ-X-03-S08,NLHQ-X-03-S09
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-11 10:00:00.000,219.117111,-828.362610
2018-03-11 10:00:00.020,219.414963,-828.970520
2018-03-11 10:00:00.040,219.210724,-828.194641
2018-03-11 10:00:00.060,218.844788,-828.614563
2018-03-11 10:00:00.080,218.921371,-827.882690
2018-03-11 10:00:00.100,218.129944,-827.966675
2018-03-11 10:00:00.120,218.942657,-828.466614
2018-03-11 10:00:00.140,218.972443,-827.926636
2018-03-11 10:00:00.160,218.712875,-828.346619
2018-03-11 10:00:00.180,219.397934,-828.506592


### 从第一个数据包减除所有数据包的众数

In [64]:
# df4.mode() is computed over all packets together; its row 0 holds one mode
# per column, which is subtracted from every row of the first packet.
df4.loc[unique_values[0]]-df4.mode().loc[0].values

Unnamed: 0_level_0,NLHQ-X-03-S08,NLHQ-X-03-S09
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-11 10:00:00.000,0.127655,-1.099854
2018-03-11 10:00:00.020,0.425507,-1.707764
2018-03-11 10:00:00.040,0.221268,-0.931885
2018-03-11 10:00:00.060,-0.144669,-1.351807
2018-03-11 10:00:00.080,-0.068085,-0.619934
2018-03-11 10:00:00.100,-0.859512,-0.703918
2018-03-11 10:00:00.120,-0.046799,-1.203857
2018-03-11 10:00:00.140,-0.017014,-0.663879
2018-03-11 10:00:00.160,-0.276581,-1.083862
2018-03-11 10:00:00.180,0.408478,-1.243835


### 从组合数据包中删除第一个数据包

In [65]:
# Returns a new frame without the rows of the first packet; df4 itself is not
# reassigned here.
df4.drop(index=unique_values[0], level='start_time')

Unnamed: 0_level_0,Unnamed: 1_level_0,NLHQ-X-03-S08,NLHQ-X-03-S09
start_time,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-11 10:00:05,2018-03-11 10:00:05.000,219.414963,-828.202637
2018-03-11 10:00:05,2018-03-11 10:00:05.020,218.844788,-828.134644
2018-03-11 10:00:05,2018-03-11 10:00:05.040,220.172348,-827.182739
2018-03-11 10:00:05,2018-03-11 10:00:05.060,219.512817,-827.010803
2018-03-11 10:00:05,2018-03-11 10:00:05.080,219.648987,-827.882690
2018-03-11 10:00:05,2018-03-11 10:00:05.100,219.759613,-828.258606
2018-03-11 10:00:05,2018-03-11 10:00:05.120,219.180939,-829.014526
2018-03-11 10:00:05,2018-03-11 10:00:05.140,219.066040,-829.358459
2018-03-11 10:00:05,2018-03-11 10:00:05.160,218.376740,-829.222473
2018-03-11 10:00:05,2018-03-11 10:00:05.180,218.495880,-829.298462


### 从索引值无重复列表中删除一个元素

In [66]:
# Index.delete() returns a new index without position 0; unique_values is not
# modified. astype(str) replaces the deprecated Index.format().
unique_values.delete(0).astype(str).tolist()

['2018-03-11 10:00:05', '2018-03-11 10:00:10']