In [3]:
import glob
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
%matplotlib inline

In [8]:
class dataset:
    """Load sensor CSV logs grouped by directory and slice them around button pushes.

    ``df_master`` holds one row per CSV file with the columns ``title``
    (directory base name), ``fname`` (file base name) and ``dataframe``
    (the cleaned log, or ``None`` when the file never recorded ``rssi.a``).
    """

    # Columns dropped from logs whose path contains "iphone".
    _IOS_UNSAVED_COLUMNS = [
        'magneticField.accuracy',
        'magneticField.field.x',
        'magneticField.field.y',
        'magneticField.field.z',
    ]
    # Columns dropped from every other log.
    _OTHER_UNSAVED_COLUMNS = ['temperature', 'humidity', 'proximity']
    # When a log has no button push, the window is anchored this many index
    # labels before the last row.
    _NO_PUSH_OFFSET = 150

    def _get_dir_list(self, path):
        """Return the paths matching the glob pattern ``path``, sorted for determinism."""
        return sorted(glob.glob(path))

    def _get_file_list_in_dir(self, d):
        """Return the sorted list of ``*.csv`` files directly inside directory ``d``."""
        return sorted(glob.glob(d + '/*.csv'))

    def _get_df_from_csv(self, f):
        """Read one CSV log and return the cleaned frame, or ``None``.

        Cleaning steps: unify column names, drop the unsaved sensor columns,
        discard the rows before the first ``rssi.a`` value that is not -1,
        map ``btn_push == -1`` to 0, turn every remaining -1 in numeric
        columns into NaN and interpolate those gaps.

        Prints ``.`` for a usable file and ``x`` (returning ``None``) when
        ``rssi.a`` is -1 on every row. Raises ``KeyError`` when an expected
        column is missing.
        """
        is_iOS = "iphone" in f
        df = pd.read_csv(f)

        if is_iOS:
            # rename column name to integrate
            df = df.rename(columns={'rssi': 'remote_rssi'})
            # remove unsaved parameters
            df = df.drop(self._IOS_UNSAVED_COLUMNS, axis=1)
        else:
            # rename column name to integrate
            df = df.rename(columns={'rssi(A)': 'rssi.a', 'rssi(B)': 'rssi.b'})
            # remove unsaved parameters
            df = df.drop(self._OTHER_UNSAVED_COLUMNS, axis=1)

        # remove data while rssi is not recorded
        recorded = np.flatnonzero(df['rssi.a'].values != -1)
        if len(recorded) == 0:
            print('x', end='')
            return None
        # np.flatnonzero yields positions, so slice positionally; copy so the
        # assignments below never write into a view of the full log.
        df = df.iloc[recorded[0]:].copy()

        # adjust the value of button pushed data
        df.loc[df['btn_push'] == -1, 'btn_push'] = 0

        # convert -1 to NaN and interpolate, numeric columns only
        # (text columns such as timestamps are left untouched)
        for col in df.select_dtypes(include=[np.number]).columns:
            values = df[col]
            df[col] = values.mask(values == -1).interpolate()

        print('.', end='')

        return df

    def __init__(self, path='data/raw/*_*_0[1-4]*', max_dirs=2):
        """Load every CSV under the directories matching the glob ``path``.

        Parameters
        ----------
        path : str
            Glob pattern selecting the recording directories.
        max_dirs : int or None
            Only the first ``max_dirs`` matching directories (in sorted
            order) are loaded. The default of 2 keeps the historical limit;
            pass ``None`` to load every match.
        """
        self.path = path
        print("path = " + path)
        dirs = self._get_dir_list(path)
        if max_dirs is not None:
            dirs = dirs[:max_dirs]

        rows = []
        for _d in dirs:
            print(_d)
            title = os.path.basename(_d)
            for csv_path in self._get_file_list_in_dir(_d):
                rows.append([title,
                             os.path.basename(csv_path),
                             self._get_df_from_csv(csv_path)])
            print('')

        self.df_master = pd.DataFrame(rows, columns=['title', 'fname', 'dataframe'])

    def _time_range_around_button_push(self, df, before=0, after=0):
        """Return the rows of ``df`` from ``before`` labels before to ``after`` labels after the push.

        The anchor is the index label of the first row with ``btn_push == 1``.
        Without any push it falls back to the last index label minus 150.
        The slice is label-based and inclusive at both ends. An empty frame
        is returned unchanged.
        """
        if len(df.index) == 0:
            return df
        pushed = df.index[(df['btn_push'] == 1).values]
        if len(pushed) > 0:
            point = pushed[0]
        else:
            point = df.index[-1] - self._NO_PUSH_OFFSET
        return df.loc[(point - before):(point + after)]

    def get_data(self, title='.*', fname='.*', before=None, after=0, column=None):
        """Return the list of cleaned frames matching ``title`` / ``fname``.

        Parameters
        ----------
        title, fname : str
            Patterns passed to ``Series.str.contains`` on the respective
            ``df_master`` column.
        before, after : int or None
            Window around the button push, in index labels; ``None`` means 0.
        column : list or None
            Column selection applied to every frame; ``None`` keeps all columns.
        """
        before = 0 if before is None else before
        after = 0 if after is None else after

        # extract the [title, fname, dataframe]
        dfm = self.df_master[self.df_master['title'].str.contains(title)]
        dfm = dfm[dfm['fname'].str.contains(fname)]
        # to list of dataframe (files without any rssi reading are stored as None)
        dfml = [d for d in dfm['dataframe'].tolist() if d is not None]
        # limit the time range
        dfml = [self._time_range_around_button_push(d, before, after) for d in dfml]
        # limit the column
        if column is not None:
            dfml = [d[column] for d in dfml]
        print("extracted %d dataframes" % (len(dfml)))
        return dfml

In [12]:
# Load a single recording directory and preview the first cleaned log.
ds = dataset(path='data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅')
ds.df_master.loc[0, 'dataframe'].head()

path = data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅
data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅
...x..........................


Unnamed: 0,date,rssi.a,rssi.b,remote_rssi,linear_accel[0],linear_accel[1],linear_accel[2],accelerometer[0],accelerometer[1],accelerometer[2],...,game_rotation[0],game_rotation[1],game_rotation[2],game_rotation[3],magnetic[0],magnetic[1],magnetic[2],pressure,light,btn_push
91,2017/04/06 14:22:23.334,-62.0,-62.0,-81.0,0.107452,0.013687,-0.266391,0.587601,2.197723,9.080734,...,0.096745,0.062251,0.681178,0.723023,18.890984,3.055763,-18.23616,,,0.0
92,2017/04/06 14:22:23.354,-66.333333,-63.428571,-81.0,0.149785,0.028881,-0.196661,0.700928,2.234436,9.088989,...,0.096946,0.062199,0.681327,0.722859,18.890984,3.055763,-18.32409,1017.94055,,0.0
93,2017/04/06 14:22:23.376,-70.666667,-64.857143,-81.0,0.192119,0.044076,-0.126932,0.554733,2.21759,9.249832,...,0.09719,0.06188,0.681452,0.722737,18.709404,3.328323,-18.412018,1017.945525,,0.0
94,2017/04/06 14:22:23.395,-75.0,-66.285714,-72.0,-0.086707,0.132139,0.141152,0.397614,2.316269,9.46373,...,0.09719,0.061788,0.681132,0.723046,18.709404,3.419304,-18.587875,1017.9505,110.0,0.0
95,2017/04/06 14:22:23.415,-70.5,-67.714286,-72.0,-0.02378,0.118709,0.021171,0.336319,2.278671,9.353012,...,0.097196,0.062068,0.681002,0.723144,18.709404,3.328514,-18.412018,1017.9505,109.5,0.0


In [11]:
# Column summary of the first cleaned log, then of the master table itself.
ds.df_master['dataframe'].iloc[0].info()
ds.df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 91 to 794
Data columns (total 31 columns):
date                704 non-null object
rssi.a              704 non-null float64
rssi.b              704 non-null float64
remote_rssi         704 non-null float64
linear_accel[0]     704 non-null float64
linear_accel[1]     704 non-null float64
linear_accel[2]     704 non-null float64
accelerometer[0]    704 non-null float64
accelerometer[1]    704 non-null float64
accelerometer[2]    704 non-null float64
gravity[0]          704 non-null float64
gravity[1]          704 non-null float64
gravity[2]          704 non-null float64
gyro[0]             704 non-null float64
gyro[1]             704 non-null float64
gyro[2]             704 non-null float64
rotation[0]         704 non-null float64
rotation[1]         704 non-null float64
rotation[2]         704 non-null float64
rotation[3]         704 non-null float64
rotation[4]         704 non-null float64
game_rotation[0]    704 non-null f

In [36]:
# Extract, for every loaded log, the rows from 6 index labels before the
# button push up to the push itself, restricted to the listed columns.
dfl = ds.get_data(
    before=6,
    after=0,
    column=[
        'rssi.b',
        'rssi.a',
        'linear_accel[0]',
        'linear_accel[1]',
        'linear_accel[2]',
        'btn_push',
    ],
)
dfl[0]

extracted 29 dataframes


Unnamed: 0,rssi.b,rssi.a,linear_accel[0],linear_accel[1],linear_accel[2],btn_push
692,-45.0,-38.0,0.231796,-1.859742,0.484468,0.0
693,-45.0,-39.0,0.044913,-1.579473,0.463062,0.0
694,-45.0,-39.5,-0.057039,-1.348034,0.487368,0.0
695,-45.5,-40.0,-0.14449,-1.16739,0.478381,0.0
696,-46.0,-41.0,-0.049868,-0.897218,0.383633,0.0
697,-48.0,-42.0,0.106869,-0.743984,0.294303,0.0
698,-50.0,-40.5,0.266572,-0.615294,0.18712,1.0


In [47]:
import math
def distance_series(df, x, y, z):
    """Return the row-wise Euclidean norm of three columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three component columns.
    x, y, z : column labels
        Labels of the component columns.

    Returns
    -------
    pandas.Series
        ``sqrt(df[x]**2 + df[y]**2 + df[z]**2)``, indexed like ``df``.
    """
    cx, cy, cz = df[x], df[y], df[z]
    # Vectorised square root instead of a per-element Python lambda.
    return np.sqrt(cx * cx + cy * cy + cz * cz)

def get_mean_std(dfl, x, y, z):
    """Pool several frames and summarise three component columns.

    Returns the tuple ``(x_mean, y_mean, z_mean, std)``: the three means are
    taken over the concatenation of the respective column from every frame
    in ``dfl``, and ``std`` is the standard deviation of the concatenated
    per-row norms produced by ``distance_series``.
    """
    def pooled(label):
        # One long Series holding this column from every frame, in order.
        return pd.concat([frame[label] for frame in dfl])

    norms = pd.concat([distance_series(frame, x, y, z) for frame in dfl])
    spread = norms.std()
    return (pooled(x).mean(), pooled(y).mean(), pooled(z).mean(), spread)

# Pooled component means and norm spread over the first three extracted windows.
get_mean_std(
    dfl[:3],
    'linear_accel[0]',
    'linear_accel[1]',
    'linear_accel[2]',
)

(-0.033050264476190451,
 -1.3138882119047619,
 0.52129487428571419,
 0.46199453180907996)

In [None]:
# NOTE(review): a stray, never-executed fragment ("x.") stood here; it was an
# incomplete expression (SyntaxError) and has been removed.

In [297]:
# Sample for data analysis: reload the single recording directory.
ds = dataset(path='data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅')

data/raw/01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅
...x..........................


In [301]:
# Keep the first three windows (5 index labels before the push up to the push)
# of the linear-acceleration columns for this recording.
dfl = ds.get_data(
    title='01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅',
    before=5,
    after=0,
    column=['linear_accel[0]', 'linear_accel[1]', 'linear_accel[2]'],
)[:3]

extracted 29 dataframes


In [295]:
def _norm(df, columns):
    for c in columns:
        if type(c) == list:
            s = sum([df[_c] for _c in c])
            ave = s.mean()
            
        else:
            s = df[c]
        print(s.mean())
        print(s.std())

def _list_flatten(l):
    _l = []
    for l in l:
        if type(l) == list: _l.extend(l)
        else:               _l.append(l)
    return _l

def _get_data(dfm, title='.*', fname='.*', before=0, after=0, column=None):
    """Variant of ``dataset.get_data`` that accepts grouped column specs.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Master table with ``title``, ``fname`` and ``dataframe`` columns.
    title, fname : str
        Patterns passed to ``Series.str.contains``.
    before, after : int
        Window around the button push, in index labels.
    column : list or None
        Column labels; an entry may itself be a list of labels (a group).
        Groups are flattened for the selection and passed unflattened to
        ``_norm``. ``None`` keeps every column and skips the statistics.

    Returns
    -------
    list of pandas.DataFrame
    """
    # extract the [title, fname, dataframe]
    dfm = dfm[dfm['title'].str.contains(title)]
    dfm = dfm[dfm['fname'].str.contains(fname)]
    # to list of dataframe
    dfml = [d for d in dfm['dataframe'].tolist() if d is not None]
    # limit the time range
    # NOTE(review): relies on the notebook-global `ds` instance for the helper.
    dfml = [ds._time_range_around_button_push(d, before, after) for d in dfml]
    print(column)
    if column is not None:
        # flatten the specified columns, then limit the column
        c_flat = _list_flatten(column)
        dfml = [d[c_flat] for d in dfml]
        # report statistics of the first frame (skipped when nothing matched)
        if dfml:
            _norm(dfml[0], column)
    print("extracted %d dataframes" % (len(dfml)))
    return dfml

# Try the grouped-column variant: the first two axes form one group.
_get_data(
    ds.df_master,
    title='01_01_02_4F実験室_XperiaZ3_カバン_裏上_正常_まっすぐ帰宅',
    before=5,
    after=0,
    column=[['linear_accel[0]', 'linear_accel[1]'], 'linear_accel[2]'],
)[0]

[['linear_accel[0]', 'linear_accel[1]'], 'linear_accel[2]']
-1.030739209
0.469059807658
0.382311265
0.120526993133
extracted 29 dataframes


Unnamed: 0,linear_accel[0],linear_accel[1],linear_accel[2]
693,0.044913,-1.579473,0.463062
694,-0.057039,-1.348034,0.487368
695,-0.14449,-1.16739,0.478381
696,-0.049868,-0.897218,0.383633
697,0.106869,-0.743984,0.294303
698,0.266572,-0.615294,0.18712


In [292]:
l = [['linear_accel[0]', 'linear_accel[1]'], 'linear_accel[2]']
_l = []
for l in l:
    if type(l) == list: _l.extend(l)
    else:               _l.append(l)
_l

['linear_accel[0]', 'linear_accel[1]', 'linear_accel[2]']

In [256]:
# Standard deviation of 2 * linear_accel[0] + linear_accel[1].
# NOTE(review): `df` is not defined in any visible cell — presumably one of
# the extracted windows; confirm before relying on this cell.
df['linear_accel[0]'].mul(2).add(df['linear_accel[1]']).std()

0.58730733561568127

In [223]:
########################## k-NN 
from sklearn.neighbors import NearestNeighbors
#X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
# NOTE(review): `normal` is not defined in any visible cell — confirm where
# the feature matrix comes from before re-running.
X = normal
# Index the samples for 3-nearest-neighbour queries using a ball tree.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=3)
nbrs.fit(X)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [224]:
# Query the fitted index with its own training samples and show, per sample,
# the neighbour distances followed by the neighbour indices.
distances, indices = nbrs.kneighbors(X)
print(distances, indices, sep='\n')

[[ 0.          0.96900979  1.18613318]
 [ 0.          0.47547992  0.56683676]
 [ 0.          0.95248595  0.97963222]
 [ 0.          0.70762178  0.87519611]
 [ 0.          0.62291934  0.73165756]
 [ 0.          0.62291934  0.96900979]
 [ 0.          0.99479435  1.00146631]
 [ 0.          0.56683676  0.6744973 ]
 [ 0.          0.7166177   0.80090115]
 [ 0.          0.47547992  0.6744973 ]]
[[0 5 4]
 [1 9 7]
 [2 9 8]
 [3 7 1]
 [4 5 1]
 [5 4 0]
 [6 5 4]
 [7 1 9]
 [8 7 1]
 [9 1 7]]
