In [1]:
import math
import os
import os.path
import numpy as np
import pandas as pd
from scipy import interpolate

In [2]:
# Concatenating a string vector with an integer vector: NumPy promotes the
# result to a common string dtype, so the integers come out as text.
p = np.array(list('abc')).flatten()
q = np.arange(1, 5).flatten()
pq = np.concatenate((p, q))
print(pq, pq.shape, sep='\n')

# Present the seven concatenated entries as a single-row frame.
df = pd.DataFrame(
    pq.reshape(1, -1),
    columns=['c{}'.format(i) for i in range(1, 8)],
)
print(df)

['a' 'b' 'c' '1' '2' '3' '4']
(7,)
  c1 c2 c3 c4 c5 c6 c7
0  a  b  c  1  2  3  4


In [3]:
# Two single-row frames that share c1..c3 but differ in their remaining columns.
df1 = pd.DataFrame(
    [(1, 2, 3, 'hello', 'world')],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
df2 = pd.DataFrame(
    [(1, 1, 13, 'Ahello', 'Aworld')],
    columns=['c1', 'c2', 'c3', 'c6', 'c7'],
)
print(df1, df2, sep='\n')
print("==============================\n\n")

# Row-wise concatenation takes the union of the columns; cells a frame does
# not provide are filled with NaN, and both rows keep their original label 0.
df = pd.concat([df1, df2], axis=0)
print(df)

# Replace the duplicated labels with 0..n-1; the old labels are kept in a new
# 'index' column (drop=False is the default).
df = df.reset_index(drop=False)

   c1  c2  c3     c4     c5
0   1   2   3  hello  world
   c1  c2  c3      c6      c7
0   1   1  13  Ahello  Aworld


   c1  c2  c3     c4     c5      c6      c7
0   1   2   3  hello  world     NaN     NaN
0   1   1  13    NaN    NaN  Ahello  Aworld


In [4]:
# Toy data set: 'time' stamps (not in chronological order), an 'xxx' signal
# running 1..11, and an integer 'plot' label for every sample.
# The label dtype is the builtin `int`: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where using it raises AttributeError.
d = {'time': np.array([29.0, 30.0, 34.0, 31.0, 32.0, 23.0, 25.0, 33.0, 34.5, 35.0, 36.0], dtype=np.double),
     'xxx': np.arange(1, 12, dtype=np.double),
     'plot': np.array([1, 2, 3, 2, 2, 1, 1, 3, 3, 2, 2], dtype=int)}
# Fix the column order explicitly instead of relying on dict ordering.
df = pd.DataFrame(d, columns=['time', 'plot', 'xxx'])
df

Unnamed: 0,time,plot,xxx
0,29.0,1,1.0
1,30.0,2,2.0
2,34.0,3,3.0
3,31.0,2,4.0
4,32.0,2,5.0
5,23.0,1,6.0
6,25.0,1,7.0
7,33.0,3,8.0
8,34.5,3,9.0
9,35.0,2,10.0


In [5]:
# Order the samples chronologically, then relabel the rows 0..n-1 so that the
# row label matches the position in the sorted frame.
sdf = df.sort_values(by='time', axis='index')
sdf.index = np.arange(len(sdf))
sdf

Unnamed: 0,time,plot,xxx
0,23.0,1,6.0
1,25.0,1,7.0
2,29.0,1,1.0
3,30.0,2,2.0
4,31.0,2,4.0
5,32.0,2,5.0
6,33.0,3,8.0
7,34.0,3,3.0
8,34.5,3,9.0
9,35.0,2,10.0


In [6]:
# Label runs of consecutive rows that share the same 'plot' value.
# `diffs` holds 1 on every row that opens a new run (the first row always
# does) and 0 elsewhere, so its running sum is a 1-based run counter.
# - The builtin `int` replaces the `np.int` alias, which was removed in
#   NumPy 1.24.
# - The increment is a 0/1 "value changed" flag rather than |difference|, so
#   ids stay consecutive even when 'plot' jumps by more than 1 between
#   neighbouring rows (e.g. 1 -> 3).
diffs = np.ones(sdf.shape[0], dtype=int)
diffs[1:] = np.diff(sdf['plot'].values) != 0
sdf['cluster_id'] = np.cumsum(diffs)
sdf

Unnamed: 0,time,plot,xxx,cluster_id
0,23.0,1,6.0,1
1,25.0,1,7.0,1
2,29.0,1,1.0,1
3,30.0,2,2.0,2
4,31.0,2,4.0,2
5,32.0,2,5.0,2
6,33.0,3,8.0,3
7,34.0,3,3.0,3
8,34.5,3,9.0,3
9,35.0,2,10.0,4


In [7]:
# Split the frame into one group per cluster id and dump the rows of each
# group, followed by some vertical spacing.
clusters = sdf.groupby('cluster_id')
for _cluster_id, cluster_rows in clusters:
    print(cluster_rows)
    print("\n\n")

   time  plot  xxx  cluster_id
0  23.0     1  6.0           1
1  25.0     1  7.0           1
2  29.0     1  1.0           1



   time  plot  xxx  cluster_id
3  30.0     2  2.0           2
4  31.0     2  4.0           2
5  32.0     2  5.0           2



   time  plot  xxx  cluster_id
6  33.0     3  8.0           3
7  34.0     3  3.0           3
8  34.5     3  9.0           3



    time  plot   xxx  cluster_id
9   35.0     2  10.0           4
10  36.0     2  11.0           4





In [8]:
# Resample every cluster's 'xxx' signal at the query points in `values` and
# record which plot each query point falls into.
values = np.array([22.0, 23.0, 24.5, 29.5, 30.0, 31.0, 32.5, 34.25, 34.5, 35.0, 36.0, 37.0])
# Builtin `int` instead of the `np.int` alias, which was removed in NumPy 1.24.
ndf = pd.DataFrame({'x': values,
                    'y': np.zeros(values.size, dtype=np.double),
                    'p': np.zeros(values.size, dtype=int)},
                   columns=['x', 'y', 'p'])
print(ndf)
print("---------------------------------------")
# Accumulate into private copies: `.values` can alias the frame's own storage
# (silently mutating `ndf` mid-loop) and is read-only when pandas
# copy-on-write is active, which would make the in-place updates below fail.
ndfy = ndf['y'].values.copy()
ndfp = ndf['p'].values.copy()
for cluster in clusters:
    rows = cluster[1]  # groupby yields (key, sub-frame) pairs
    plot_id = rows['plot'].unique()[0]
    times = rows['time'].values
    print("plot #: {}".format(plot_id))
    # Linear interpolation inside the cluster's time span; query points
    # outside the span evaluate to 0.0 and so add nothing to the sum.
    f = interpolate.interp1d(times,
                             rows['xxx'].values,
                             bounds_error=False,
                             fill_value=0.0)
    interps = f(values)
    ndfy += interps
    # Assign the plot id by time coverage, not by "interpolated value != 0":
    # a genuine interpolated value of exactly 0 must still count as covered.
    in_range = (values >= times.min()) & (values <= times.max())
    ndfp[in_range] = plot_id
    print(rows)
    print(np.column_stack((values, ndfy, ndfp)))
    print(ndfy)
    print("\n\n\n")
ndf['y'] = ndfy
ndf['p'] = ndfp
# Keep only the query points that were covered by some cluster.
ndf = ndf[ndf['p'] > 0]
print(ndf)

# nndf = ndf[ndf['y'].abs()]

        x    y  p
0   22.00  0.0  0
1   23.00  0.0  0
2   24.50  0.0  0
3   29.50  0.0  0
4   30.00  0.0  0
5   31.00  0.0  0
6   32.50  0.0  0
7   34.25  0.0  0
8   34.50  0.0  0
9   35.00  0.0  0
10  36.00  0.0  0
11  37.00  0.0  0
---------------------------------------
plot #: 1
   time  plot  xxx  cluster_id
0  23.0     1  6.0           1
1  25.0     1  7.0           1
2  29.0     1  1.0           1
[[ 22.     0.     0.  ]
 [ 23.     6.     1.  ]
 [ 24.5    6.75   1.  ]
 [ 29.5    0.     0.  ]
 [ 30.     0.     0.  ]
 [ 31.     0.     0.  ]
 [ 32.5    0.     0.  ]
 [ 34.25   0.     0.  ]
 [ 34.5    0.     0.  ]
 [ 35.     0.     0.  ]
 [ 36.     0.     0.  ]
 [ 37.     0.     0.  ]]
[ 0.    6.    6.75  0.    0.    0.    0.    0.    0.    0.    0.    0.  ]




plot #: 2
   time  plot  xxx  cluster_id
3  30.0     2  2.0           2
4  31.0     2  4.0           2
5  32.0     2  5.0           2
[[ 22.     0.     0.  ]
 [ 23.     6.     1.  ]
 [ 24.5    6.75   1.  ]
 [ 29.5    0.     0

In [9]:
# Show the column labels currently present on `sdf`.
sdf.columns

Index(['time', 'plot', 'xxx', 'cluster_id'], dtype='object')

In [10]:
# Show only the cluster id and signal columns, for every row.
sdf.loc[:, ['cluster_id', 'xxx']]

Unnamed: 0,cluster_id,xxx
0,1,6.0
1,1,7.0
2,1,1.0
3,2,2.0
4,2,4.0
5,2,5.0
6,3,8.0
7,3,3.0
8,3,9.0
9,4,10.0


In [11]:
# NOTE(review): presumably converts 10 h 14 min 56 s into seconds — confirm intent.
(10*60 + 14)*60 + 56

36896