In [45]:
import numpy as np
import pandas as pd
import json

from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import type_metric, distance_metric
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer


from sklearn.metrics.cluster import adjusted_rand_score

import matplotlib.pyplot as plt
from itertools import combinations

import pickle

In [46]:
def df2array(df, points_per_traj=500):
    """Load a long-format trajectory table into memory as an ndarray.

    Consecutive groups of ``points_per_traj`` rows of ``df`` are treated as one
    trajectory. Each output row is laid out as
    ``[x1, x2, ..., xn, y1, y2, ..., yn]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table providing ``'x'`` and ``'y'`` columns.
    points_per_traj : int, optional
        Number of consecutive rows that make up one trajectory (default 500).

    Returns
    -------
    numpy.ndarray
        2-D array with one row per trajectory, or an empty 1-D array when
        ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``points_per_traj`` is not positive, or if the trailing group is
        shorter than the others so the rows cannot be stacked.
    """
    if points_per_traj <= 0:
        raise ValueError("points_per_traj must be a positive integer")

    total = len(df)
    if total == 0:
        return np.array([])

    # Convert each column once and slice positionally, instead of re-slicing
    # the Series for every trajectory.
    xs = np.asarray(df['x'])
    ys = np.asarray(df['y'])

    # Collect the rows in a list and build the array once; appending to an
    # ndarray inside the loop copies the whole array on every iteration.
    rows = [
        np.append(xs[start:start + points_per_traj],
                  ys[start:start + points_per_traj])
        for start in range(0, total, points_per_traj)
    ]

    if len({len(row) for row in rows}) > 1:
        raise ValueError(
            "len(df) is not a multiple of points_per_traj; "
            "the last trajectory is incomplete"
        )
    return np.array(rows)

In [47]:
# Make sure the distance defined here is a valid metric: non-negativity,
# identity, symmetry and the triangle inequality.

def _mean_point_distance(a, b):
    """Mean Euclidean distance between corresponding points of two flattened
    trajectories laid out as ``[x1..xn, y1..yn]``."""
    half = int(len(a) / 2)
    delta_x = a[0:half] - b[0:half]
    delta_y = a[half:] - b[half:]
    return np.sum((delta_x**2 + delta_y**2) ** 0.5) / half


def dist(tr1,tr2):
    """Distance between two trajectories, or between two stacks of trajectories.

    Parameters
    ----------
    tr1, tr2 : array-like
        Either two 1-D flattened trajectories ``[x1..xn, y1..yn]``, or two 2-D
        stacks of such trajectories with one trajectory per row (e.g. the k
        cluster centres before and after an update).

    Returns
    -------
    float
        For 1-D inputs, the mean point-to-point Euclidean distance.
        For 2-D inputs, the sum over rows of that per-row distance.
    """
    first = np.asarray(tr1)
    second = np.asarray(tr2)

    # Dispatch on dimensionality rather than on a hard-coded length of 1000,
    # so trajectories of any length work and a stack that happens to contain
    # exactly 1000 rows is not mistaken for a single trajectory.
    if first.ndim == 1:
        return _mean_point_distance(first, second)

    # Stacked form, shape (k, 2n): accumulate how far each of the k cluster
    # centres moved between the two stacks.
    distance = 0
    for i in range(len(first)):
        distance = distance + _mean_point_distance(first[i], second[i])
    return distance

In [48]:
def cluster(data,K):
    """Run k-means on `data` with k-means++ seeding and the custom `dist` metric.

    Parameters
    ----------
    data : array-like
        One flattened trajectory per row.
    K : int
        Number of initial centres requested from the k-means++ initializer.

    Returns
    -------
    tuple
        ``(clusters, centers)`` exactly as reported by the fitted k-means
        instance. `sample()` treats each cluster as a collection of row
        indices into `data`.
    """
    seeds = kmeans_plusplus_initializer(data, K).initialize()
    trajectory_metric = distance_metric(type_metric.USER_DEFINED, func=dist)

    model = kmeans(data, seeds, metric=trajectory_metric)
    model.process()

    return model.get_clusters(), model.get_centers()

In [49]:
def sample(data,K,alpha,nodeNum,points_per_traj=500):
    """Choose the person_ids of the trajectories to sample.

    The trajectories in `data` are clustered, then a fraction `alpha` of the
    members of every cluster is drawn uniformly at random without replacement.

    Parameters
    ----------
    data : array-like
        One flattened trajectory per row, in the same order as the source
        DataFrame selected by `nodeNum`.
    K : int
        Number of clusters requested from `cluster()`.
    alpha : float
        Fraction of each cluster to keep; the per-cluster count is
        ``int(len(cluster) * alpha)``.
    nodeNum : int
        Selects the module-level source frame: 1 -> ``df1``, 2 -> ``df2``,
        anything else -> ``df3``.
    points_per_traj : int, optional
        Rows of the source frame per trajectory (default 500). Used to map a
        trajectory index back to the first row of that trajectory.

    Returns
    -------
    numpy.ndarray
        1-D array of sampled person_ids (empty if nothing was drawn).
    """
    if nodeNum == 1:
        DF = df1
    elif nodeNum == 2:
        DF = df2
    else:
        DF = df3

    clusters, _ = cluster(data, K)
    person_ids = DF['person_id']

    picks = []
    # Iterate over the clusters actually returned instead of range(K): the
    # clustering backend may report fewer than K clusters (e.g. when a cluster
    # ends up empty), and indexing clusters[i] would then raise IndexError.
    for members in clusters:
        if len(members) == 0:
            continue
        # Trajectory t starts at row t * points_per_traj of the source frame;
        # that row's person_id identifies the whole trajectory.
        ids = [person_ids[idx * points_per_traj] for idx in members]
        num = int(len(ids) * alpha)
        uniform = [1/len(ids)] * len(ids)
        picks.append(np.random.choice(ids, num, p = uniform, replace=False))

    if not picks:
        return np.array([])
    return np.concatenate(picks)

In [50]:
def draw(data):
    """Plot every trajectory in `data` on a single figure and show it.

    Each row is read as ``[x1..x500, y1..y500]``: the first 500 values are the
    x coordinates and the remainder the y coordinates.
    """
    for trajectory in data:
        xs, ys = trajectory[:500], trajectory[500:]
        plt.plot(xs, ys, linewidth = 0.2)
    plt.show()

In [51]:
def genSampleData(data,k,nodeNum):
    """Write one sampled CSV per sampling ratio for the given node.

    For each ratio in 0.1, 0.3, 0.5, 0.7, 0.9 the trajectories are re-clustered
    and sampled via `sample()`, and all rows of the sampled persons are written
    to ``data/node<nodeNum>K<k>a<ratio tag>.csv`` (ratio tags ``01`` .. ``09``).

    Parameters
    ----------
    data : array-like
        One flattened trajectory per row, as expected by `sample()`.
    k : int
        Number of clusters passed to `sample()`.
    nodeNum : int
        Selects the module-level source frame: 1 -> ``df1``, 2 -> ``df2``,
        anything else -> ``df3``.
    """
    alpha = [0.1,0.3,0.5,0.7,0.9]
    alpha_str = ['01','03','05','07','09']
    if nodeNum == 1:
        DF = df1
    elif nodeNum == 2:
        DF = df2
    else:
        DF = df3

    for ratio, tag in zip(alpha, alpha_str):
        sample_id = sample(data,k,ratio,nodeNum)

        # Gather the per-person slices first and concatenate once; calling
        # pd.concat inside the loop re-copies the accumulated frame each time.
        frames = [DF[DF['person_id'] == pid] for pid in sample_id]
        if frames:
            df = pd.concat(frames)
        else:
            # Nothing sampled for this ratio: write a header-only file rather
            # than reusing the frame left over from the previous ratio.
            df = DF.iloc[0:0]

        file_name = 'data/node'+str(nodeNum)+'K'+str(k)+'a'+tag+'.csv'
        # to_csv opens and closes the path itself; no separate open() handle
        # is needed (the previous one was never written to).
        df.to_csv(file_name,index = False)

In [8]:
# Load node 1's trajectory table. The name df1 matters: sample() and
# genSampleData() read it as a global when nodeNum == 1.
# NOTE(review): the execution counts show this cell ran before the function
# definitions above were last executed — confirm the notebook survives
# Restart & Run All.
df1 = pd.read_csv('./data/node1.csv')

In [9]:
# Flatten node 1's table into one row per trajectory ([x..., y...]).
data1 = df2array(df1)

In [40]:
# Cluster node 1's trajectories with K=5 and write the sampled subsets for
# every sampling ratio to data/node1K5a{01,03,05,07,09}.csv.
genSampleData(data1,5,1)

In [41]:
# Load node 2's trajectory table and flatten it into one row per trajectory.
# The name df2 matters: sample() and genSampleData() read it as a global when
# nodeNum == 2.
df2 = pd.read_csv('./data/node2.csv')
DATA2 = df2array(df2)

In [42]:
# Cluster node 2's trajectories with K=5 and write the sampled subsets for
# every sampling ratio to data/node2K5a{01,03,05,07,09}.csv.
genSampleData(DATA2,5,2)

In [43]:
# Load node 3's trajectory table and flatten it into one row per trajectory.
# The name df3 matters: sample() and genSampleData() fall back to it as a
# global for any nodeNum other than 1 or 2.
df3 = pd.read_csv('./data/node3.csv')
DATA3 = df2array(df3)

In [44]:
# Cluster node 3's trajectories with K=5 and write the sampled subsets for
# every sampling ratio to data/node3K5a{01,03,05,07,09}.csv.
genSampleData(DATA3,5,3)