In [13]:
# Dataset 은 샘플과 정답(label)을 저장하고, 
# DataLoader 는 Dataset 을 샘플에 쉽게 접근할 수 있도록 순회 가능한 객체(iterable)로 감쌉니다.
# https://wikidocs.net/156998

import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self):
    # 생성자, 데이터를 전처리하는 부분
        self.x_data = [[73, 80, 75],
                       [93, 99, 95]]
        self.y_data = [[152], 
                       [185]]
    
    def __len__(self):
    # 데이터셋의 총 길이를 반환하는 부분
        return len(self.x_data)
    
    def __getitem__(self, idx):
    # idx 에 해당하는 입출력 데이터를 반환한다.
        x = torch.FloatTensor(self.x_data[idx])
        y = torch.FloatTensor(self.y_data[idx])
        return x, y
    
customData = CustomDataset()
customData.__getitem__(1)

(tensor([93., 99., 95.]), tensor([185.]))

In [25]:
import os
import pandas as pd

data_dir = 'Data/'
csv_dir = 'csv/'
csv_extension = '.csv'
user_path_list = os.listdir(data_dir)


csv_path = os.path.join(data_dir, user_path_list[0], csv_dir)
user_file = csv_path + user_path_list[0] + '.csv'
df = pd.read_csv(user_file)
df.head()



Unnamed: 0,latitude,longitude,what,altitude,days,date,time
0,39.984702,116.318417,0,492,39744.120185,2008-10-23,02:53:04
1,39.984683,116.31845,0,492,39744.120255,2008-10-23,02:53:10
2,39.984686,116.318417,0,492,39744.120313,2008-10-23,02:53:15
3,39.984688,116.318385,0,492,39744.12037,2008-10-23,02:53:20
4,39.984655,116.318263,0,492,39744.120428,2008-10-23,02:53:25


In [130]:
import os
import pandas as pd
import numpy as np
import random
from torch.utils.data import Dataset

# Data folder 중 숫자가 안되는 User folder는 삭제하고
# 남은 User data에서 train-test 폴더로 나눈 후
# train_set(dataset), test_set(dataset) 으로 진행 필요

class GeoLifeDataSet(Dataset):
    def __init__(self, data_dir, user_list, samples_s, samples_q, length, y_timestep):
        self.data_dir   = data_dir
        self.csv_dir    = 'csv/'
        self.user_list  = user_list
        # user_list: all user
        self.samples_s  = samples_s
        # samples_s: the number of support set
        self.samples_q  = samples_q
        # samples_q: the number of query set
        self.length     = length 
        # length: the length of mini batch of a user
        self.y_timestep = y_timestep
        # y_time_step: the next time step to be predicted
        #              it must be less than length
    
    def sampleTime(self, dataset):
        cur_ds = dataset.copy()
        minibatch = []
        
        max_len = len(cur_ds)
        ###############################################
        # MAke sure samples from query and support 
        # do not intersect
        ##############################################
        # total_data_slice -> lenght 만큼 나눴을 때 총 slice 갯수
        total_data_slice = list(range(int(max_len/self.length)))
        total_samps = self.samples_q + self.samples_s
        
        slice_point = int(len(total_data_slice)*(self.samples_s/total_samps))
        # print(f"slice_point: {slice_point}")

        s_s_list = total_data_slice[:slice_point]
        q_s_list = total_data_slice[slice_point:]

        replace = False
        if total_samps > len(total_data_slice):
            replace = True

        s_s_list = np.random.choice(s_s_list, size=self.samples_s, replace=replace)
        q_s_list = np.random.choice(q_s_list, size=self.samples_q, replace=replace)
        
        # print(f"s_list:{s_s_list}")
        # print(f"q_list:{q_s_list}")
        choice_list = np.concatenate([s_s_list, q_s_list])
        # #################################################
        # print(f"choice_list: {choice_list}")
        
        for idx in choice_list:
            start_idx = idx * self.length
            if max_len - self.length >= 0:
                cur_sample = cur_ds.iloc[start_idx:(start_idx + self.length), :]
                minibatch.append(cur_sample)
            else:
                fill_quota  = np.abs(self.length - max_len)
                zeros_r     = np.zeros([fill_quota, cur_ds.shape[1]])
                cur_sample  = cur_ds[:, :]
                cur_sample  = np.concatenate([zeros_r, cur_sample], axis = 0)
                minibatch.append(cur_sample)
        return np.array(minibatch)
        
    def __getitem__(self, index):
        csv_path = os.path.join(self.data_dir, self.user_list[index], self.csv_dir)
        user_file = csv_path + self.user_list[index] + '.csv'
        df = pd.read_csv(user_file)
        df = df[['days','latitude', 'longitude']]

        samples = self.sampleTime(df)
        # print(f"mini_batch: {samples.shape}")
        # mini_batch: (5, 10, 3)
        
        sup_x = np.array(samples[:self.samples_s, :-self.y_timestep, :])
        sup_y = np.array(samples[:self.samples_s, -self.y_timestep:, -2:])
        que_x = np.array(samples[self.samples_s:, :-self.y_timestep, :])
        que_y = np.array(samples[self.samples_s:, -self.y_timestep:, -2:])

        return (que_x, sup_x, sup_y), que_y
    
    def __len__(self):
        # batch를 구성할 수 있는 총 수
        # 이 수에서 batch를 조정할 수 있다.
        # 몇 명의 user 로 나눠서 할 지
        return len(self.user_list)

user_list = os.listdir(data_dir)
random.shuffle(user_list)
train_size = 0.1
train_list = user_list[:(int)(len(user_list)*train_size)]
print(f"train_list: {len(train_list)}")

# dataset = GeoLifeDataSet("Data/", [0, 1, 2, 3], 5, 2, 100, 10)
# dataset.__getitem__(0)

train_list: 18


In [145]:
from torch.utils.data import DataLoader

data_dir = "Data/"#"data/geolife/Data/"
sample_s = 5
sample_q = 3
length = 100
y_timestep = 10

user_list = os.listdir(data_dir)
random.shuffle(user_list)
train_size = 0.1
train_list = user_list[:(int)(len(user_list)*train_size)]
test_list  = user_list[(int)(len(user_list)*train_size):]
print(f"train_list: {len(train_list)}")

training_data = GeoLifeDataSet(data_dir, train_list, sample_s, sample_q, length, y_timestep)
test_data = GeoLifeDataSet(data_dir, train_list, sample_s, sample_q, length, y_timestep)

train_dataloader = DataLoader(training_data, batch_size=1, shuffle=False)
test_dataloader  = DataLoader(test_data, batch_size=1, shuffle=False)

train_x, train_y = next(iter(train_dataloader))
print(f"support_x: {train_x[0].shape}")
print(f"support_y: {train_x[1].shape}")
print(f"query_x: {train_x[2].shape}")
print(f"query_y: {train_y.shape}")

train_list: 18
support_x: torch.Size([1, 3, 90, 3])
support_y: torch.Size([1, 5, 90, 3])
query_x: torch.Size([1, 5, 10, 2])
query_y: torch.Size([1, 3, 10, 2])


NameError: name 'shap' is not defined

In [4]:
shape = [1, 2, 3, 4, 5]
shape[:-2] + [-1] + [2]

[1, 2, 3, -1, 2]

In [None]:
        # input is TASKS x SAMPLES x FEATURES x TIME x Latent vector
        shape = torch._shape_as_tensor(inp)
        # (3, 20, 6, 100, 1)
        x = torch.reshape(inp, [-1, shape[-2], shape[-1]])
        # (300, 100, 1)
        x, f = self.gru(x)
        # x:(300, 100, 32)
        # f:(3, 100, 32)
        
        if self.final:
            new_shape = shape[:-2].tolist() + [-1]
            out = torch.reshape(f, new_shape)
        else:
            new_shape = shape[:-1].tolist() + [-1]
            # (3, 20, 6, 100, -1)
            out = torch.reshape(x, new_shape)
            # (3, 20, 6, 100, 32)
        return out

In [None]:
# DataLoader
from torch.utils.data import DataLoader

dataloader = DataLoader(
    dataset,
    batch_size = 2,
    shuffle = True,
)

In [6]:
import pandas as pd
import numpy as np
import torch

from sklearn.cluster import KMeans

df = pd.read_csv('Data/000/csv/000.csv')
df.head(1)

Unnamed: 0,latitude,longitude,what,altitude,days,date,time
0,39.984702,116.318417,0,492,39744.120185,2008-10-23,02:53:04


In [4]:
import pandas as pd
import numpy as np
import torch

from sklearn.cluster import KMeans

df = pd.read_csv('Data/001/csv/001.csv')
df.head(5)

Unnamed: 0,latitude,longitude,what,altitude,days,date,time
0,39.984094,116.319236,0,492,39744.245197,2008-10-23,05:53:05
1,39.984198,116.319322,0,492,39744.245208,2008-10-23,05:53:06
2,39.984224,116.319402,0,492,39744.245266,2008-10-23,05:53:11
3,39.984211,116.319389,0,492,39744.245324,2008-10-23,05:53:16
4,39.984217,116.319422,0,491,39744.245382,2008-10-23,05:53:21


In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn.cluster import KMeans

df = pd.read_csv('Data/000/csv/000.csv')
df.head(1)
df_temp = df[['latitude', 'longitude']].copy()

model = KMeans(n_clusters=100, random_state=123)
model.fit(df_temp)


  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
df_temp['label'] = model.labels_
df_temp['label'].value_counts()

31    23021
12    22060
72    14927
58    14561
66    11336
      ...  
99      111
29      101
43       94
91       92
96       78
Name: label, Length: 100, dtype: int64