In [1]:
import pandas as pd
import numpy as np
import os


# input csv

In [3]:
# Build the path to sat.csv under ~/station2grid/datasets/csv, load it,
# then report the frame's dimensions and preview its leading rows.
csv_path=os.path.join(
    os.path.expanduser("~"),
    'station2grid',
    'datasets',
    'csv',
    'sat.csv',
)
df=pd.read_csv(filepath_or_buffer=csv_path)
print(df.shape)
df.head()

(2160819, 6)


Unnamed: 0,dt,lat,lon,pm25,pm10,temperature
0,2015-01-01 11:00:00,25.3,120.0,67.607,99.278,12.315
1,2015-01-01 11:00:00,25.3,120.125,62.508,89.554,12.166
2,2015-01-01 11:00:00,25.3,120.25,57.408,79.822,12.016
3,2015-01-01 11:00:00,25.3,120.375,53.502,73.968,11.95
4,2015-01-01 11:00:00,25.3,120.5,51.96,75.797,12.051


In [None]:
# Display the resolved path of the input CSV.
csv_path

In [None]:
# The domain is the CSV file name up to its first dot (e.g. 'sat' for sat.csv).
# os.path.basename is used instead of splitting on '/' because csv_path is built
# with os.path.join, whose separator depends on the platform.
domain=os.path.basename(csv_path).split('.')[0]
domain

In [None]:
# Collect the distinct values of the dt column.
dts=df['dt'].unique()

In [None]:
# Take the rows of a single timestamp (the tenth from the end of dts) to prototype on.
one_dt=df.loc[df['dt']==dts[-10]]
one_dt.head()

# domain features

In [None]:
# Every column from position 3 onward is treated as a feature of this domain.
features=list(df.columns[3:])
features

In [None]:
# Persist the feature names of this domain as a one-column table.
# The target matches Csv2npy.saveFeatures (home-anchored directory, '-features.csv'
# file name), so the output no longer depends on the notebook's working directory
# and both code paths write the same file.
path=os.path.join(os.path.expanduser("~"),'station2grid','datasets','info','%s-features.csv'%(domain))
dummy=pd.DataFrame({'feature':features})
dummy.to_csv(path)

# sources2grid by KNN, one dt

In [5]:
# Add ~/station2grid to the import search path before importing CustomKNN from `tools`
# (presumably the `tools` module lives in that directory - confirm).
import sys 
sys.path.append(os.path.join(os.path.expanduser("~"),'station2grid'))
from tools import CustomKNN


In [None]:
# Instantiate the project's CustomKNN helper; later cells call its get_knn_grid method.
customKNN=CustomKNN()

In [None]:
# KNN inputs are the source coordinates; targets are all columns from position 3 onward.
x_train=one_dt.loc[:,['lat','lon']]
y_train=one_dt.iloc[:,3:]
x_train.shape,y_train.shape

In [None]:
# KNN settings handed to get_knn_grid: neighbour count and weighting scheme.
k=3
weightKNN='distance'
knn_grid=customKNN.get_knn_grid(k,weightKNN,x_train,y_train)
knn_grid.shape

In [None]:
# Keep only the first entry along the last axis; slicing (not indexing) retains that axis.
# Named for pm25, which is the first feature column of the source frame.
grid_pm25=knn_grid[...,0:1]
grid_pm25.shape

# grid2station, one dt

In [None]:
# Load the EPA station table from ~/station2grid/datasets/info and preview two rows.
epaStationInfo=pd.read_csv(
    os.path.join(os.path.expanduser("~"),'station2grid','datasets','info','epa-station-info.csv')
)
epaStationInfo.head(n=2)

In [None]:
# Sample the grid at each station's (row, col) pair, keeping the first and last axes whole.
station=knn_grid[:,epaStationInfo['row'],epaStationInfo['col'],:]
station.shape

# grid2station, one month

In [None]:
# Tag identifying this configuration (domain, k, weighting scheme);
# it is used below as a directory name for the generated .npy files.
single='domain_%s-k_%s-weightKNN_%s'%(domain,k,weightKNN)
single

In [None]:
# Output folders for this configuration: one for grids, one for station samples.
grid_dir,station_dir=(
    os.path.join(os.path.expanduser("~"),'station2grid','datasets','npy',domain,single,leaf)
    for leaf in ('grid','station')
)

grid_dir,station_dir


In [None]:
# Create both output folders; exist_ok=True makes the cell safe to re-run.
os.makedirs(grid_dir,exist_ok=True)
os.makedirs(station_dir,exist_ok=True)


In [None]:
# Prototype of the batch conversion: only the first two timestamps are processed here.
# k and weightKNN are deliberately not reassigned inside the loop: the output
# directories were named from the values set earlier, so reusing those values keeps
# the saved files consistent with the folder they are written to.
for dt in dts[:2]:
    one_dt=df[df.dt==dt]

    x_train=one_dt[['lat','lon']]
    y_train=one_dt.iloc[:,3:]

    knn_grid=customKNN.get_knn_grid(k,weightKNN,x_train,y_train)

    grid_pm25=knn_grid[...,:1]
    station=knn_grid[:,epaStationInfo.row,epaStationInfo.col,:]

    # File stem: first 19 characters of the timestamp text plus a suffix
    # (np.save appends the '.npy' extension).
    dt_str=str(dt)[:19]
    np.save(os.path.join(grid_dir,dt_str+'_grid'),grid_pm25)
    np.save(os.path.join(station_dir,dt_str+'_station'),station)


# script

In [22]:
class Csv2npy:
    """Convert a source CSV into per-timestamp grid and station ``.npy`` files.

    For each distinct value of the ``dt`` column, the rows of that timestamp are
    passed to ``CustomKNN.get_knn_grid``. The first channel of the resulting grid
    and the grid values at the EPA station cells are saved with ``np.save`` under
    ``~/station2grid/datasets/npy/<domain>/<single>/{grid,station}``.

    The CSV must provide ``dt``, ``lat`` and ``lon`` columns; every column from
    position 3 onward is treated as a feature.
    """

    def __init__(self,csv_path,k,weightKNN,threshold):
        """Store the run configuration and load the EPA station table.

        Args:
            csv_path: path of the source CSV; its file name (up to the first dot)
                becomes the domain name.
            k: neighbour count forwarded to ``CustomKNN.get_knn_grid``.
            weightKNN: weighting scheme forwarded to ``CustomKNN.get_knn_grid``.
            threshold: minimum number of rows a timestamp needs to be converted.
        """
        self.home=os.path.expanduser("~")
        self.epaStationInfo=pd.read_csv(os.path.join(self.home,'station2grid','datasets','info','epa-station-info.csv'))
        self.csv_path=csv_path
        self.domain=self._domain_from_path(csv_path)
        self.single='domain_%s-k_%s-weightKNN_%s'%(self.domain,k,weightKNN)
        self.k=k
        self.weightKNN=weightKNN
        self.customKNN=CustomKNN()
        self.threshold=threshold

    @staticmethod
    def _domain_from_path(csv_path):
        """Return the CSV file name up to its first dot (``'../csv/air.csv'`` -> ``'air'``)."""
        # basename honours the platform's path separators, unlike a literal split('/'),
        # which matters because callers build csv_path with os.path.join.
        return os.path.basename(csv_path).split('.')[0]

    def saveFeatures(self,features):
        """Write the feature names to ``<domain>-features.csv`` in the info directory.

        Args:
            features: list of feature column names.
        """
        path=os.path.join(self.home,'station2grid','datasets','info','%s-features.csv'%(self.domain))
        dummy=pd.DataFrame({'feature':features})
        dummy.to_csv(path)

    def get_grid_station(self,one_dt,k,weightKNN):
        """Build the KNN grid for one timestamp and sample it at the station cells.

        Args:
            one_dt: rows of a single timestamp; needs ``lat``/``lon`` columns and
                feature columns from position 3 onward.
            k: neighbour count forwarded to ``get_knn_grid``.
            weightKNN: weighting scheme forwarded to ``get_knn_grid``.

        Returns:
            ``(grid_pm25, station)``: the first channel of the grid (last axis kept
            with length 1) and the grid values at each station's ``row``/``col``.
        """
        x_train=one_dt[['lat','lon']]
        y_train=one_dt.iloc[:,3:]
        # NOTE(review): the indexing below assumes the grid has four axes with the
        # feature axis last - confirm against CustomKNN.get_knn_grid.
        knn_grid=self.customKNN.get_knn_grid(k,weightKNN,x_train,y_train)

        grid_pm25=knn_grid[...,:1]
        station=knn_grid[:,self.epaStationInfo.row,self.epaStationInfo.col,:]
        return grid_pm25,station

    def alldt2npy(self,max_dts=20):
        """Convert the CSV's timestamps to ``.npy`` files, skipping sparse ones.

        Also writes the feature list via ``saveFeatures`` and creates the output
        directories. Each visited timestamp is printed with its row count,
        followed by ``pass`` when it has fewer rows than ``self.threshold``.

        Args:
            max_dts: how many leading timestamps to visit. Defaults to 20, which
                was previously a hard-coded limit; pass ``None`` to process every
                timestamp in the CSV.
        """
        home=self.home
        domain=self.domain
        single=self.single

        df=pd.read_csv(self.csv_path)

        # save features for single domain
        self.saveFeatures(df.columns[3:].tolist())

        # make dir for (grid,station)
        grid_dir=os.path.join(home,'station2grid','datasets','npy',domain,single,'grid')
        station_dir=os.path.join(home,'station2grid','datasets','npy',domain,single,'station')
        os.makedirs(grid_dir,exist_ok=True)
        os.makedirs(station_dir,exist_ok=True)

        dts=df.dt.unique()
        selected=dts if max_dts is None else dts[:max_dts]
        for dt in selected:
            one_dt=df[df.dt==dt]

            # if number of sources < threshold, pass
            print(dt,len(one_dt))
            if len(one_dt)<self.threshold:
                print('pass')
                continue

            grid_pm25,station=self.get_grid_station(one_dt,self.k,self.weightKNN)
            # save npy (grid,station); np.save appends the '.npy' extension
            dt_str=str(dt)[:19]
            np.save(os.path.join(grid_dir,dt_str+'_grid'),grid_pm25)
            np.save(os.path.join(station_dir,dt_str+'_station'),station)


In [23]:
# Configuration for the run below: source CSV (relative to the working directory),
# neighbour count, weighting scheme, and minimum row count per timestamp.
csv_path=os.path.join('..','csv','air.csv')
k=3
weightKNN='distance'
threshold=308

In [24]:
# Build the converter from the configuration above and write the .npy files.
csv2npy=Csv2npy(csv_path=csv_path,k=k,weightKNN=weightKNN,threshold=threshold)
csv2npy.alldt2npy()

2017-01-01 08:00:00 307
pass
2017-01-01 08:30:00 312
2017-01-01 09:00:00 312
2017-01-01 09:30:00 310
2017-01-01 10:00:00 313
2017-01-01 10:30:00 311
2017-01-01 11:00:00 312
2017-01-01 11:30:00 313
2017-01-01 12:00:00 311
2017-01-01 12:30:00 308
2017-01-01 13:00:00 309
2017-01-01 13:30:00 311
2017-01-01 14:00:00 310
2017-01-01 14:30:00 309
2017-01-01 15:00:00 309
2017-01-01 15:30:00 307
pass
2017-01-01 16:00:00 309
2017-01-01 16:30:00 309
2017-01-01 17:00:00 311
2017-01-01 17:30:00 312
