In [1]:
import pandas as pd
import numpy as np
import os


# input csv

In [2]:
# Locate the input CSV under ~/station2grid/datasets/csv and load it.
home_dir=os.path.expanduser("~")
csv_path=os.path.join(home_dir,'station2grid','datasets','csv','air.csv')
df=pd.read_csv(csv_path)
df.head()

Unnamed: 0,dt,lat,lon,pm25,pm10,temperature,humidity
0,2017-01-01 08:00:00,-37.888,144.998,5.0,6.0,24.37,66.0
1,2017-01-01 08:00:00,1.28,103.817,21.0,0.0,31.25,100.0
2,2017-01-01 08:00:00,1.301,103.896,14.0,15.0,30.62,79.0
3,2017-01-01 08:00:00,1.306,103.92,19.0,0.0,30.81,91.0
4,2017-01-01 08:00:00,1.323,103.869,16.0,0.0,31.75,86.0


In [3]:
# Display the resolved path of the input CSV (home directory expanded).
csv_path

'/media/disk3/feynman52/station2grid/datasets/csv/air.csv'

In [4]:
# Domain name = CSV file stem (text before the first '.').
# os.path.basename is used instead of splitting on '/' so the stem is still
# found on platforms whose path separator is not '/'.
domain=os.path.basename(csv_path).split('.')[0]
domain

'air'

In [5]:
# Distinct timestamps, in order of first appearance in the CSV.
dts=df['dt'].unique()

In [6]:
# Take the 10th-from-last timestamp and keep only the rows recorded at it.
sample_dt=dts[-10]
one_dt=df.loc[df['dt']==sample_dt]
one_dt.head()

Unnamed: 0,dt,lat,lon,pm25,pm10,temperature,humidity
82969,2018-01-02 20:00:00,-6.888,107.61,50.5,0.0,24.185,100.0
82970,2018-01-02 20:00:00,-6.864,107.584,22.0,25.0,27.75,73.0
82971,2018-01-02 20:00:00,1.282,103.815,10.0,11.0,29.0,100.0
82972,2018-01-02 20:00:00,1.322,103.869,14.0,16.0,31.0,85.0
82973,2018-01-02 20:00:00,1.351,103.945,10.0,11.0,31.0,82.0


# domain features

In [7]:
# Every column after the first three is treated as a feature.
features=list(df.columns[3:])
features

['pm25', 'pm10', 'temperature', 'humidity']

In [8]:
# Save the feature list to the same location and file name that
# Csv2npy.saveFeatures uses (~/station2grid/datasets/info/<domain>-features.csv),
# instead of a cwd-relative '../info/<domain>-feature.csv', so the exploratory
# cell and the script do not leave two differently named copies.
info_dir=os.path.join(os.path.expanduser("~"),'station2grid','datasets','info')
os.makedirs(info_dir,exist_ok=True)
path=os.path.join(info_dir,'%s-features.csv'%(domain))
dummy=pd.DataFrame({'feature':features})
dummy.to_csv(path)

# sources2grid by KNN, one dt

In [9]:
import sys 
sys.path.append(os.path.join(os.path.expanduser("~"),'station2grid'))
from tools import CustomKNN


In [10]:
# Instantiate the project-local helper imported from `tools`; its
# get_knn_grid method is what the cells below call.
customKNN=CustomKNN()

In [11]:
# KNN inputs for this timestamp: coordinates as predictors, feature columns as targets.
coord_cols=['lat','lon']
x_train=one_dt[coord_cols]
y_train=one_dt.iloc[:,3:]
(x_train.shape,y_train.shape)

((575, 2), (575, 4))

In [12]:
# KNN settings handed to get_knn_grid together with the training data.
k=3
weights='distance'
knn_grid=customKNN.get_knn_grid(k,weights,x_train,y_train)
knn_grid.shape

(1, 348, 204, 4)

In [13]:
# Slice (rather than index) the last axis so the single channel keeps its own dimension.
grid_pm25=knn_grid[...,0:1]
grid_pm25.shape

(1, 348, 204, 1)

# grid2station, one dt

In [14]:
# Load the EPA station table; its row/col columns are used below to index the grid.
epa_info_path=os.path.join(os.path.expanduser("~"),'station2grid','datasets','info','epa-station-info.csv')
epaStationInfo=pd.read_csv(epa_info_path)
epaStationInfo.head(2)

Unnamed: 0,SiteName,SiteEngName,AreaName,County,Township,SiteAddress,lon,lat,SiteType,lat_round,lon_round,row,col
0,陽明,Yangming,北部空品區,臺北市,北投區,臺北市北投區竹子湖路111號,121.529583,25.182722,公園測站,25.18,121.53,16,153
1,萬里,Wanli,北部空品區,新北市,萬里區,新北市萬里區瑪鋉路221號,121.689881,25.179667,一般測站,25.18,121.69,16,169


In [15]:
# Pair each station's row with its col to pull one grid cell (all channels) per station.
station_rows=epaStationInfo['row']
station_cols=epaStationInfo['col']
station=knn_grid[:,station_rows,station_cols,:]
station.shape

(1, 73, 4)

# grid2station, one month (currently only the first 2 dts are processed)

In [16]:
# Tag built from the domain and the KNN settings; used as a folder name below.
name_parts=(domain,k,weights)
single='domain_%s-k_%s-weights_%s'%name_parts
single

'domain_air-k_3-weights_distance'

In [17]:
# Grid and station arrays are written to sibling folders under one parent directory.
npy_root=os.path.join(os.path.expanduser("~"),'station2grid','datasets','npy',domain,single)
grid_dir=os.path.join(npy_root,'grid')
station_dir=os.path.join(npy_root,'station')

(grid_dir,station_dir)


('/media/disk3/feynman52/station2grid/datasets/npy/air/domain_air-k_3-weights_distance/grid',
 '/media/disk3/feynman52/station2grid/datasets/npy/air/domain_air-k_3-weights_distance/station')

In [18]:
# Create both output folders; existing folders are left as they are.
for out_dir in (grid_dir,station_dir):
    os.makedirs(out_dir,exist_ok=True)


In [19]:
# Interpolate and save the first two timestamps only.
# k and weights are intentionally NOT re-assigned here: grid_dir/station_dir
# were derived from `single`, which already encodes them, so reusing the same
# variables keeps the folder name in sync with the settings actually used.
for dt in dts[:2]:
    one_dt=df[df.dt==dt]

    x_train=one_dt[['lat','lon']]
    y_train=one_dt.iloc[:,3:]

    knn_grid=customKNN.get_knn_grid(k,weights,x_train,y_train)

    # grid keeps only the first feature channel; stations keep every channel
    grid_pm25=knn_grid[...,:1]
    station=knn_grid[:,epaStationInfo.row,epaStationInfo.col,:]

    # the first 19 characters of the timestamp form the file stem
    dt_str=str(dt)[:19]
    np.save(os.path.join(grid_dir,dt_str+'_grid'),grid_pm25)
    np.save(os.path.join(station_dir,dt_str+'_station'),station)


# script

In [32]:
class Csv2npy:
    """Turn a per-timestamp source CSV into KNN-interpolated .npy files.

    For each distinct value of the CSV's ``dt`` column, the rows are passed to
    ``CustomKNN.get_knn_grid``. Two arrays are then saved under
    ``~/station2grid/datasets/npy/<domain>/<single>/``: the first feature
    channel of the grid (``grid/``) and the grid values at the EPA station
    cells (``station/``).

    Parameters
    ----------
    csv_path : str
        Path of the input CSV. Its file stem (text before the first '.')
        becomes the domain name.
    k, weights
        Forwarded unchanged to ``CustomKNN.get_knn_grid``.
    threshold : int
        Timestamps with fewer rows than this are skipped.
    """

    def __init__(self,csv_path,k,weights,threshold):
        self.home=os.path.expanduser("~")
        self.epaStationInfo=pd.read_csv(os.path.join(self.home,'station2grid','datasets','info','epa-station-info.csv'))
        self.csv_path=csv_path
        # basename (not split('/')) so the stem is found with any path separator
        self.domain=os.path.basename(csv_path).split('.')[0]
        self.single='domain_%s-k_%s-weights_%s'%(self.domain,k,weights)
        self.k=k
        self.weights=weights
        self.customKNN=CustomKNN()
        self.threshold=threshold

    def saveFeatures(self,features):
        """Write ``features`` as a one-column CSV to the info folder.

        The file is ``~/station2grid/datasets/info/<domain>-features.csv``;
        the folder is created if it does not exist yet.
        """
        info_dir=os.path.join(self.home,'station2grid','datasets','info')
        os.makedirs(info_dir,exist_ok=True)
        path=os.path.join(info_dir,'%s-features.csv'%(self.domain))
        dummy=pd.DataFrame({'feature':features})
        dummy.to_csv(path)

    def get_grid_station(self,one_dt,k,weights):
        """Interpolate one timestamp's rows and return ``(grid_pm25, station)``.

        ``one_dt`` must have ``lat``/``lon`` columns; every column from the
        fourth onward is used as a target. ``grid_pm25`` is the first channel
        of the KNN grid (channel axis kept); ``station`` holds all channels at
        the (row, col) cells listed in the EPA station table.
        """
        x_train=one_dt[['lat','lon']]
        y_train=one_dt.iloc[:,3:]
        knn_grid=self.customKNN.get_knn_grid(k,weights,x_train,y_train)

        grid_pm25=knn_grid[...,:1]
        # .values hands plain integer arrays to numpy's paired (row, col) indexing
        rows=self.epaStationInfo.row.values
        cols=self.epaStationInfo.col.values
        station=knn_grid[:,rows,cols,:]
        return grid_pm25,station

    def alldt2npy(self,max_dts=5):
        """Process the CSV timestamp by timestamp and save grid/station arrays.

        Parameters
        ----------
        max_dts : int or None, default 5
            Only the first ``max_dts`` distinct timestamps are considered
            (5 reproduces the previous hard-coded limit); ``None`` processes
            every timestamp.
        """
        df=pd.read_csv(self.csv_path)

        # save features for single domain
        self.saveFeatures(df.columns[3:].tolist())

        # make dir for (grid,station)
        out_root=os.path.join(self.home,'station2grid','datasets','npy',self.domain,self.single)
        grid_dir=os.path.join(out_root,'grid')
        station_dir=os.path.join(out_root,'station')
        os.makedirs(grid_dir,exist_ok=True)
        os.makedirs(station_dir,exist_ok=True)

        for dt in df.dt.unique()[:max_dts]:
            one_dt=df[df.dt==dt]

            # if number of sources < threshold, pass
            print(dt,len(one_dt))
            if len(one_dt)<self.threshold:
                print('pass')
                continue

            # use the settings given to the constructor, not notebook globals,
            # so the arrays always match the folder name stored in self.single
            grid_pm25,station=self.get_grid_station(one_dt,self.k,self.weights)

            # save npy (grid,station)
            dt_str=str(dt)[:19]
            np.save(os.path.join(grid_dir,dt_str+'_grid'),grid_pm25)
            np.save(os.path.join(station_dir,dt_str+'_station'),station)


In [33]:
# Settings for the Csv2npy run below: input CSV (relative to the working
# directory), KNN parameters, and the minimum row count per timestamp.
csv_path=os.path.join(os.pardir,'csv','air.csv')
k=3
weights='distance'
threshold=312

In [34]:
# Build the converter from the settings above, then write the .npy files.
csv2npy=Csv2npy(csv_path=csv_path,k=k,weights=weights,threshold=threshold)
csv2npy.alldt2npy()

2017-01-01 08:00:00 307
pass
2017-01-01 08:30:00 312
2017-01-01 09:00:00 312
2017-01-01 09:30:00 310
pass
2017-01-01 10:00:00 313
