In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Import data
# TODO: add data to github and make this a local import
# Alternate input path, currently commented out:
#path = '/mnt/c/Users/han/Downloads/merged_obs_sim_features_ref_plus_dist.csv'
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine
# where this exact Box folder exists.
path = '/Users/monicazhu/Box/CS189-Project-Shared/obs_feature_merge/merged_obs_full_features_ref_plus_dist_filter.csv'
# Read the CSV at `path` into the DataFrame `raw`, which the following cells use.
raw = pd.read_csv(path)

Feature engineering

In [8]:
# Display the column labels at positions 1..16 of `raw` (the slice end, 17, is exclusive).
raw.columns[1:17]

Index(['Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'timestamp',
       'co2', 'temp', 'vaisala_temp', 'lon', 'lat', 'emis_files', 'fp_files',
       'ref_co2', 'ref_lon', 'ref_lat', 'ref_temp', 'dist_to_ref'],
      dtype='object')

In [9]:
### Feature engineering
### 1. Separate out the features of emissions weighted by footprints:
###    every column of `raw` whose name starts with 'ef_'.
emis_fps = [column for column in raw.columns if column.startswith('ef_')]
### 2. Location features: instead of using the longitude and latitude of each
###    observation, we use the lon and lat difference relative to the reference site.
raw['lon_diff'] = raw['lon'] - raw['ref_lon']
raw['lat_diff'] = raw['lat'] - raw['ref_lat']
locs_feas = ['lon_diff', 'lat_diff']
### 3. Temperature feature to capture the bias: we use the temperature enhancement
###    (temperature - environment temperature) as the feature.
raw['temp_diff'] = raw['vaisala_temp'] - raw['ref_temp']
temp_feas = ['temp_diff']
### 4. The final feature set is the combination of those three types of features.
fea_sets = emis_fps + locs_feas + temp_feas
fea_df_except_co2 = raw.loc[:, fea_sets]
### 5. We standardize all features except for the CO2 measurements.
### NOTE(review): the scaler is fit on every row, i.e. before the train/val/test
### split performed further down, so held-out rows contribute to the mean/std.
### Confirm this is intended; otherwise fit on the training rows only.
normalizer = StandardScaler()
fits = normalizer.fit(fea_df_except_co2)
### `transform` returns a bare array, so carry raw's index over explicitly.
### The column assignments below align on index labels; without this, a
### non-default index on `raw` would silently misalign (or NaN-fill) `co2`.
stand_df = pd.DataFrame(
    normalizer.transform(fea_df_except_co2),
    columns=fea_df_except_co2.columns,
    index=fea_df_except_co2.index,
)
### 6. Add the last feature: the (unstandardized) CO2 measurement of each observation.
stand_df['co2'] = raw['co2']
### Target: the CO2 value in the `ref_co2` column.
y = raw['ref_co2']

In [10]:
# Report the size of the standardized feature table: columns are features,
# rows are observations.
n_observations, n_features = stand_df.shape
print('Feature number: {}'.format(n_features))
print('Observation number: {}'.format(n_observations))

Feature number: 5969
Observation number: 31635


In [13]:
# Copy the `ref_co2`, `timestamp` and `dist_to_ref` columns from `raw` onto the
# standardized frame unchanged (they are not standardized).
# NOTE(review): `ref_co2` is also the target `y`, and `stand_df` is the frame
# passed to train_test_split as X further down, so the saved X arrays include
# the target plus these two extra columns. Confirm downstream code drops them
# before fitting.
for extra_col in ('ref_co2', 'timestamp', 'dist_to_ref'):
    stand_df[extra_col] = raw[extra_col]


In [14]:
# Write the full table (features plus the columns appended above) to CSV.
# NOTE(review): with default arguments to_csv also writes the index as an unnamed
# first column; the 'Unnamed: 0.*' columns visible in `raw` look like the result
# of repeated write/read cycles of this kind. Consider index=False -- confirm
# with whoever reads this file first.
stand_df.to_csv('/Users/monicazhu/Box/CS189-Project-Shared/data/FinalDataset.csv')

Train / validation / test data splitting



In [58]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): `stand_df` at this point also carries `ref_co2` (identical to
# `y`), `timestamp` and `dist_to_ref`, so X contains the target. Confirm that
# consumers of the split strip those columns before training.
X_train, X_test, y_train, y_test = train_test_split(
    stand_df,
    y,
    test_size=0.25,
    random_state=80,
)

In [59]:
# Split the training portion again: 25% of it becomes the validation set.
# NOTE: this rebinds X_train / y_train, so running this cell twice in a row
# shrinks the training set a second time -- re-run the previous split first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=80,
)

In [61]:
# Row counts of the three splits
n_train, n_test, n_val = (len(part) for part in (X_train, X_test, X_val))
print("Train: ", n_train, " Test: ", n_test, " Val: ", n_val)

Train:  17794  Test:  7909  Val:  5932


In [65]:
# Save data to files: every split is converted to a float32 array and written
# with np.save (which appends '.npy' to a path that lacks it).
for out_path, split in (
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/X_train', X_train),
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/X_test', X_test),
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/X_val', X_val),
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/y_train', y_train),
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/y_test', y_test),
    ('/Users/monicazhu/Box/CS189-Project-Shared/data/y_val', y_val),
):
    # Materialize one split at a time so only one converted copy is alive.
    na = np.array(split).astype(np.float32)
    np.save(out_path, na)