In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [45]:
# Load the merged observation / feature table into `raw`.
# TODO: add data to github and make this a local import
# NOTE(review): the path below is an absolute, machine-specific location; the notebook
# will not run elsewhere until the TODO above is addressed.
# Previously used path (kept for reference):
#path = '/mnt/c/Users/han/Downloads/merged_obs_sim_features_ref_plus_dist.csv'
path = '/Users/monicazhu/Box/CS189-Project-Shared/obs_feature_merge/merged_obs_full_features_distilled.csv'
raw = pd.read_csv(path)

Feature engineering

In [46]:
# Show the column labels at positions 1 through 16 of the raw table
raw.columns[1:17]

Index(['Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'Unnamed: 0.1.1.1.1', 'timestamp', 'co2', 'temp', 'vaisala_temp', 'lon',
       'lat', 'emis_files', 'fp_files', 'ref_co2', 'ref_lon', 'ref_lat',
       'ref_temp'],
      dtype='object')

In [55]:
### Feature engineering
### 1. Separate out the features of emissions weighted by footprints
###    (every column whose name starts with 'ef_')
emis_fps = [column for column in raw.columns if column.startswith('ef_')]

### 2. Location features: instead of using the longitude and latitude of each observation,
###    we use the lon and lat difference relative to the reference site
raw['lon_diff'] = raw['lon'] - raw['ref_lon']
raw['lat_diff'] = raw['lat'] - raw['ref_lat']
locs_feas = ['lon_diff', 'lat_diff']

### 3. Temperature feature to capture the bias: we use the temperature enhancement as the feature
###    (temperature - environment temperature)
raw['temp_diff'] = raw['vaisala_temp'] - raw['ref_temp']
temp_feas = ['temp_diff']

### 4. The final feature set is the combination of those three types of features
fea_sets = emis_fps + locs_feas + temp_feas
# Explicit copy so the column added in step 5 never writes through to (or warns about) `raw`
fea_df = raw.loc[:, fea_sets].copy()

### 5. Add the last feature: CO2 measurements from each observation
fea_df['co2'] = raw['co2']

### 6. Standardize all features (zero mean, unit variance).
###    The CO2 measurement is part of fea_df at this point, so it is standardized as well.
# NOTE(review): the scaler is fit on the full dataset before the train/test split, so
# statistics of the held-out rows influence the scaling - consider fitting on the
# training rows only.
normalizer = StandardScaler()
fits = normalizer.fit(fea_df)  # fit() returns the scaler itself; `fits` is kept as an alias
# Reuse fea_df's index so that `y` and any later column taken from `raw` stay row-aligned
stand_df = pd.DataFrame(
    normalizer.transform(fea_df),
    columns=fea_df.columns,
    index=fea_df.index,
)

### 7. Add a constant bias term
stand_df.loc[:, 'const'] = 1

# Regression target: CO2 measured at the reference site
y = raw['ref_co2']

In [56]:
# Drop redundant columns: a column is kept only if at least one of its entries has a
# magnitude above 1e-5. The absolute value is used so that a column whose entries are
# all non-positive is not mistaken for an empty one.
column_names = stand_df.columns
has_signal = (stand_df.loc[:, column_names].abs() > 1e-5).any()
nonredundant_columns = column_names[has_signal]
redundant_columns = [column for column in column_names if column not in nonredundant_columns]
stand_df = stand_df.drop(redundant_columns, axis=1)

In [57]:
# Display the standardized feature table after redundant columns were dropped
stand_df

Unnamed: 0,ef_dist_01,ef_dist_02,ef_dist_03,ef_dist_04,ef_dist_05,ef_dist_06,ef_dist_07,ef_dist_08,ef_dist_09,ef_dist_10,...,ef_dist_72,ef_dist_73,ef_dist_74,ef_dist_75,ef_dist_76,lon_diff,lat_diff,temp_diff,co2,const
0,-0.008619,-0.050630,-0.153929,-0.186384,-0.232059,-0.166450,-0.121084,0.109916,0.597966,0.404012,...,0.048006,0.03826,0.04251,0.041874,0.033978,-1.519951,-0.587384,2.355420,1.607124,1
1,0.029467,-0.089863,-0.193759,-0.206471,-0.237844,-0.168480,-0.119555,-0.100573,-0.112452,-0.066588,...,0.048006,0.03826,0.04251,0.041874,0.033978,-1.519951,-0.587384,2.959822,0.390438,1
2,-0.054090,-0.023184,-0.193424,-0.204551,-0.240045,-0.172849,-0.132114,-0.119707,-0.144759,-0.125314,...,0.048006,0.03826,0.04251,0.041874,0.033978,-1.519951,-0.587384,2.352538,-0.088086,1
3,-0.076200,-0.027553,-0.185884,-0.201558,-0.234256,-0.170534,-0.124546,-0.110973,-0.139192,-0.126281,...,0.048006,0.03826,0.04251,0.041874,0.033978,-1.519951,-0.587384,2.692694,-0.548105,1
4,-0.103934,0.018428,-0.156199,-0.169954,-0.218251,-0.162477,-0.112286,-0.086754,-0.073610,-0.118447,...,0.048006,0.03826,0.04251,0.041874,0.033978,-1.519951,-0.587384,1.925226,-0.618309,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31630,-0.105326,-0.024454,-0.020950,0.057765,-0.215637,-0.173822,-0.126532,-0.112290,-0.152023,-0.136790,...,0.048006,0.03826,0.04251,0.041874,0.033978,-0.477477,0.702092,1.327491,-0.750804,1
31631,-0.107295,-0.002903,-0.055372,0.074791,-0.096297,-0.171643,-0.126724,-0.112475,-0.151927,-0.136245,...,0.048006,0.03826,0.04251,0.041874,0.033978,-0.477477,0.702092,1.390136,-0.742212,1
31632,-0.107295,-0.002903,-0.055372,0.074791,-0.096297,-0.171643,-0.126724,-0.112475,-0.151927,-0.136245,...,0.048006,0.03826,0.04251,0.041874,0.033978,-0.477477,0.702092,1.390136,-0.742212,1
31633,-0.089091,-0.032612,0.281594,0.102924,-0.194483,-0.174774,-0.126839,-0.112619,-0.152062,-0.136593,...,0.048006,0.03826,0.04251,0.041874,0.033978,-0.477477,0.702092,1.368957,-0.762451,1


In [58]:
# Report the dimensions of the design matrix (rows = observations, columns = features)
n_observations, n_features = stand_df.shape
print('Feature number: {}'.format(n_features))
print('Observation number: {}'.format(n_observations))

Feature number: 81
Observation number: 31635


Train / validation / test data splitting



In [59]:
# Hold out 25% of the observations as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    stand_df,
    y,
    test_size=0.25,
    random_state=80,
)

In [60]:
# Split the training portion once more: 25% of it becomes the validation set.
# NOTE(review): X_train / y_train are rebound here, so re-running this cell on its own
# splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=80,
)

In [61]:
# Report how many observations ended up in each split
n_train, n_test, n_val = len(X_train), len(X_test), len(X_val)
print("Train: ", n_train, " Test: ", n_test, " Val: ", n_val)

Train:  17794  Test:  7909  Val:  5932


In [62]:
# Save data to files: every split is cast to float32 and written with np.save
# (which appends the .npy extension) into the shared output directory.
out_dir = '/Users/monicazhu/Box/CS189-Project-Shared/data_v1'
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'X_val': X_val,
    'y_train': y_train,
    'y_test': y_test,
    'y_val': y_val,
}
for split_name, split_data in splits.items():
    # np.asarray yields the underlying values for both DataFrames (X_*) and Series (y_*)
    na = np.asarray(split_data).astype(np.float32)
    np.save('{}/{}'.format(out_dir, split_name), na)

In [64]:
# Attach the target and bookkeeping columns from the raw table to the standardized
# features (unscaled; assignment aligns on the row index)
for extra_column in ('ref_co2', 'timestamp', 'dist_to_ref'):
    stand_df[extra_column] = raw[extra_column]


In [65]:
# Write the complete table to CSV.
# NOTE(review): to_csv also writes the row index as an unnamed first column by default;
# reading the file back without index_col=0 will surface it as an 'Unnamed: 0' column.
stand_df.to_csv('/Users/monicazhu/Box/CS189-Project-Shared/data_v1/FinalDataset.csv')