In [1]:
import numpy as np
from metasense.epa import data
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split

# Loading Ozone, CO, and NO2 Data

In [87]:
# Load every CSV found under ../data, then build one matrix per pollutant
# from each file via data.convert_to_matrix.
csvs = data.load_data('../data')

# Per-file matrices are collected under their own names so that the final
# `*_mat` names are only ever bound to the concatenated arrays.
ozone_parts = []
co_parts = []
no2_parts = []
for text in csvs.values():
    result = data.load_csv_from_string(text)
    ozone_parts.append(data.convert_to_matrix(result, 'ozone'))
    co_parts.append(data.convert_to_matrix(result, 'co'))
    no2_parts.append(data.convert_to_matrix(result, 'no2'))


def _stack_valid(parts, n_cols):
    """Concatenate the usable per-file matrices along the first axis.

    A matrix is kept only if it is not None, is two-dimensional, and has
    exactly `n_cols` columns; everything else is silently dropped.
    np.concatenate raises ValueError if nothing survives the filter.
    """
    return np.concatenate([
        part for part in parts
        if part is not None and len(part.shape) == 2 and part.shape[1] == n_cols
    ])


# Expected column counts per pollutant.
# NOTE(review): presumably one column per monitoring site (9 ozone, 3 CO,
# 9 NO2) -- confirm against data.convert_to_matrix.
ozone_mat = _stack_valid(ozone_parts, 9)
co_mat = _stack_valid(co_parts, 3)
no2_mat = _stack_valid(no2_parts, 9)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(ozone_mat.shape)
print(co_mat.shape)
print(no2_mat.shape)

(9912, 9)
(9560, 3)
(1310, 9)


In [86]:
# Site names for each pollutant are taken from a single reference day.
# The CSV for that day is parsed once and reused for all three pollutants.
REFERENCE_DAY = '20150429'
reference_result = data.load_csv_from_string(csvs[REFERENCE_DAY])

# NOTE(review): predict() maps a site name to a matrix column through its
# position in these sorted lists -- confirm that data.convert_to_matrix
# orders its columns the same way.
ozone_sites = sorted(reference_result['ozone'].keys())
print(', '.join(ozone_sites))
co_sites = sorted(reference_result['co'].keys())
print(', '.join(co_sites))
no2_sites = sorted(reference_result['no2'].keys())
print(', '.join(no2_sites))

alpine, chula vista, del_mar, downtown, el cajon fsd, escondido, kearny mesa, otay mesa dvn, pendleton
downtown, el cajon fsd, escondido
alpine, carmel mt rch, chula vista, downtown, el cajon fsd, escondido, kearny mesa, otay mesa dvn, pendleton


# Linear Regression for Ozone

In [71]:
def predict(site_to_predict, sites, mat, test_size=0.25, random_state=42):
    """Fit a linear regression that predicts one site's column from the others.

    Parameters
    ----------
    site_to_predict : str
        Name of the site whose column becomes the regression target.
    sites : sequence of str
        Site names; the position of a name in this sequence is used as the
        column index into `mat`.
    mat : 2-D numpy array
        One column per entry of `sites`.
    test_size : float, optional
        Forwarded to train_test_split. The default (0.25) is the value that
        was previously hard-coded.
    random_state : int, optional
        Forwarded to train_test_split. The default (42) is the value that
        was previously hard-coded.

    Returns
    -------
    (model, X, y) : tuple
        The fitted LinearRegression, the full feature matrix (all columns of
        `mat` except the target) and the full target vector. X and y are the
        complete data, not just the training split.

    Raises
    ------
    ValueError
        If `site_to_predict` is not present in `sites`.
    """
    if site_to_predict not in sites:
        raise ValueError("Unknown site %r; expected one of: %s"
                         % (site_to_predict, ', '.join(sites)))

    # Single-argument print(...) yields the same output on Python 2 and 3.
    print("Predicting site: %s" % (site_to_predict,))
    site_index = sites.index(site_to_predict)

    # Features are every column except the target; the target is flattened
    # to 1-D as the original code did.
    X = np.delete(mat, site_index, axis=1)
    y = mat[:, site_index].ravel()

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model = LinearRegression()
    model.fit(Xtrain, ytrain)
    print("Training R^2: %s" % (model.score(Xtrain, ytrain),))
    print("Testing R^2: %s" % (model.score(Xtest, ytest),))
    print("")  # blank separator line between sites
    return model, X, y

In [69]:
# Predict each ozone site from the remaining ozone sites. Every entry is the
# (model, X, y) tuple returned by predict().
ozone_models = [predict(site, ozone_sites, ozone_mat) for site in ozone_sites]
# Alias kept so code that reads `models` still works; the pollutant-specific
# name above survives when a later cell rebinds `models`.
models = ozone_models

Predicting site: alpine
Training R^2: 0.408980667834
Testing R^2: 0.393714203692

Predicting site: chula vista
Training R^2: 0.860704835944
Testing R^2: 0.858922711637

Predicting site: del_mar
Training R^2: 0.762772888924
Testing R^2: 0.736508475273

Predicting site: downtown
Training R^2: 0.815250937187
Testing R^2: 0.797296912942

Predicting site: el cajon fsd
Training R^2: 0.884630933702
Testing R^2: 0.889017699288

Predicting site: escondido
Training R^2: 0.867691081888
Testing R^2: 0.871456272706

Predicting site: kearny mesa
Training R^2: 0.756464548563
Testing R^2: 0.830478591411

Predicting site: otay mesa dvn
Training R^2: 0.678767173044
Testing R^2: 0.664008238544

Predicting site: pendleton
Training R^2: 0.792960439564
Testing R^2: 0.735989223001



# Linear Regression for CO

In [88]:
# Predict each CO site from the remaining CO sites. Every entry is the
# (model, X, y) tuple returned by predict().
co_models = [predict(site, co_sites, co_mat) for site in co_sites]
# Alias kept so code that reads `models` still works; the pollutant-specific
# name above survives when a later cell rebinds `models`.
models = co_models

Predicting site: downtown
Training R^2: 0.613461472506
Testing R^2: 0.586529952486

Predicting site: el cajon fsd
Training R^2: 0.660612993356
Testing R^2: 0.658221191792

Predicting site: escondido
Training R^2: 0.661419730312
Testing R^2: 0.650131605189



# Linear Regression for NO2

In [89]:
# Predict each NO2 site from the remaining NO2 sites. Every entry is the
# (model, X, y) tuple returned by predict().
no2_models = [predict(site, no2_sites, no2_mat) for site in no2_sites]
# Alias kept so code that reads `models` still works.
models = no2_models

Predicting site: alpine
Training R^2: 0.188820708435
Testing R^2: 0.118954036259

Predicting site: carmel mt rch
Training R^2: 0.525332316606
Testing R^2: 0.52813607902

Predicting site: chula vista
Training R^2: 0.718692209077
Testing R^2: 0.747392829833

Predicting site: downtown
Training R^2: 0.573406467243
Testing R^2: 0.641885753163

Predicting site: el cajon fsd
Training R^2: 0.554843958195
Testing R^2: 0.606817729971

Predicting site: escondido
Training R^2: 0.694272096781
Testing R^2: 0.71556380727

Predicting site: kearny mesa
Training R^2: 0.659773665525
Testing R^2: 0.705018485306

Predicting site: otay mesa dvn
Training R^2: 0.513097525959
Testing R^2: 0.468059404695

Predicting site: pendleton
Training R^2: 0.464846815135
Testing R^2: 0.552023789438

