# Notebook 3: Photo-Z regression calculator

This notebook calculates a regression fit for photometric redshifts of galaxies. It uses the following steps:

- Queries a training set of galaxies for which redshifts are available (redshift is a rough measure of the distance to a galaxy).
- Derives a k-nearest-neighbour regressor that determines a redshift estimate from four colours (u-g, g-r, r-i, i-z) computed from the 5 "broad-band" magnitudes.
- Tests the derived regressor against a test dataset by plotting actual redshifts of galaxies in the test set against redshifts predicted by the regressor.

Author: Jake VanderPlas (vanderplas@astro.washington.edu)

License: BSD

The figure is an example from astroML: see http://astroml.github.com

Modified to work with SDSS data by Dmitry Medvedev (dmedv@jhu.edu)

In [None]:
import SciServer.CasJobs as CasJobs # run SQL queries through CasJobs
from io import StringIO   # in-memory text streams
import numpy as np   # numerical arrays and math
import pandas   # "pandas" package for data processing
from matplotlib import pyplot as plt   # plotting
from sklearn.neighbors import KNeighborsRegressor   # k-nearest-neighbour regressor
from astroML.plotting import scatter_contour   # astroML scatter/contour plotting helper
print("OK")   # reached only if every import above succeeded

In [None]:
# Notebook display settings.
# Show the full contents of every column when a DataFrame is rendered.
# `None` means "no limit"; the legacy `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases, while very old releases only
# accept integers -- so try the modern spelling first and fall back.
try:
    pandas.set_option('display.max_colwidth', None)
except ValueError:
    pandas.set_option('display.max_colwidth', -1)
# Python warnings are left enabled; uncomment the filter below to silence them.
import warnings
#warnings.filterwarnings('ignore')
print("OK")

In [None]:
# Maximum number of rows to request from the database.
NOBJECTS = 20000

# (name, type) pairs for one record: the five magnitudes, then the redshift
# and its error. Every field is a float.
GAL_COLORS_DTYPE = [
    (field, float)
    for field in ('u', 'g', 'r', 'i', 'z', 'redshift', 'redshift_err')
]

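Next, we query CasJobs (context DR14) for up to `NOBJECTS` objects classified as galaxies or quasars, retrieving their u, g, r, i, z magnitudes together with the spectroscopic redshift and its error.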

In [None]:
# Assemble the SQL one clause per line, then send it to CasJobs.
query_text = "\n".join([
    "SELECT TOP %i" % NOBJECTS,
    "  p.objId, p.u, p.g, p.r, p.i, p.z, s.z as redshift, s.zerr",
    "FROM PhotoObj AS p",
    "   JOIN SpecObj AS s ON s.bestobjid = p.objid",
    "WHERE ",
    "   p.u BETWEEN 0 AND 19.6",
    "   AND p.g BETWEEN 0 AND 20",
    "   AND (s.class = 'GALAXY' OR s.class = 'QSO')",
])

# Run the query in the "DR14" context and index the result by object id.
data = CasJobs.executeQuery(query_text, "DR14").set_index('objId')

# Preview the first 10 rows
data.head(10)

In [None]:
# Number of neighbours the regressor consults for each prediction.
n_neighbors = 1

N = len(data)

# Feature matrix: four colours, i.e. differences of adjacent magnitudes.
X = np.zeros((N, 4))
X[:, 0] = data['u'] - data['g']
X[:, 1] = data['g'] - data['r']
X[:, 2] = data['r'] - data['i']
X[:, 3] = data['i'] - data['z']

# Regression target. np.asarray is used because DataFrame/Series.as_matrix()
# was removed in pandas 1.0.
z = np.asarray(data['redshift'], dtype=float)

# Shuffle features and targets with one shared, seeded permutation.
# Shuffling the array returned by `data.as_matrix()` acted on a temporary:
# depending on whether pandas handed back a copy or a view it either did
# nothing or silently reordered `data` in place. Permuting X and z directly
# leaves `data` untouched, so re-running this cell gives the same split.
np.random.seed(0)
shuffle_order = np.random.permutation(N)
X = X[shuffle_order]
z = z[shuffle_order]

# divide into training and testing data (first half trains, second half tests)
Ntrain = int(np.round(N/2, 0))
if Ntrain < n_neighbors or Ntrain >= N:
    raise ValueError(
        "Query returned %d rows; too few to split into training and test sets "
        "for n_neighbors=%d" % (N, n_neighbors))

Xtrain = X[:Ntrain]
ztrain = z[:Ntrain]

Xtest = X[Ntrain:]
ztest = z[Ntrain:]

# Fit on the training half and predict redshifts for the held-out half.
knn = KNeighborsRegressor(n_neighbors, weights='uniform')
zpred = knn.fit(Xtrain, ztrain).predict(Xtest)

# Root-mean-square difference between true and predicted test redshifts.
rms = np.sqrt(np.mean((ztest - zpred) ** 2))
print("RMS error = {0:.2f}".format(rms))

In [None]:
# Shared x/y range so the one-to-one line runs corner to corner.
axis_lim = np.array([-0.1, 2.5])

# Use one explicit Axes for every call instead of mixing the pyplot state
# machine with `ax`.
fig, ax = plt.subplots(figsize=(12, 8))

# Each point is one test object: spectroscopic vs. predicted redshift.
ax.scatter(ztest, zpred, c='k', lw=0, s=4)

# Dashed line: perfect prediction. Dotted lines: offset by +/- the RMS error.
ax.plot(axis_lim, axis_lim, '--k')
ax.plot(axis_lim, axis_lim + rms, ':k')
ax.plot(axis_lim, axis_lim - rms, ':k')
ax.set_xlim(axis_lim)
ax.set_ylim(axis_lim)

# RMS annotation anchored to the lower-right corner in axes coordinates.
ax.text(0.99, 0.02, "RMS error = %.2g" % rms,
        ha='right', va='bottom', transform=ax.transAxes,
        bbox=dict(ec='w', fc='w'), fontsize=16)

# Title typo fixed ("Neigbor" -> "Neighbor").
ax.set_title('Photo-z: Nearest Neighbor Regression')
ax.set_xlabel(r'$\mathrm{z_{spec}}$', fontsize=20)
ax.set_ylabel(r'$\mathrm{z_{phot}}$', fontsize=20)
plt.show()