In [None]:
# Smoke-test Spark: add up 0..99999 with a plain reduce and with filter-then-sum
largeRange = sc.parallelize(xrange(100000))
reduceTest = largeRange.reduce(lambda total, value: total + value)
filterReduceTest = (largeRange
                    .filter(lambda n: n % 7 == 0)
                    .sum())

print(reduceTest)
print(filterReduceTest)

# Expected: the sum of 0..99999, and the sum of the multiples of 7 in that range
assert reduceTest == 4999950000
assert filterReduceTest == 714264285

In [None]:
# Verify that sc.textFile can read millionsong.txt from the data directory
baseDir = 'data/cs190/'
inputFile = 'millionsong.txt'
fileName = '{0}{1}'.format(baseDir, inputFile)

rawData = sc.textFile(fileName)
songCount = rawData.count()

print(songCount)

# The file is expected to contain exactly 6724 lines
assert songCount == 6724

In [None]:
# Check our testing package
from test_helper import Test

# Exercise both assertion helpers of the Test class against a known value
twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve,
    '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve should equal the hashed value of 12',
)

In [None]:
# Check that numpy is working
import numpy as np
# Two 10-element float vectors, plus a 2x5 and a 5x2 matrix built from them
a = np.arange(1.5, 6.5, .5)
b = np.arange(6.5, 11.5, .5)
x = np.asmatrix(np.reshape(a, (2, 5)))
y = np.asmatrix(np.reshape(b, (5, 2)))

# Inner product, element-wise product, and matrix product respectively
dotProduct = np.dot(a, b)
arrayMultiply = np.multiply(a, b)
matrixMultiply = x * y

print(dotProduct)
print(arrayMultiply)
print(matrixMultiply)

assert np.allclose(dotProduct, 348.75)
assert np.allclose(
    arrayMultiply,
    [ 9.75, 14., 18.75, 24., 29.75, 36., 42.75, 50., 57.75, 66.],
)
assert np.allclose(
    matrixMultiply,
    [[111.25, 117.5],
     [217.5, 230.]],
)

In [None]:
# Check matplotlib plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from math import log

# function for generating plot layout
def preparePlot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999', gridWidth=1.0):
    """Create a figure with a single, minimally decorated set of axes.

    Closes the current pyplot figure, opens a new one, places ticks at the
    requested positions with the tick marks themselves hidden, draws a solid
    grid, and hides all four axis spines.

    Args:
        xticks: Tick locations for the x axis.
        yticks: Tick locations for the y axis.
        figsize: Figure size forwarded to plt.subplots.
        hideLabels: If True, tick labels are removed from both axes.
        gridColor: Color of the grid lines.
        gridWidth: Line width of the grid lines.

    Returns:
        The (fig, ax) pair produced by plt.subplots.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=figsize)
    ax.axes.tick_params(labelcolor='#999999', labelsize='10')
    for axis, ticks in [(ax.get_xaxis(), xticks), (ax.get_yaxis(), yticks)]:
        axis.set_ticks_position('none')
        axis.set_ticks(ticks)
        axis.label.set_color('#999999')
        if hideLabels:
            axis.set_ticklabels([])
    plt.grid(color=gridColor, linewidth=gridWidth, linestyle='-')
    # Explicit loop instead of map(): map() is lazy on Python 3, so relying on
    # it for side effects would silently leave the spines visible there.
    for position in ('bottom', 'top', 'left', 'right'):
        ax.spines[position].set_visible(False)
    return fig, ax

# generate layout and plot data
x = range(1, 100)
y = [log(x1 ** 2) for x1 in x]
fig, ax = preparePlot(range(0, 110, 10), range(0, 12, 1))
plt.scatter(x, y, s=14**2, c='#d6ebf2', edgecolors='#8cbfd0', alpha=0.75)
# The x label describes the data actually plotted above: x = range(1, 100)
ax.set_xlabel(r'$range(1, 100)$')
ax.set_ylabel(r'$\log_e(x^2)$')

 ## **MathJax Tests**
 ### There should be a nicely rendered expression for gradient descent in (3a) and an inline formula for summand.  In (4b) there should be an expression for log loss.

 ### **(3a) Gradient summand**
 ### Now let's see if we can do better via linear regression, training a model via gradient descent (we'll omit the intercept for now). Recall that the gradient descent update for linear regression is: $$ \scriptsize \mathbf{w}_{i+1} = \mathbf{w}_i - \alpha_i \sum_j (\mathbf{w}_i^\top\mathbf{x}_j  - y_j) \mathbf{x}_j \,.$$ First, implement a function that computes the summand for this update, i.e., the summand equals $ \scriptsize (\mathbf{w}^\top \mathbf{x} - y) \mathbf{x} \, ,$ and test out this function on two examples.  Use the `DenseVector` [dot](http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.DenseVector.dot) method.

 ### **(4b) Log loss**
 ### Throughout this exercise, we will use log loss to evaluate the quality of models.  Log loss is defined as: $$  \begin{align} \scriptsize \ell_{log}(p, y) = \begin{cases} -\log (p) & \text{if } y = 1 \\\ -\log(1-p) & \text{if } y = 0 \end{cases} \end{align} $$ where $ \scriptsize p$ is a probability between 0 and 1 and $ \scriptsize y$ is a 0/1 label. Log loss is a standard evaluation criterion when predicting rare events, such as in click-through rate prediction (it is also the criterion used in the Criteo Kaggle competition).  Write a function to compute log loss, and evaluate it on some sample inputs.

 ## **Test Criteo Data Download**

 ### Before we can proceed you'll need to obtain the data from Criteo.  Below is the agreement from Criteo.  After you accept the agreement you can obtain the download URL by right-clicking on the "Download Sample" button and clicking "Copy link address" or "Copy Link Location", depending on your browser.  Paste the URL into the # TODO cell below.  The file is 8.4 MB.  The script below will download the file to the virtual machine (VM) and then extract the data.

In [None]:
# Run this code to view Criteo's agreement
from IPython.lib.display import IFrame

# Embed the agreement page as a 600x350 frame; the IFrame is the cell's output
IFrame(
    src="http://labs.criteo.com/downloads/2014-kaggle-display-advertising-challenge-dataset/",
    width=600,
    height=350,
)

In [None]:
# TODO: Replace <FILL IN> with appropriate code
# Just replace <FILL IN> with the url for dac_sample.tar.gz
import glob
import os.path
import tarfile
import urllib
import urlparse

# Paste url, url should end with: dac_sample.tar.gz
url = '<FILL IN>'

# Drop any whitespace that was copied along with the link
url = url.strip()

if os.path.isfile(os.path.join(baseDir, 'dac_sample.txt')):
    print('File is already available. Nothing to do.')
elif not url.endswith('dac_sample.tar.gz'):
    print('Check your download url.  Are you downloading the Sample dataset?')
else:
    # Download the file and store it in baseDir under the file name taken from the url
    try:
        urllib.urlretrieve(url, os.path.join(baseDir, os.path.basename(urlparse.urlsplit(url).path)))
    except IOError:
        print('Unable to download and store: {0}'.format(url))

    # Find the zipped archive and extract the dataset
    tars = glob.glob(os.path.join(baseDir, 'dac_sample*.tar.gz*'))
    if len(tars) > 0:
        try:
            # The with-block closes the archive even when extraction fails
            with tarfile.open(tars[0]) as tarFile:
                tarFile.extract('dac_sample.txt', path=baseDir)
        except (tarfile.TarError, KeyError):
            # TarError: the file is not a readable archive;
            # KeyError: the archive has no dac_sample.txt member
            print('Unable to extract dac_sample.txt from: {0}'.format(tars[0]))
            print('You need to retry the download with the correct url.')
        else:
            print('Successfully extracted: dac_sample.txt')
    else:
        print('You need to retry the download with the correct url.')
        print('Alternatively, you can upload the dac_sample.tar.gz file to your IPython notebook.')

In [None]:
# Check that the download was successful
inputFile = 'dac_sample.txt'
rawData = (sc
           .textFile(os.path.join(baseDir, inputFile))
           .map(lambda x: x.replace('\t', ',')))  # work with either ',' or '\t' separated data
criteoCount = rawData.count()

print(criteoCount)

# Assert on the stored count: calling rawData.count() a second time would run
# another Spark action over the file just to obtain the same number.
assert criteoCount == 100000