### Jupyter basics

In [None]:
from math import pow
from scipy.constants import c 

def E(m):
    """Return the mass-energy equivalent ``m * c**2`` of a mass ``m``.

    ``c`` is the speed of light imported from ``scipy.constants`` (given
    there in m/s), so a mass in kilograms yields an energy in joules.
    """
    c_squared = pow(c, 2)
    return m * c_squared

In [None]:
# Energy equivalent of a mass of 10, computed with E defined above.
E(10)

In [None]:
from ipywidgets import interact, widgets
# Re-run E every time the slider moves; m is an integer in 0..30, starting at 10.
interact(E, m=widgets.IntSlider(value=10, min=0, max=30, step=1))

In [None]:
# A leading "?" is IPython's help syntax: it displays the documentation of `interact`.
?interact

In [None]:
# A leading "!" hands the rest of the line to the system shell; here it prints the working directory.
!pwd

1. Fun facts:
    * $E=mc^2$
        ![](http://cf.chucklesnetwork.com/items/1/1/9/3/7/0/original/im-sorry-i-cant-hear-you-over-how-awesome-science-is.jpg) 

### PySpark info and help
* Guides:
    * [Spark SQL, DataFrames and Datasets Guide](http://spark.apache.org/docs/latest/sql-programming-guide.html)
    * [Machine Learning Library (MLlib) Guide](http://spark.apache.org/docs/latest/ml-guide.html)
* API:
    * [pyspark.sql](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html)
    * [pyspark.ml](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html)
* Stack Overflow
    * ....

### Loading data into a DataFrame

In [None]:
# The CSV is read without a header option, so the columns arrive under the
# positional names _c0.._c6; map each one to a meaningful name.
column_renames = [
    ('_c0', 'time'),
    ('_c1', 'browser'),
    ('_c2', 'os'),
    ('_c3', 'deviceType'),
    ('_c4', 'country'),
    ('_c5', 'city'),
    ('_c6', 'userId'),
]

# ';'-separated file; inferSchema=True asks Spark to infer the column types.
events = sqlContext.read.csv('hdfs://hdfs-mesos/data.csv', sep=';', inferSchema=True)
for positional_name, readable_name in column_renames:
    events = events.withColumnRenamed(positional_name, readable_name)
# Cache the renamed frame: the rest of the notebook queries it repeatedly.
events = events.cache()

events.first()

In [None]:
# Print the first rows of the DataFrame as a text table (show() defaults to 20 rows).
events.show()

### Basic aggregations

#### Filtering and counting:

In [None]:
import pyspark.sql.functions as fun

# Keep only the rows whose deviceType equals 'Desktop' and count them.
# count() is an action: it runs the query and returns a plain number.
is_desktop = fun.col('deviceType') == 'Desktop'
desktop_events = events.where(is_desktop).count()

desktop_events

#### Group-by aggregations - counting:

In [None]:
# Number of events per device type (the result column is named 'count').
device_events = events.groupBy('deviceType').count()

# Most frequent device types first.
sorted_device_events = device_events.sort(fun.desc('count'))

sorted_device_events.show()

#### Group-by aggregations - cardinality:

In [None]:
# Cardinality per group: the number of distinct userIds seen for each device type.
unique_users = fun.countDistinct('userId').alias('uniqueUsers')
device_uniques = events.groupBy('deviceType').agg(unique_users)

device_uniques.show()

### Pandas

#### Converting Spark DF to Pandas DF:

In [None]:
import pandas as pd

# toPandas() collects the (small) aggregated result to the driver as a pandas
# DataFrame; indexing it by deviceType makes the device names the row labels.
device_uniques_pdf = device_uniques.toPandas()
device_uniques_pdf = device_uniques_pdf.set_index('deviceType')

device_uniques_pdf

#### Plotting data – pie charts:

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One pie per column of the frame (subplots=True); slices are labelled by the index.
device_uniques_pdf.plot(kind='pie', subplots=True)

#### Plotting data – histograms:

In [None]:
# One row per userId together with the number of events recorded for that user.
grouped_by_user = events.groupBy("userId")
events_per_user = grouped_by_user.count()

events_per_user.take(5)

In [None]:
# Each row is (userId, count); values() keeps only the count, and
# histogram(10) splits those counts into 10 evenly spaced buckets.
user_event_counts = events_per_user.rdd.values()
histogram = user_event_counts.histogram(10)

In [None]:
import numpy as np

def draw_hist(rdd_histogram_data):
    """Draw the result of ``RDD.histogram`` as a log-scaled bar chart.

    Args:
        rdd_histogram_data: a ``(bin_edges, counts)`` pair, where
            ``bin_edges`` has one more entry than ``counts``.

    Returns:
        The bar container returned by ``plt.bar``.
    """
    bin_edges, counts = rdd_histogram_data[0], rdd_histogram_data[1]
    heights = np.array(counts)
    left_edges = bin_edges[:-1]
    widths = [abs(lo - hi) for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
    # Anchor each bar at its bin's left edge. Without align='edge',
    # matplotlib >= 2.0 centres bars on x, which would draw every bar
    # half a bin too far to the left.
    return plt.bar(left_edges, heights, width=widths, align='edge', log=True)

# Plot the 10-bucket histogram of events-per-user computed in the previous cell.
draw_hist(histogram)

### Awesomeness - Pivoting with Pandas!

In [None]:
# Count events per (os, browser) pair in Spark, bring the small result to
# pandas, and label rows with a missing os as 'unknown' (browser is left as is).
os_and_browser_count = (
    events.groupBy("os", "browser")
    .count()
    .toPandas()
    .fillna({'os': 'unknown'})
)

os_and_browser_count.head(5)

In [None]:
# Reshape long -> wide: one row per os, one column per browser, cell = event count.
os_by_browser = os_and_browser_count.pivot(index='os', columns='browser', values='count')
# Combinations that never occurred come out of the pivot as NaN; show them as 0.
df = os_by_browser.fillna(0)

df

In [None]:
import numpy as np 
from pandas import DataFrame
import matplotlib.pyplot as plt

# Heat map of the os x browser table: one coloured cell per combination.
plt.pcolor(df)
plt.set_cmap('Reds')

# Cells span [i, i + 1], so ticks at i + 0.5 sit in the middle of each row/column.
column_centres = np.arange(0.5, len(df.columns), 1)
row_centres = np.arange(0.5, len(df.index), 1)
plt.xticks(column_centres, df.columns)
plt.yticks(row_centres, df.index)
plt.show()

In [None]:
# Add (or overwrite) a "date" column holding "time" cast to a Spark timestamp.
# NOTE(review): the same conversion is repeated in the "Converting values" section below.
time_as_timestamp = events["time"].cast("timestamp")
events = events.withColumn("date", time_as_timestamp)
events.first()

### Converting values

In [None]:
import datetime

from pyspark.sql.types import StringType


def _format_epoch_seconds(epoch_seconds):
    """Format a POSIX timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS'.

    Returns None for a None input so that null ``time`` values do not make
    the UDF raise. ``fromtimestamp`` converts to the local time zone of the
    Python process that evaluates the UDF.
    """
    if epoch_seconds is None:
        return None
    return datetime.datetime.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')


# Wrap the Python formatter as a Spark UDF that returns strings.
toTimeString = fun.udf(_format_epoch_seconds, StringType())

# Keep the projected DataFrame in `dates`; show() only prints and returns None.
dates = events.select(toTimeString(fun.col("time")).alias("date"))
dates.show(5)

In [None]:
# Same conversion with Spark's built-in from_unixtime, which needs no Python UDF.
date_column = fun.from_unixtime(fun.col("time")).alias("date")
events.select(date_column).show(5)

In [None]:
# Attach a "date" column: "time" cast to a Spark timestamp (replaces any existing "date").
as_timestamp = fun.col("time").cast("timestamp")
events = events.withColumn("date", as_timestamp)
events.first()

### More magic pandas!

In [None]:
# Count events per distinct "date" value in Spark, then hand the aggregated
# result to pandas with the date as the index.
events_per_date = events.groupBy("date").count()
pdf = events_per_date.toPandas().set_index("date")

In [None]:
# Line plot (the default plot kind) of the event count against the date index.
pdf.plot.line()

In [None]:
# Distribution of the per-date event counts.
pdf.plot(kind='hist')

In [None]:
# Box plot of the per-date event counts.
pdf.plot(kind='box')

For more plotting examples see [pandas-docs](http://pandas.pydata.org/pandas-docs/version/0.19.1/visualization.html)

### Spark SQL

In [None]:
# Register the DataFrame under the name "events" so SQL text can refer to it.
events.createOrReplaceTempView("events")

# Firefox-on-desktop events per operating system, busiest OS first.
query = (
    "SELECT os, count(*) as events FROM events "
    "          WHERE browser = 'Firefox' AND deviceType = 'Desktop' "
    "          GROUP BY os ORDER BY events DESC"
)
spark.sql(query).show()