## 1) Spark Python Shell

In [None]:
# Shell command (not Python): start the interactive PySpark shell against a
# local master with 2 worker threads. The cells in this section are meant to
# be typed into that shell, which provides the `spark` session they use.
pyspark --master local[2]

### 1.1) Reading csv file

In [None]:
# Load the CSV from HDFS: the first row is treated as the header and column
# types are inferred from the data. `spark` is the session provided by the
# PySpark shell.
matches = (
    spark.read.format('csv')
    .options(header='true', inferschema='true')
    .load('hdfs://localhost:8020/user/vagrant/test2/spark/matches.csv')
)

# Every following cell refers to this frame as `df`. Without this binding,
# `df.show()` (and all later cells) raise NameError in a fresh session.
df = matches
df.show()

### 1.2) Filtering, grouping, joining data

In [None]:
# Split the data into one frame per day; later cells keep using the names
# df1 (rows dated 2015-03-01) and df2 (rows dated 2015-03-02).
df1 = df.filter(df.Date == '2015-03-01 00:00:00')
df2 = df.filter(df.Date == '2015-03-02 00:00:00')

# count() is an action, so each of these lines runs a Spark job.
df1_rows = df1.count()
print("df1 count = " + str(df1_rows))
df2_rows = df2.count()
print("df2 count = " + str(df2_rows))

In [None]:
# Per TimePeriod, compute the total Flow and the highest AverageSpeed for each
# day. The resulting columns are named 'sum(Flow)' and 'max(AverageSpeed)'.
# NOTE: df1/df2 are rebound to their aggregates, so this cell cannot be re-run
# without first re-running the filter cell (the raw columns are gone).
group_key = 'TimePeriod'
metrics = {'Flow': 'sum', 'AverageSpeed': 'max'}

df1 = df1.groupBy(group_key).agg(metrics)
df2 = df2.groupBy(group_key).agg(metrics)

In [None]:
# Pair up the two days' aggregates by time period (inner join) ...
same_period = df1['TimePeriod'] == df2['TimePeriod']

# ... and keep the periods where the first day's peak average speed exceeds
# the second day's by more than 25%.
much_faster_on_first_day = (
    df1['max(AverageSpeed)'] > df2['max(AverageSpeed)']*1.25
)

df1.join(df2, same_period).filter(much_faster_on_first_day).show()

## 2) Spark Python script

### 2.1) Load a text file on HDFS and do a word count

In [None]:
import sys
from pyspark import SparkContext, SparkConf

def split_words(line):
    """Split one line of text into words.

    Splits on any run of whitespace, so consecutive spaces and blank lines do
    not yield empty-string "words" (which ``line.split(" ")`` would, and which
    would then be counted like real words).

    Args:
        line: A single line of the input file.

    Returns:
        list[str]: The words on the line; an empty list for a blank line.
    """
    return line.split()


if __name__ == "__main__":
    # Create the Spark context for this job.
    conf = SparkConf().setAppName("Word Count")
    sc = SparkContext(conf=conf)

    try:
        # Read the file and split each line into words. The data lives on
        # HDFS (same namenode as the other examples in this notebook), so the
        # URI uses the hdfs:// scheme rather than file://, which would look
        # for /user/vagrant/... on the local filesystem.
        words = sc.textFile(
            "hdfs://localhost:8020/user/vagrant/data/data.csv"
        ).flatMap(split_words)

        # Count the occurrences of each word, ordered by ascending count.
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
            lambda a, b: a + b).sortBy(lambda pair: pair[1])
        wordCounts.saveAsTextFile(
            "hdfs://localhost:8020/user/vagrant/data/output.cnt")
        print("completed successfully")
    finally:
        # Release the context even if the job fails part-way through.
        sc.stop()

In [None]:
# Shell command (not Python): submit wordcount.py as a Spark job on a local
# master with 2 worker threads, then validate the results stored in HDFS.
# Presumably wordcount.py is the script from the previous cell saved to disk
# in the current directory -- confirm.
spark-submit --master local[2] wordcount.py

### 2.2) Configuring a logger to record program progress

In [None]:
import sys
import logging
from pyspark.sql import SparkSession

# Logging configuration: route INFO-and-above records from the root logger to
# stdout, each prefixed with timestamp, level and originating line number.
formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger = logging.getLogger()
# `logging.INFO`, not `logging.IFO`: the misspelt constant does not exist and
# raised AttributeError before any logging could happen.
logger.setLevel(logging.INFO)
logger.addHandler(handler)

def main():
    """Run the demo job: load the road-data CSV, preview it and aggregate it.

    Reads the CSV from HDFS (header row, inferred column types), shows it,
    then prints the average ``Flow`` per ``LinkRef`` for rows whose
    ``TimePeriod`` is between 10 and 12 -- once through the DataFrame API and
    once through Spark SQL on a temporary view. Progress is reported through
    the module-level ``logger``.

    Returns:
        None
    """
    # Start (or reuse) the Spark session.
    spark = SparkSession.builder.appName("SparkDemo").getOrCreate()
    try:
        # Restrict Spark's own log output to errors.
        spark.sparkContext.setLogLevel("ERROR")
        logger.info("Starting spark application")
        logger.info("Reading CSV File")

        df = spark.read.option("header", "true").option(
            "inferschema", "true"
        ).csv("hdfs://localhost:8020/user/vagrant/data/data.csv")
        logger.info("previewing csv file")
        df.show()

        logger.info("Data Aggregation")
        # agg() takes a dict mapping column name -> aggregate function name;
        # the previous set literal {"Flow", "avg"} is not a valid argument.
        df.filter(df["TimePeriod"].between(10, 12)).groupBy(
            "LinkRef").agg({"Flow": "avg"}).show()
        df.createOrReplaceTempView("RoadData")

        # Same aggregation expressed in SQL. The clause is two words,
        # GROUP BY; "GROUPBY" is a syntax error.
        spark.sql(
            "SELECT LinkRef, AVG(Flow) FROM RoadData "
            "WHERE TimePeriod BETWEEN 10 AND 12 "
            "GROUP BY LinkRef"
        ).show()
        logger.info("Ending spark application")
    finally:
        # Always release the session, even if a step above fails.
        spark.stop()

if __name__ == "__main__":
    # Run the job, then end the interpreter explicitly with the default
    # (success) exit status.
    main()
    raise SystemExit()

### 2.3) Using findspark in Jupyter

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Get a session for this notebook (an already-running one is reused) and
# restrict Spark's own log output to errors.
spark = (
    SparkSession.builder
    .appName("Jupyter Demo")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Read the CSV from HDFS with a header row and inferred column types, then
# preview it.
# NOTE(review): this path has no data/ directory, unlike the
# /user/vagrant/data/data.csv used in section 2.2 -- confirm it is intended.
df = (
    spark.read
    .option("header", "true")
    .option("inferschema", "true")
    .csv("hdfs://localhost:8020/user/vagrant/data.csv")
)
df.show()