# Batch Processing with Spark

## Spark Configuration
| Setting | Value | Description |
| ---- | ---- | ---- |
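| spark.master | spark://spark-master:7077 | URL of the Spark standalone master the interpreter connects to |
| spark.jars.packages | org.apache.spark:spark-avro_2.11:2.4.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 | Maven coordinates of the Avro and Kafka connector packages added to the Spark classpath |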

In [1]:
%sh

# List the contents of the sensor-traffic directory in HDFS.
sensor_traffic_dir=/data/santander/sensor_traffic/
hdfs dfs -ls "${sensor_traffic_dir}"

In [2]:
%spark

// Build a DataFrame from the sensor-traffic files stored in HDFS.
// The files are read with the Avro data source (not JSON).
val sensorTrafficPath = "hdfs://hdfs-namenode:8020/data/santander/sensor_traffic/"
val sensorTrafficDF = spark.read.format("avro").load(sensorTrafficPath)

// Display the first rows to check that the load worked.
sensorTrafficDF.show()

In [3]:
%pyspark

sensorTrafficDF = spark.read.format("avro").load("hdfs://hdfs-namenode:8020/data/santander/sensor_traffic/")
sensorTrafficDF.show()

In [4]:
%sh

# List the contents of the tweets_turismosdr directory in HDFS.
tweets_dir=/data/santander/tweets_turismosdr
hdfs dfs -ls "${tweets_dir}"


In [5]:
%pyspark

tweetsDF = spark.read.format("avro").load("hdfs://hdfs-namenode:8020/data/santander/tweets_turismosdr/")
tweetsDF.show()


In [6]:
%pyspark

# Print the schema of the tweets DataFrame as a tree so the available columns
# (including nested ones such as user.screen_name, used below) can be inspected.
tweetsDF.printSchema()

In [7]:
%pyspark

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

process_text_udf = udf(lambda text: len(text), IntegerType())

tweetsSummaryDF = tweetsDF.select("id", "created_at", "lang", "text", "user.screen_name") \
    .withColumnRenamed("screen_name", "user_name") \
    .withColumn("length", process_text_udf(tweetsDF['text']))


tweetsSummaryDF.show()


In [8]:
%pyspark

tweetsSummaryDF.createOrReplaceTempView("tweets_summary")

sql_statement = """
SELECT lang, COUNT(*), AVG(length) AS avg_length FROM tweets_summary GROUP BY lang

"""

spark.sql(sql_statement).write.format('jdbc').options(
      url='jdbc:mysql://mysql/big-data-db',
      driver='com.mysql.cj.jdbc.Driver',
      dbtable='tweets_length_by_lang',
      user='big-data-user',
      password='big-data-user').save()

In [9]:
%mysql

-- List the tables in the target database.
SHOW TABLES;
-- Display the per-language aggregate written by the Spark job.
SELECT * FROM tweets_length_by_lang;


```shell
docker exec -it big-data-lab-mysql /usr/bin/mysql -u big-data-user -p
Enter password: 
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 24
Server version: 8.0.22 MySQL Community Server - GPL

Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved.

Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

mysql> use big-data-db
Database changed
mysql> show tables;
+-----------------------+
| Tables_in_big-data-db |
+-----------------------+
| tweets_length_by_lang |
+-----------------------+
1 row in set (0.01 sec)

mysql> select * from tweets_length_by_lang limit 10;
+------+----------+--------------------+
| lang | count(1) | avg_length         |
+------+----------+--------------------+
| en   |     3430 |  90.86297376093295 |
| vi   |        9 | 50.666666666666664 |
| ne   |        4 |              85.75 |
| ps   |        1 |                132 |
| ro   |        2 |              113.5 |
| sl   |        1 |                 29 |
| und  |      761 |  36.81997371879106 |
| ur   |       15 | 118.06666666666666 |
| lv   |        3 |                 23 |
| pl   |        8 |             60.625 |
+------+----------+--------------------+
10 rows in set (0.00 sec)

mysql> 

mysql> 
```
