### Przygotowanie danych

```bash
mkdir data
cd data
curl -L -o donation.zip http://bit.ly/1Aoywaq
unzip donation.zip
unzip 'block_*.zip'
```

In [None]:
from pyspark.sql import SparkSession
# Obtain a Spark session: the builder hands back an existing one or creates it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Build a DataFrame from every block CSV file; no reader options are set here.
prev = spark.read.format("csv").load("data/block*.csv")
prev

In [None]:
# Print the first two rows only.
prev.show(n=2)

# Print again with show()'s default row limit.
prev.show()

In [None]:
# Additional reader options: header row, null-value marker ("?") and
# schema inference.
parsed = (
    spark.read
    .option("header", "true")
    .option("nullValue", "?")
    .option("inferSchema", "true")
    .csv("data/block*.csv")
)

In [None]:
# Preview five rows of the parsed data.
parsed.show(n=5)

# Print the column names and types of the parsed DataFrame.
parsed.printSchema()

## inne formaty 

- parquet
- orc
- json
- jdbc
- avro
- text
- image
- libsvm
- binary
- xml

In [None]:
# Persist the parsed DataFrame as Parquet. "overwrite" replaces output left by
# an earlier run: the writer's default save mode raises an error when the
# target path already exists, so re-running this cell would otherwise fail.
parsed.write.format("parquet").mode("overwrite").save("data/block2.parquet")

In [None]:
# Load the Parquet data written to data/block2.parquet back into a DataFrame.
t = spark.read.parquet("data/block2.parquet")

In [None]:
# Print the first two rows read from the Parquet file.
t.show(n=2)

## schematy danych 

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# Programmatic schema: one nullable StructField per (column name, type) pair.
schema = StructType(
    [
        StructField(column, dtype, nullable=True)
        for column, dtype in (
            ("Date", StringType()),
            ("Open", DoubleType()),
            ("High", DoubleType()),
            ("Low", DoubleType()),
            ("Close", DoubleType()),
            ("Volume", IntegerType()),
            ("Name", StringType()),
        )
    ]
)


# The same column layout expressed as a DDL string. The column names given
# here become the DataFrame's column names when this string is passed as a
# reader schema, so they must be spelled correctly ("Volume", not "Voulme").
# NOTE(review): numeric columns are FLOAT here but DoubleType in the
# StructType variant of this schema; confirm which precision is intended.
ddlSchemaStr = """Date STRING, Open FLOAT, High FLOAT, 
Low FLOAT, Close FLOAT, Volume INT, Name String 
"""

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session (an existing one is reused by getOrCreate).
spark = SparkSession.builder.getOrCreate()

# Read the AAPL price CSV with the header option enabled, applying the
# DDL schema string instead of inferring column types.
df = (
    spark.read
    .option("header", True)
    .csv("AAPL_2006-01-01_to_2018-01-01.csv", schema=ddlSchemaStr)
)

df.show(n=5)

## dane nieustrukturyzowane

%%file test.json

{
 "id": "0001",
 "type": "donut",
 "name": "Cake",
 "ppu": 0.55,
 "batters":
  {
   "batter":
    [
     { "id": "1001", "type": "Regular" },
     { "id": "1002", "type": "Chocolate" },
     { "id": "1003", "type": "Blueberry" }
    ]
  },
 "topping":
  [
   { "id": "5001", "type": "None" },
   { "id": "5002", "type": "Glazed" },
   { "id": "5005", "type": "Sugar" },
   { "id": "5007", "type": "Powdered Sugar" },
   { "id": "5006", "type": "Chocolate with Sprinkles" },
   { "id": "5003", "type": "Chocolate" },
   { "id": "5004", "type": "Maple" }
  ]
}

In [None]:
# test.json holds one JSON document spread over many lines, hence multiLine.
rawDFjson = spark.read.option("multiLine", "true").json("test.json")

In [None]:
# Print the schema of the DataFrame loaded from test.json.
rawDFjson.printSchema()

In [None]:
# Rename the top-level "id" column to "key"; the nested batter/topping
# structs have "id" fields of their own.
sampleDF = rawDFjson.withColumnRenamed(existing="id", new="key")

In [None]:
# Keep the record key together with the nested batters.batter field.
batDF = sampleDF.select(
    "key",
    "batters.batter",
)
batDF.printSchema()
# Show a single row without truncating long cell values.
batDF.show(n=1, truncate=False)

In [None]:
from pyspark.sql.functions import explode
# explode() emits one output row per element of the "batter" array.
bat2DF = batDF.select(
    "key",
    explode("batter").alias("new_batter"),
)
bat2DF.show()

In [None]:
# Print the schema of the exploded batter DataFrame.
bat2DF.printSchema()

In [None]:
# "new_batter.*" expands the struct's fields into top-level columns.
(
    bat2DF
    .select("key", "new_batter.*")
    .show()
)

In [None]:
# Batter pipeline in one chain: explode the nested array, expand the struct
# into columns, then rename those columns with a bat_ prefix.
finalBatDF = (
    sampleDF
    .select("key", explode("batters.batter").alias("new_batter"))
    .select("key", "new_batter.*")
    .withColumnRenamed("id", "bat_id")
    .withColumnRenamed("type", "bat_type")
)
finalBatDF.show()

In [None]:
# Same flattening for the "topping" array: explode, expand the struct,
# then rename the resulting columns with a top_ prefix.
topDF = (
    sampleDF
    .select("key", explode("topping").alias("new_topping"))
    .select("key", "new_topping.*")
    .withColumnRenamed("id", "top_id")
    .withColumnRenamed("type", "top_type")
)
# Show up to ten rows without truncating long values.
topDF.show(n=10, truncate=False)