In [1]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Inject a CSS rule so elements with the `.container` class use the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Data Ingestion From External Sources - Spark
* Generic Format
* Special Format - Need Drivers
    * Avro
    * S3
* Relational Database
    * Postgres
    * MySQL
* Non-Relational Database
    * Cassandra

In [None]:
import os

from pyspark.sql import SparkSession

# Python interpreter used for both the PySpark workers and the driver.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Connector packages resolved via --packages. The PostgreSQL JDBC driver is
# included because the Postgres cells below load `org.postgresql.Driver`.
SPARK_PACKAGES = [
    "org.apache.hadoop:hadoop-aws:2.7.1",
    "com.datastax.spark:spark-cassandra-connector_2.11:2.3.0",
    "mysql:mysql-connector-java:8.0.15",
    "org.postgresql:postgresql:42.1.1",
]

# Set before getOrCreate() so the arguments are in the environment when the
# session is started.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages " + ",".join(SPARK_PACKAGES) + " pyspark-shell"
)

spark = (
    SparkSession.builder
    .appName("AWS external sources spark")
    .getOrCreate()
)

In [None]:
# Echo the session object created above so the notebook renders it as the cell output.
spark

# Generic Format - Don't need drivers

* csv
* json
* parquet
* libsvm
* text

#### Read

In [None]:
# Template, not runnable as written: replace <format> with one of the formats
# listed above and <file name> with the path to read.
spark.read.<format>("<file name>")

#### Write

In [None]:
spark.write.<format>("<file name>")

# Special Formats - Need Drivers

You can include the following packages using **--packages**

|Source| Driver Package|
|-----------|----------------|
|S3        |org.apache.hadoop:hadoop-aws:2.7.1|
|Avro       |org.apache.spark:spark-avro_2.11:2.4.0|

# S3



#### Read



In [None]:
# Template, not runnable as written: fill in <format>, <bucket name> and
# <file name>. The path uses the s3a:// scheme.
df = spark.read.<format>("s3a://<bucket name>/<file name>")

#### Write

In [None]:
# Template, not runnable as written: fill in <format>, <bucket name> and
# <file name>. mode="overwrite" is passed so existing output at that path is replaced.
df.write.<format>("s3a://<bucket name>/<file name>", mode="overwrite")

# Relational Databases

|Source| Driver Package|Driver Name|Standard Port|
|-----------|----------------|---------|----|
|Postgres   |org.postgresql:postgresql:42.1.1|org.postgresql.Driver|5432|
|MySQL      |mysql:mysql-connector-java:8.0.15|com.mysql.cj.jdbc.Driver|3306|

#### Generic Read

In [None]:
# Generic JDBC read template: substitute every <...> placeholder before running.
(
    spark.read
    .format("jdbc")
    .option("driver", "<driver name>")
    .option("url", "jdbc:<database type>://<ip>:<port>/<dbname>")
    .option("dbtable", "<table>")
    .option("user", "<username>")
    .option("password", "<password>")
    .load()
)

#### Generic Write

In [None]:
# Generic JDBC write template: substitute every <...> placeholder before running.
# The URL placeholders match the generic read template
# (<database type>, <dbname>).
(
    df.write
    .format("jdbc")
    .option("driver", "<driver name>")
    .option("url", "jdbc:<database type>://<ip>:<port>/<dbname>")
    .option("dbtable", "<table name>")
    .option("user", "<username>")
    .option("password", "<password>")
    .mode("overwrite")  # replaces the target table's existing contents
    .save()
)

## Postgres

#### Read

In [None]:
# Postgres read over JDBC on port 5432: fill in <ip>, <dbname>, <table>
# and the credentials before running.
(
    spark.read
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", "jdbc:postgresql://<ip>:5432/<dbname>")
    .option("dbtable", "<table>")
    .option("user", "<username>")
    .option("password", "<password>")
    .load()
)

#### Write

In [None]:
# Postgres write over JDBC. Credentials are placeholders on purpose: do not
# commit a real username/password in the notebook; fill them in at run time.
(
    df.write
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", "jdbc:postgresql://localhost:5432/spark_demo_db")
    .option("dbtable", "my_table")
    .option("user", "<username>")
    .option("password", "<password>")
    .mode("overwrite")  # replaces the target table's existing contents
    .save()
)

## MySQL

#### Read

In [None]:
# MySQL read over JDBC on port 3306: fill in <ip>, <dbname>, <table name>
# and the credentials before running.
# The driver class is the Connector/J 8.x name (`com.mysql.cj.jdbc.Driver`);
# `com.mysql.jdbc.Driver` is deprecated in that connector line.
(
    spark.read
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", "jdbc:mysql://<ip>:3306/<dbname>")
    .option("dbtable", "<table name>")
    .option("user", "<username>")  # no leading space: it would become part of the username
    .option("password", "<password>")
    .load()
)

#### Write

In [None]:
# MySQL write over JDBC on port 3306: fill in <ip>, <dbname>, <table name>
# and the credentials before running.
# The driver class is the Connector/J 8.x name (`com.mysql.cj.jdbc.Driver`);
# `com.mysql.jdbc.Driver` is deprecated in that connector line.
(
    df.write
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", "jdbc:mysql://<ip>:3306/<dbname>")
    .option("dbtable", "<table name>")
    .option("user", "<username>")  # no leading space: it would become part of the username
    .option("password", "<password>")
    .mode("overwrite")  # replaces the target table's existing contents
    .save()
)

# NoSQL Databases

|Source| Driver Package|Format Name|Standard Port|
|-----------|----------------|---------|----|
|Cassandra  |com.datastax.spark:spark-cassandra-connector_2.11:2.3.0|org.apache.spark.sql.cassandra|9042|
|DynamoDB   |com.amazon.emr:emr-dynamodb-hadoop:4.2.0| | |



## Cassandra

#### Read

In [None]:
# Cassandra read: fill in <ip>, <port>, <keyspace name> and <table name>.
# The chain is wrapped in parentheses so every step, including .load(), is
# part of one expression without needing line-continuation backslashes.
(
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("spark.cassandra.connection.host", "<ip>")
    .option("spark.cassandra.connection.port", "<port>")
    .option("keyspace", "<keyspace name>")
    .option("table", "<table name>")
    .load()
)

#### Write

In [None]:
# Cassandra write: fill in <ip>, <port>, <keyspace name> and <table name>.
# Writing goes through a DataFrame (`df.write`); the SparkSession object has
# no `.write` attribute. The chain is wrapped in parentheses so .save() is
# part of the same expression.
(
    df.write
    .format("org.apache.spark.sql.cassandra")
    .option("spark.cassandra.connection.host", "<ip>")
    .option("spark.cassandra.connection.port", "<port>")
    .option("keyspace", "<keyspace name>")
    .option("table", "<table name>")
    .save()
)

## Dynamo DB (TODO)

#### Read

#### Write