In [4]:
import os
import sys
# Point this kernel at the local Spark installation and expose the directory
# that holds the bundled PySpark / py4j archives.
os.environ['SPARK_HOME'] = '/usr/lib/spark'
os.environ['PYLIB'] = os.environ['SPARK_HOME'] + '/python/lib'

# Put the archives at the front of sys.path (py4j first, then pyspark) so the
# `pyspark` import in the next cell resolves against this installation.
for position, archive in enumerate(('/py4j-0.10.7-src.zip', '/pyspark.zip')):
    sys.path.insert(position, os.environ['PYLIB'] + archive)

In [5]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [6]:
# Build (or reuse) a Hive-enabled SparkSession whose driver and executors can
# load the MySQL JDBC connector.
mysql_connector_jar = '/usr/share/java/mysql-connector-java.jar'

spark = (
    SparkSession.builder
    .appName('SparkJDBC')
    # The warehouse location key read by Spark SQL is 'spark.sql.warehouse.dir'.
    .config('spark.sql.warehouse.dir', '/user/hive/warehouse')
    # The connector jar must be on both the driver and the executor classpath.
    .config('spark.driver.extraClassPath', mysql_connector_jar)
    .config('spark.executor.extraClassPath', mysql_connector_jar)
    .enableHiveSupport()
    .getOrCreate()
)

In [7]:
# With the connector classpath options set on the builder, the session is
# available as `spark`; display its version as a quick sanity check.
spark.version

'2.3.0'

In [8]:
# Keep a handle on the SparkContext that backs this session.
sc = spark.sparkContext

In [9]:
# JDBC connection settings for the local MySQL `testdb` database.
# Credentials may be supplied through the SPARK_JDBC_USER / SPARK_JDBC_PASSWORD
# environment variables; the fallbacks keep the notebook's original values so
# existing runs behave the same.
prop = {
    'user': os.environ.get('SPARK_JDBC_USER', 'root'),
    'password': os.environ.get('SPARK_JDBC_PASSWORD', 'abcd'),
    'driver': 'com.mysql.jdbc.Driver',
}
url = 'jdbc:mysql://localhost:3306/testdb'

# Plain read of `simptable` with no partitioning options. The user, password
# and driver come from `prop` so they are defined in exactly one place.
mrdf = (
    spark.read.format('jdbc')
    .option('url', url)
    .option('dbtable', 'simptable')
    .options(**prop)
    .load()
)
mrdf.show()

+--------+--------+---+
|   fname|   lname|age|
+--------+--------+---+
|    anil|  kapoor| 54|
|  sanjay|    dutt| 56|
|   arjun|  kappor| 28|
|  ranbir|  kappor| 31|
|  deepka|padukone| 29|
|shahrukh|    khan| 51|
| hrithik|  roshan| 37|
+--------+--------+---+



In [10]:
# Print the schema (column names, types, nullability) of the DataFrame loaded over JDBC.
mrdf.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)



In [11]:
# Partition count of the RDD behind `mrdf`; the read that produced it set no
# partitioning options.
mrdf.rdd.getNumPartitions()

1

In [12]:
# Count the rows for each distinct last name.
mrdf.groupBy('lname').count().show()

+--------+-----+
|   lname|count|
+--------+-----+
|padukone|    1|
|  roshan|    1|
|  kappor|    2|
|    khan|    1|
|  kapoor|    1|
|    dutt|    1|
+--------+-----+



In [13]:
# Partitioned JDBC read of `simptable`, splitting on the `age` column into 4
# partitions. Per the Spark JDBC data source documentation, lowerBound and
# upperBound only decide the partition stride; they do not filter rows.
# Connection credentials and driver are taken from `prop`, like the
# `spark.read.jdbc(..., properties=prop)` reads elsewhere in this notebook,
# instead of being repeated as literals.
mrdf_age_lim = (
    spark.read.format('jdbc')
    .option('url', url)
    .option('dbtable', 'simptable')
    .options(**prop)
    .options(partitionColumn='age', lowerBound=0, upperBound=60, numPartitions=4)
    .load()
)
mrdf_age_lim.show()

+--------+--------+---+
|   fname|   lname|age|
+--------+--------+---+
|   arjun|  kappor| 28|
|  deepka|padukone| 29|
|  ranbir|  kappor| 31|
| hrithik|  roshan| 37|
|    anil|  kapoor| 54|
|  sanjay|    dutt| 56|
|shahrukh|    khan| 51|
+--------+--------+---+



In [14]:
# Check how many partitions the partitioned read produced (numPartitions was set to 4).
mrdf_age_lim.rdd.getNumPartitions()

4

In [15]:
# A parenthesised SELECT with an alias is passed where a table name is
# expected, so the column projection and the age filter are part of the SQL
# text sent over JDBC.
actor_query = "(select fname, lname from simptable where age between  30 and 50) pdtbl"
push_down_df = spark.read.jdbc(
    url,
    actor_query,
    properties=prop,
)
push_down_df.show()

+-------+------+
|  fname| lname|
+-------+------+
| ranbir|kappor|
|hrithik|roshan|
+-------+------+



In [26]:
# Inspect the physical plan: the subquery text itself is the JDBC relation and
# PushedFilters is empty, because no Spark-side filter was applied to this DataFrame.
push_down_df.explain()

== Physical Plan ==
*(1) Scan JDBCRelation((select fname, lname from simptable where age between  30 and 50) pdtbl) [numPartitions=1] [fname#108,lname#109] PushedFilters: [], ReadSchema: struct<fname:string,lname:string>


In [16]:
# Read the whole table by name and express the age filter on the Spark side
# with .where(); the plan is inspected in the next cell.
df_pushdown = (
    spark.read
    .jdbc(url=url, table="simptable", properties=prop)
    .where('age between 30 and 50')
)

In [17]:
# Physical plan for the filtered read; the age predicates from .where() are
# listed under PushedFilters in the JDBC scan.
df_pushdown.explain()

== Physical Plan ==
*(1) Scan JDBCRelation(simptable) [numPartitions=1] [fname#63,lname#64,age#65] PushedFilters: [*IsNotNull(age), *GreaterThanOrEqual(age,30), *LessThanOrEqual(age,50)], ReadSchema: struct<fname:string,lname:string,age:int>


In [18]:
# Display the rows that satisfy the age filter.
df_pushdown.show()

+-------+------+---+
|  fname| lname|age|
+-------+------+---+
| ranbir|kappor| 31|
|hrithik|roshan| 37|
+-------+------+---+



In [38]:
# Row count of `nstable` before the append in the next cell.
# Uses the shared `prop` credentials (the same ones the append uses against
# this table) rather than a separately typed password, so the notebook
# connects with one consistent set of credentials on a fresh run.
(
    spark.read.format('jdbc')
    .option('url', url)
    .option('dbtable', 'nstable')
    .options(**prop)
    .load()
    .count()
)

126

In [39]:
# Append the rows of `mrdf` to `nstable` over JDBC.
# NOTE: mode='append' is not idempotent: every run of this cell adds the same rows again.
mrdf.write.jdbc(url=url, mode='append', table='nstable', properties=prop)