# Table of Contents

## Input / Reading Data

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better

Sample data can be obtained from [here](https://jacobceles.github.io/knowledge_repo/colab_and_pyspark/cars.csv).

#### Read csv file by infering or guessing schema

In [2]:
df = spark.read.csv('data/cars.csv', header=True, sep=";")

In [3]:
df.show(5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



In [4]:
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- Cylinders: string (nullable = true)
 |-- Displacement: string (nullable = true)
 |-- Horsepower: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Acceleration: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Origin: string (nullable = true)



#### Explicitly defining schema

In [5]:
from pyspark.sql.types import *

In [6]:
schema = StructType([
    StructField("Car", StringType(),True),
    StructField("MPG", DoubleType(),True),
    StructField("Cylinders", IntegerType(),True),
    StructField("Displacement", DoubleType(), True),
    StructField("Horsepower", DoubleType(), True),
    StructField("Weight", DoubleType(), True),
    StructField("Acceleration", IntegerType(), True),
    StructField("Model", IntegerType(), True),
    StructField("Origin", StringType(), True)
  ])

In [7]:
df = spark.read.csv('data/cars.csv', header=True, sep=";", schema=schema)

In [8]:
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Acceleration: integer (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [9]:
df.show(5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        null|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        null|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        null|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        null|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        null|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



## Data Transformations

#### Split String into Array Tokens and Make New Columns From Them

In [10]:
from pyspark.sql.functions import split, col

In [11]:
df = df.withColumn(
    "Make", split(col("Car"), " ")
    .getItem(0)
).withColumn(
    "Model", split(col("Car"), " ")
    .getItem(1)
)

In [12]:
df.show(5, truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+---------+------+---------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model    |Origin|Make     |
+-------------------------+----+---------+------------+----------+------+------------+---------+------+---------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504.0|null        |Chevelle |US    |Chevrolet|
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693.0|null        |Skylark  |US    |Buick    |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436.0|null        |Satellite|US    |Plymouth |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433.0|null        |Rebel    |US    |AMC      |
|Ford Torino              |17.0|8        |302.0       |140.0     |3449.0|null        |Torino   |US    |Ford     |
+-------------------------+----+---------+------------+----------+------+------------+--

#### Order by Single Column

In [13]:
df.orderBy('MPG', ascending=False).show(10, truncate=False)

+-------------------------------+----+---------+------------+----------+------+------------+------+------+----------+
|Car                            |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model |Origin|Make      |
+-------------------------------+----+---------+------------+----------+------+------------+------+------+----------+
|Mazda GLC                      |46.6|4        |86.0        |65.0      |2110.0|null        |GLC   |Japan |Mazda     |
|Honda Civic 1500 gl            |44.6|4        |91.0        |67.0      |1850.0|null        |Civic |Japan |Honda     |
|Volkswagen Rabbit C (Diesel)   |44.3|4        |90.0        |48.0      |2085.0|null        |Rabbit|Europe|Volkswagen|
|Volkswagen Pickup              |44.0|4        |97.0        |52.0      |2130.0|null        |Pickup|Europe|Volkswagen|
|Volkswagen Dasher (diesel)     |43.4|4        |90.0        |48.0      |2335.0|null        |Dasher|Europe|Volkswagen|
|Volkswagen Rabbit Custom Diesel|43.1|4        |90.0    

#### Order By Multiple Columns

In [14]:
df.orderBy(['MPG','Displacement'], ascending=[False, True]).show(10, truncate=False)

+-------------------------------+----+---------+------------+----------+------+------------+------+------+----------+
|Car                            |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model |Origin|Make      |
+-------------------------------+----+---------+------------+----------+------+------------+------+------+----------+
|Mazda GLC                      |46.6|4        |86.0        |65.0      |2110.0|null        |GLC   |Japan |Mazda     |
|Honda Civic 1500 gl            |44.6|4        |91.0        |67.0      |1850.0|null        |Civic |Japan |Honda     |
|Volkswagen Rabbit C (Diesel)   |44.3|4        |90.0        |48.0      |2085.0|null        |Rabbit|Europe|Volkswagen|
|Volkswagen Pickup              |44.0|4        |97.0        |52.0      |2130.0|null        |Pickup|Europe|Volkswagen|
|Volkswagen Dasher (diesel)     |43.4|4        |90.0        |48.0      |2335.0|null        |Dasher|Europe|Volkswagen|
|Volkswagen Rabbit Custom Diesel|43.1|4        |90.0    

#### Re-Arrange Columns using `select()`

In [15]:
df.select('Make', 'Model', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Origin')

Make,Model,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin
Chevrolet,Chevelle,18.0,8,307.0,130.0,3504.0,,US
Buick,Skylark,15.0,8,350.0,165.0,3693.0,,US
Plymouth,Satellite,18.0,8,318.0,150.0,3436.0,,US
AMC,Rebel,16.0,8,304.0,150.0,3433.0,,US
Ford,Torino,17.0,8,302.0,140.0,3449.0,,US
Ford,Galaxie,15.0,8,429.0,198.0,4341.0,,US
Chevrolet,Impala,14.0,8,454.0,220.0,4354.0,,US
Plymouth,Fury,14.0,8,440.0,215.0,4312.0,,US
Pontiac,Catalina,14.0,8,455.0,225.0,4425.0,,US
AMC,Ambassador,15.0,8,390.0,190.0,3850.0,,US


#### Using UDF to Create New Column

In [16]:
data = [
    ("bacon", 4.0),
    ("pulled pork", 3.0),
    ("bacon", 12.0),
    ("pastrami", 6.0),
    ("corned beef", 7.5),
    ("bacon", 8.0),
    ("pastrami", 3.0),
    ("honey ham", 5.0),
    ("nova lox", 6.0),
  ]

In [17]:
schema = StructType([
    StructField("Food", StringType(),True),
    StructField("Ounces", DoubleType(),True),
  ])

In [18]:
df_food = spark.createDataFrame(data=data,schema=schema)

In [19]:
df_food.show()

+-----------+------+
|       Food|Ounces|
+-----------+------+
|      bacon|   4.0|
|pulled pork|   3.0|
|      bacon|  12.0|
|   pastrami|   6.0|
|corned beef|   7.5|
|      bacon|   8.0|
|   pastrami|   3.0|
|  honey ham|   5.0|
|   nova lox|   6.0|
+-----------+------+



In [20]:
from pyspark.sql.functions import udf

In [21]:
def food2animal(column):
    if column == 'bacon':
        return 'pig'
    elif column == 'pulled pork':
        return 'pig'
    elif column == 'pastrami':
        return 'cow'
    elif column == 'corned beef':
        return 'cow'
    elif column == 'honey ham':
        return 'pig'
    else:
        return 'salmon'

In [22]:
food2animal_udf = udf(food2animal, StringType())

In [23]:
df_food_with_animal = df_food.withColumn("animal", food2animal_udf("Food"))

In [24]:
df_food_with_animal.show()

+-----------+------+------+
|       Food|Ounces|animal|
+-----------+------+------+
|      bacon|   4.0|   pig|
|pulled pork|   3.0|   pig|
|      bacon|  12.0|   pig|
|   pastrami|   6.0|   cow|
|corned beef|   7.5|   cow|
|      bacon|   8.0|   pig|
|   pastrami|   3.0|   cow|
|  honey ham|   5.0|   pig|
|   nova lox|   6.0|salmon|
+-----------+------+------+



#### Using `when` to duplicate IF-ELSE Logic To Create New Column

In [25]:
from pyspark.sql.functions import when

In [26]:
data = [
    ("bacon", 4.0),
    ("pulled pork", 3.0),
    ("bacon", 12.0),
    ("pastrami", 6.0),
    ("corned beef", 7.5),
    ("bacon", 8.0),
    ("pastrami", 3.0),
    ("honey ham", 5.0),
    ("nova lox", 6.0),
  ]

schema = StructType([
    StructField("Food", StringType(),True),
    StructField("Ounces", DoubleType(),True),
  ])

df_food = spark.createDataFrame(data=data,schema=schema)

In [27]:
df_food.show()

+-----------+------+
|       Food|Ounces|
+-----------+------+
|      bacon|   4.0|
|pulled pork|   3.0|
|      bacon|  12.0|
|   pastrami|   6.0|
|corned beef|   7.5|
|      bacon|   8.0|
|   pastrami|   3.0|
|  honey ham|   5.0|
|   nova lox|   6.0|
+-----------+------+



In [28]:
df_food.withColumn(
    'Animal',
    when(col("Food") == 'bacon', 'pork')
    .when(col("Food") == 'pulled pork', 'pork')
    .when(col("Food") == 'pastrami', 'cow')
    .when(col("Food") == 'corned beef', 'cow')
    .when(col("Food") == 'honey ham', 'pig')
    .otherwise('salmon')
)

Food,Ounces,Animal
bacon,4.0,pork
pulled pork,3.0,pork
bacon,12.0,pork
pastrami,6.0,cow
corned beef,7.5,cow
bacon,8.0,pork
pastrami,3.0,cow
honey ham,5.0,pig
nova lox,6.0,salmon


#### Using sql to create new column

In [29]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)

In [30]:
data = [
    ("bacon", 4.0),
    ("pulled pork", 3.0),
    ("bacon", 12.0),
    ("pastrami", 6.0),
    ("corned beef", 7.5),
    ("bacon", 8.0),
    ("pastrami", 3.0),
    ("honey ham", 5.0),
    ("nova lox", 6.0),
  ]

schema = StructType([
    StructField("Food", StringType(),True),
    StructField("Ounces", DoubleType(),True),
  ])

df_food = spark.createDataFrame(data=data,schema=schema)

In [31]:
df_food.registerTempTable('food_table')
newDF = sqlContext.sql(
    '''
    select *,
    case
    when Food = 'bacon' then 'pig'
    when Food = 'pulled pork' then 'pig'
    when Food = 'pastrami' then 'cow'
    when Food = 'corned beef' then 'cow'
    when Food = 'honey ham' then 'pig'
    else 'salmon' end as Animal from food_table
    '''
)
newDF.show()

+-----------+------+------+
|       Food|Ounces|Animal|
+-----------+------+------+
|      bacon|   4.0|   pig|
|pulled pork|   3.0|   pig|
|      bacon|  12.0|   pig|
|   pastrami|   6.0|   cow|
|corned beef|   7.5|   cow|
|      bacon|   8.0|   pig|
|   pastrami|   3.0|   cow|
|  honey ham|   5.0|   pig|
|   nova lox|   6.0|salmon|
+-----------+------+------+



## Data Summarizations

In [32]:
df.show(truncate=False)

+--------------------------------+----+---------+------------+----------+------+------------+----------+------+---------+
|Car                             |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model     |Origin|Make     |
+--------------------------------+----+---------+------------+----------+------+------------+----------+------+---------+
|Chevrolet Chevelle Malibu       |18.0|8        |307.0       |130.0     |3504.0|null        |Chevelle  |US    |Chevrolet|
|Buick Skylark 320               |15.0|8        |350.0       |165.0     |3693.0|null        |Skylark   |US    |Buick    |
|Plymouth Satellite              |18.0|8        |318.0       |150.0     |3436.0|null        |Satellite |US    |Plymouth |
|AMC Rebel SST                   |16.0|8        |304.0       |150.0     |3433.0|null        |Rebel     |US    |AMC      |
|Ford Torino                     |17.0|8        |302.0       |140.0     |3449.0|null        |Torino    |US    |Ford     |
|Ford Galaxie 500       

#### Count of rows

In [33]:
df.count()

406

#### Counts by Groups within a Single Column

In [34]:
df.groupBy('Origin').count().withColumnRenamed('count', 'Count')

Origin,Count
Europe,73
US,254
Japan,79


#### Aggregations

In [35]:
from pyspark.sql.functions import mean 

In [36]:
df.groupBy(
    "Origin"
).agg(
    mean('MPG')
).show()

+------+------------------+
|Origin|          avg(MPG)|
+------+------------------+
|Europe|26.745205479452057|
|    US|19.688188976377948|
| Japan|30.450632911392397|
+------+------------------+



## Joining / Merging

In [37]:
df_counts = df.groupBy('Origin').count().withColumnRenamed('count', 'Count')

In [38]:
df_avgs = df.groupBy(
    "Origin"
).agg(
    mean('MPG')
)

In [39]:
df_counts.show()

+------+-----+
|Origin|Count|
+------+-----+
|Europe|   73|
|    US|  254|
| Japan|   79|
+------+-----+



In [40]:
df_avgs = df_avgs.withColumnRenamed('avg(MPG)', 'Avg')

In [41]:
df_avgs.show()

+------+------------------+
|Origin|               Avg|
+------+------------------+
|Europe|26.745205479452057|
|    US|19.688188976377948|
| Japan|30.450632911392397|
+------+------------------+



In [42]:
df_counts.join(df_avgs, df_counts.Origin == df_avgs.Origin, 'inner').select(df_counts.Origin, df_counts.Count, df_avgs.Avg)

Origin,Count,Avg
Europe,73,26.745205479452057
US,254,19.688188976377948
Japan,79,30.4506329113924


As you can see, we have done an inner join between two dataframes. The following joins are supported by PySpark:

- inner (default)
- cross
- outer
- full
- full_outer
- left
- left_outer
- right
- right_outer
- left_semi
- left_anti

## Filtering

In [43]:
df.show(truncate=False)

+--------------------------------+----+---------+------------+----------+------+------------+----------+------+---------+
|Car                             |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model     |Origin|Make     |
+--------------------------------+----+---------+------------+----------+------+------------+----------+------+---------+
|Chevrolet Chevelle Malibu       |18.0|8        |307.0       |130.0     |3504.0|null        |Chevelle  |US    |Chevrolet|
|Buick Skylark 320               |15.0|8        |350.0       |165.0     |3693.0|null        |Skylark   |US    |Buick    |
|Plymouth Satellite              |18.0|8        |318.0       |150.0     |3436.0|null        |Satellite |US    |Plymouth |
|AMC Rebel SST                   |16.0|8        |304.0       |150.0     |3433.0|null        |Rebel     |US    |AMC      |
|Ford Torino                     |17.0|8        |302.0       |140.0     |3449.0|null        |Torino    |US    |Ford     |
|Ford Galaxie 500       

In [44]:
df.filter(col('Make')=='Chevrolet').show(5)

+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|   Model|Origin|     Make|
+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        null|Chevelle|    US|Chevrolet|
|    Chevrolet Impala|14.0|        8|       454.0|     220.0|4354.0|        null|  Impala|    US|Chevrolet|
|Chevrolet Chevell...| 0.0|        8|       350.0|     165.0|4142.0|        null|Chevelle|    US|Chevrolet|
|Chevrolet Monte C...|15.0|        8|       400.0|     150.0|3761.0|        null|   Monte|    US|Chevrolet|
| Chevrolet Vega 2300|28.0|        4|       140.0|      90.0|2264.0|        null|    Vega|    US|Chevrolet|
+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
only showing top 5 rows



In [45]:
df.filter(col('Make').contains('Chev')).show(5)

+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|   Model|Origin|     Make|
+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        null|Chevelle|    US|Chevrolet|
|    Chevrolet Impala|14.0|        8|       454.0|     220.0|4354.0|        null|  Impala|    US|Chevrolet|
|Chevrolet Chevell...| 0.0|        8|       350.0|     165.0|4142.0|        null|Chevelle|    US|Chevrolet|
|Chevrolet Monte C...|15.0|        8|       400.0|     150.0|3761.0|        null|   Monte|    US|Chevrolet|
|           Chevy C20|10.0|        8|       307.0|     200.0|4376.0|        null|     C20|    US|    Chevy|
+--------------------+----+---------+------------+----------+------+------------+--------+------+---------+
only showing top 5 rows



In [46]:
df.filter(
    (col('Make').contains('Chev')) &
    (col('Cylinders') < 8)
).show()

+--------------------+----+---------+------------+----------+------+------------+--------+------+----------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|   Model|Origin|      Make|
+--------------------+----+---------+------------+----------+------+------------+--------+------+----------+
| Chevrolet Vega 2300|28.0|        4|       140.0|      90.0|2264.0|        null|    Vega|    US| Chevrolet|
|Chevrolet Chevell...|17.0|        6|       250.0|     100.0|3329.0|        null|Chevelle|    US| Chevrolet|
| Chevrolet Vega (sw)|22.0|        4|       140.0|      72.0|2408.0|        null|    Vega|    US| Chevrolet|
|      Chevrolet Vega|20.0|        4|       140.0|      90.0|2408.0|        null|    Vega|    US| Chevrolet|
|Chevrolet Nova Cu...|16.0|        6|       250.0|     100.0|3278.0|        null|    Nova|    US| Chevrolet|
|      Chevrolet Vega|21.0|        4|       140.0|      72.0|2401.0|        null|    Vega|    US| Chevrolet|
|      Chevrolet No

In [47]:
spark.stop()

## Connecting to Relational Databases (JDBC)

[Link](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/from_to_dbms.html) to their documentation

#### PostgreSQL

In [1]:
import configparser
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("local[*]").appName("Postgres")\
    .config("spark.jars", "C:\\Users\\some_user\\drivers\\jdbc\\postgresql\\postgresql-42.2.23.jar")\
    .getOrCreate()

In [3]:
config_file = os.getenv("CONFIG_PATH")

In [4]:
config = configparser.ConfigParser()
try:
    config.read(config_file)
except ConfigFileNotFound:
    print("config.ini file not found")

In [5]:
# Read in the Postgresql database credentials for DSN-less connection
PG_HOST = config["sc_health_postgres"]["HOST"]
PG_PORT = config["sc_health_postgres"]["PORT"]
PG_DB = config["sc_health_postgres"]["DB"]
PG_USER = config["sc_health_postgres"]["USER"]
PG_PWD = config["sc_health_postgres"]["PWD"]

In [6]:
url = f'jdbc:postgresql://{PG_HOST}:{PG_PORT}/{PG_DB}'
driver = 'org.postgresql.Driver'

In [7]:
query = "SELECT CURRENT_DATE"

In [8]:
jdbcDF = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", PG_USER) \
    .option("password", PG_PWD) \
    .option("query", query) \
    .load()

In [9]:
jdbcDF.show()

+------------+
|current_date|
+------------+
|  2021-08-22|
+------------+



In [10]:
spark.stop()

#### IBM DB2 LUW

In [11]:
spark = SparkSession.builder.master("local[*]").appName("SCOODS")\
    .config("spark.jars", "C:\\Users\\some_user\\drivers\\jdbc\\mainframe\\db2jcc.jar")\
    .getOrCreate()

In [12]:
config_file = os.getenv("CONFIG_PATH")

In [13]:
config = configparser.ConfigParser()
try:
    config.read(config_file)
except ConfigFileNotFound:
    print("config.ini file not found")

In [14]:
# Read in the Postgresql database credentials for DSN-less connection
SCOODS_HOST = config["scoods_prod"]["HOST"]
SCOODS_PORT = config["scoods_prod"]["PORT"]
SCOODS_DB = config["scoods_prod"]["DB"]
SCOODS_USER = config["scoods_prod"]["USER"]
SCOODS_PWD = config["scoods_prod"]["PWD"]

In [15]:
url = f'jdbc:db2://{SCOODS_HOST}:{SCOODS_PORT}/{SCOODS_DB}:useJDBC4ColumnNameAndLabelSemantics=false;'
driver = 'com.ibm.db2.jcc.DB2Driver'

In [16]:
query = "SELECT CURRENT TIMESTAMP as DATETIME_NOW FROM SYSIBM.SYSDUMMY1"

In [17]:
jdbcDF = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", SCOODS_USER) \
    .option("password", SCOODS_PWD) \
    .option("query", query) \
    .load()

In [18]:
jdbcDF.show(truncate=False)

+--------------------------+
|DATETIME_NOW              |
+--------------------------+
|2021-08-21 21:03:38.029453|
+--------------------------+



In [19]:
spark.stop()

#### Microsoft SQL Server

In [20]:
spark = SparkSession.builder.master("local[*]").appName("NAPS")\
    .config("spark.jars", "C:\\Users\\some_user\\drivers\\mssql_jdbc\\mssql-jdbc-9.4.0.jre8.jar")\
    .getOrCreate()

In [21]:
config_file = os.getenv("CONFIG_PATH")

In [22]:
config = configparser.ConfigParser()
try:
    config.read(config_file)
except ConfigFileNotFound:
    print("config.ini file not found")

In [23]:
# Read in the Postgresql database credentials for DSN-less connection
NAPS_HOST = config["naps"]["HOST"]
NAPS_PORT = config["naps"]["PORT"]
NAPS_DB = config["naps"]["DB"]

In [24]:
url = f'jdbc:sqlserver://{NAPS_HOST}:{NAPS_PORT};databaseName={NAPS_DB};integratedSecurity=true'
driver = 'com.microsoft.sqlserver.jdbc.SQLServerDriver'

In [25]:
query = "SELECT * from DimCarrier"

In [26]:
jdbcDF = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("query", query) \
    .load()

In [27]:
jdbcDF.show(truncate=False)

+---------+-------------------------+---------------------+-----------------------+-----------------------+-------+
|CarrierSk|CarrierNm                |TransportationModeTxt|UpdateTmstmp           |InsertTmstmp           |BatchId|
+---------+-------------------------+---------------------+-----------------------+-----------------------+-------+
|0        |FEDEX LTL                |LTL                  |2017-02-10 11:07:01.253|2017-02-10 11:07:01.253|1521   |
|1        |M                        |TL                   |2017-02-10 11:07:01.253|2017-02-10 11:07:01.253|1521   |
|2        |Amcan                    |TL                   |2017-02-10 11:07:01.253|2017-02-10 11:07:01.253|1521   |
|3        |WGBL                     |TL                   |2017-02-10 11:07:01.253|2017-02-10 11:07:01.253|1521   |
|4        |No                       |LTL                  |2017-02-10 11:07:01.253|2017-02-10 11:07:01.253|1521   |
|5        |Honda - Midwest Logistics|LTL                  |2017-02-10 11

In [28]:
spark.stop()

#### IBM DB2 z/OS

Unfortunately, the IBM Java JRE does not work with Spark 3.x and thus, we will not be able to connect to mainframe DB2 z/OS platform.

#### Connecting to Snowflake

[Link](https://docs.snowflake.com/en/user-guide/spark-connector-install.html) to Snowflake's documentation on working with the PySpark connector

Currently have PySpark 3.2.0 installed, but Snowflake does not yet support 3.2.0.  Therefore, the code below has not yet been tested and is currently being used as a placeholder.

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [None]:
spark = SparkSession.builder.master("local[*]").appName("Snowflake_JDBC")\
    .config("spark.jars", "C:\Path_To_Snowflake_JDBC.jar")\
    .getOrCreate()

In [None]:
# Snowflake connection parameters
sfparams = {
  "sfURL" : "<account_identifier>.snowflakecomputing.com",
  "sfUser" : "<user_name>",
  "sfPassword" : "<password>",
  "sfDatabase" : "<database>",
  "sfSchema" : "<schema>",
  "sfWarehouse" : "<warehouse>"
}

In [None]:
#read full table
df = spark.read.format(“snowflake”) \
  .options(**sfparams) \
  .option("dbtable",  "Employee") \
  .load()

#run custom query
df = spark.read.format(“snowflake”) \
  .options(**sfparams) \
  .option("query",  "SELECT * FROM Employee") \
  .load()