In [5]:
import os

import pyspark
from pyspark.sql.types import IntegerType, FloatType
import pyspark.sql.functions as sf

# Obtain a Spark session against the 'local' master; getOrCreate() reuses
# an already-running session if the kernel has one.
sparkql = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .getOrCreate()
)

# Location of the input CSV, relative to the notebook's working directory.
data_dir = './data'
coffee_data = 'coffee.csv'

# Explicit DDL schema: the columns are typed on read rather than inferred.
schema = 'Date date, Open float, High float, Low float, Close float, Volume float, Currency string'
df = sparkql.read.csv(
    os.path.join(data_dir, coffee_data),
    schema=schema,
    header=True,  # the first line of the file holds the column names
)

# Quick sanity check of what was loaded: column names, types, first rows.
print(df.columns)

df.printSchema()
df.show(3)

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Currency']
root
 |-- Date: date (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: float (nullable = true)
 |-- Currency: string (nullable = true)

+----------+------+-----+------+------+------+--------+
|      Date|  Open| High|   Low| Close|Volume|Currency|
+----------+------+-----+------+------+------+--------+
|2000-01-03|122.25|124.0| 116.1| 116.5|6640.0|     USD|
|2000-01-04|116.25|120.5|115.75|116.25|5492.0|     USD|
|2000-01-05| 115.0|121.0| 115.0| 118.6|6165.0|     USD|
+----------+------+-----+------+------+------+--------+
only showing top 3 rows



In [11]:
# Append four derived columns in one fluent chain. The order of the calls is
# the order in which the new columns appear in the frame.
df = (
    df
    # Signed move between the opening and closing price.
    .withColumn('diff_open_close', sf.col('Open') - sf.col('Close'))
    # Spread between the high and the low price.
    .withColumn('diff_high_low', sf.col('High') - sf.col('Low'))
    # Boolean flag: is the traded volume above 100?
    .withColumn('vol_greater_100', sf.col('Volume') > 100)
    # Magnitude of the open/close move; reads the column created above, so it
    # has to come after 'diff_open_close' in the chain.
    .withColumn('abs_val_open_close', sf.abs(sf.col('diff_open_close')))
)

df.show(10)

+----------+------+------+------+------+-------+--------+---------------+-------------+---------------+------------------+
|      Date|  Open|  High|   Low| Close| Volume|Currency|diff_open_close|diff_high_low|vol_greater_100|abs_val_open_close|
+----------+------+------+------+------+-------+--------+---------------+-------------+---------------+------------------+
|2000-01-03|122.25| 124.0| 116.1| 116.5| 6640.0|     USD|           5.75|    7.9000015|           true|              5.75|
|2000-01-04|116.25| 120.5|115.75|116.25| 5492.0|     USD|            0.0|         4.75|           true|               0.0|
|2000-01-05| 115.0| 121.0| 115.0| 118.6| 6165.0|     USD|     -3.5999985|          6.0|           true|         3.5999985|
|2000-01-06| 119.0| 121.4| 116.5|116.85| 5094.0|     USD|      2.1500015|    4.9000015|           true|         2.1500015|
|2000-01-07|117.25|117.75| 113.8|114.15| 6855.0|     USD|      3.0999985|     3.949997|           true|         3.0999985|
|2000-01-10| 123