### Spark DataFrame Basics

In [40]:
import os

In [41]:
from pyspark.sql import SparkSession

In [42]:
# Obtain the SparkSession (application name 'Basics') that every later cell uses.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [43]:
# Path to data/people.json, built relative to the current directory (os.curdir)
# so the notebook does not depend on an absolute local path.
data_file = os.path.join(os.curdir, 'data', 'people.json')

In [44]:
# Load the JSON file into a DataFrame. No schema is passed here, so the column
# types are whatever the reader infers (age is reported as long by printSchema below).
data = spark.read.json(data_file)

In [45]:
# Print the rows of the DataFrame as a text table.
data.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [46]:
# Print each column's name, type and nullability.
data.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [47]:
# describe() returns a new DataFrame of summary statistics; evaluating it bare
# only displays that DataFrame's repr (column names/types), not the numbers.
data.describe()

DataFrame[summary: string, age: string, name: string]

In [48]:
# Call show() on the summary DataFrame to actually print the statistics.
# In the recorded output the age count (2) excludes the row whose age is null.
data.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [49]:
from pyspark.sql.types import (
    StructField, StringType,
    IntegerType, StructType
)

In [50]:
# Explicit column definitions used to build the read schema below.
# NOTE: the name 'data_shema' (sic, "schema") is kept unchanged because the
# following cell reads it.
data_shema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [51]:
# Wrap the list of StructField definitions in a StructType, the object that is
# passed to the reader as its schema.
final_structure = StructType(data_shema)

In [52]:
# Re-read the same JSON file, this time applying the explicit schema instead of
# letting the reader infer the column types.
df = spark.read.schema(final_structure).json(data_file)

In [53]:
# Confirm the explicit schema took effect: age is now reported as integer.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [54]:
# select() returns a new DataFrame containing only the 'age' column
# (the repr in the next cell is DataFrame[age: int]).
age_col = df.select('age')

In [55]:
# Bare expression: displays the repr of the selection, showing it is a DataFrame.
age_col

DataFrame[age: int]

In [56]:
# Print the values of the single-column selection.
age_col.show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [57]:
# Several columns can be selected at once; the result is again a DataFrame.
df.select('age', 'name')

DataFrame[age: int, name: string]

In [58]:
# Add a derived column: 'newage' is twice 'age'. The result is only displayed,
# not assigned, so df keeps its original two columns in later cells.
(
    df
    .withColumn('newage', df['age'] * 2)
    .show()
)

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [60]:
# Rename a column. This returns a new DataFrame with 'age' renamed to 'new_age';
# the result is not assigned here, so df still exposes 'age' in the cells below.
df.withColumnRenamed('age', 'new_age')

DataFrame[new_age: int, name: string]

In [61]:
# Register df as a temporary view named 'people' so that it can be queried
# with SQL through spark.sql().
df.createOrReplaceTempView('people')

In [62]:
# Run a SQL query against the 'people' view; LIMIT 1 keeps only the first row.
result = spark.sql("SELECT * from people limit 1")

In [63]:
# Print the single row returned by the SQL query.
result.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
+----+-------+



In [64]:
# Path to the Apple stock CSV under data/, relative to the current directory.
apple_stock_file = os.path.join(os.curdir, 'data', 'appl_stock.csv')

In [65]:
# Read the CSV with its header row used for column names and with column types
# inferred from the data (see the schema printed in the next cell).
apple_stock_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(apple_stock_file)
)

In [68]:
# Inspect the inferred schema; in the recorded output the price columns are
# double, Volume is integer and Date is left as a string.
apple_stock_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [71]:
# head(3) returns the first three rows as a list of Row objects rather than
# printing a formatted table the way show() does.
apple_stock_df.head(3)

[Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039),
 Row(Date='2010-01-05', Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002),
 Row(Date='2010-01-06', Open=214.379993, High=215.23, Low=210.750004, Close=210.969995, Volume=138040000, Adj Close=27.333178000000004)]

In [82]:
# Applying a filter: keep rows matching a SQL-style condition string, then
# select two columns. (Example currently disabled.)

# apple_stock_df.filter("Close < 500").select(
#     ['Open', 'Close']).show()

In [None]:
# Another way of applying filter

