### Creating DataFrames

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook also runs on machines with a different install location; fall
# back to the original local path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/spark/spark-2.4.5-bin-hadoop2.6/'))

In [2]:
# Import SparkSession; the session itself is created in the next cell.
from pyspark.sql import SparkSession

In [3]:
# Build a session named "Basics", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [5]:
# Load the data from a JSON file (path is relative to the working directory).
# For now, we use the sample dataset taken from Spark's examples.
df = spark.read.json('people.json')

In [6]:
# Print the rows of the DataFrame as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [7]:
# Print the schema: each column's name, data type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
# List the column names as a plain Python list of strings.
df.columns

['age', 'name']

In [9]:
# describe() returns a new DataFrame of summary statistics; it does not print
# them. Call .show() on the result to display the actual values.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [10]:
# When we need to define the schema ourselves,
# pyspark.sql.types provides the building blocks for it.
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [23]:
# List of schema fields, one per column.
# Signature used here: StructField(field_name, field_type, nullable)
data_schema = [
    StructField("age", IntegerType(), True),
    StructField("name", StringType(), True),
]

In [24]:
# Wrap the list of fields in a StructType so it can be passed as a schema.
final_structure = StructType(fields=data_schema)

In [25]:
# Re-read the same file, this time applying the explicit schema.
# Note: this rebinds df, replacing the DataFrame loaded earlier.
df = spark.read.json(
    'people.json',
    schema=final_structure,
)

In [26]:
# Verify the explicit schema took effect: age should now be integer, not long.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



### Grabbing Data

In [27]:
# Indexing with a column name returns a Column object, not the data itself.
# NOTE: the schema column is 'age'; 'Age' still resolves here, presumably
# because Spark's column-name resolution is case-insensitive by default.
df['Age']

Column<b'Age'>

In [28]:
# Check the type returned by df['Age']: it is a Column.
type(df['Age'])

pyspark.sql.column.Column

In [29]:
# select() returns a new DataFrame containing only the requested column.
df.select('age')

DataFrame[age: int]

In [30]:
# Unlike df['Age'], the result of select() is a DataFrame.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [31]:
# Because select() returns a DataFrame, .show() can be chained to print it.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [34]:
df.head(2)  # Returns the first 2 rows as a list of Row objects

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [38]:
# Grab multiple columns by passing a list of column names to select().
df.select(['age', 'name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [39]:
# Create a new column: withColumn(new_name, column_expression).
# The result is a new DataFrame; it is only displayed here, not assigned.
df.withColumn('newage', df['age']).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    30|
|  19| Justin|    19|
+----+-------+------+



In [40]:
# df itself is unchanged: withColumn did not modify it in place.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [43]:
# Rename a column: withColumnRenamed(existing_name, new_name).
# The renamed DataFrame is only displayed here, not assigned back to df.
df.withColumnRenamed('age', 'Age').show()

+----+-------+
| Age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### Using SQL

#### One of Spark's most useful features is that Spark DataFrames let us run SQL queries on them directly

##### To run SQL queries against a DataFrame, we first need to register it as a temporary view

In [44]:
#Register the dataframe as a SQL temporary view
# The view name "people" is what SQL queries use as the table name.
df.createOrReplaceTempView("people")

In [45]:
# spark.sql() runs the query against the registered view; the result is kept
# in sql_results and inspected in the following cells.
sql_results = spark.sql("SELECT * FROM people")

In [47]:
# The query result exposes the same column names as the source DataFrame.
sql_results.columns

['age', 'name']

In [48]:
# Print the rows returned by the SQL query.
sql_results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [50]:
# Filter with a SQL WHERE clause. The row with a null age is excluded as well,
# since it does not satisfy age>20.
spark.sql("SELECT * FROM people WHERE age>20").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

