In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Entry point for the DataFrame API: getOrCreate() hands back the session that
# is already running in this kernel, or starts a new one named 'Basic'.
spark = SparkSession.builder.appName('Basic').getOrCreate()

In [7]:
# Load the JSON file into a DataFrame. No schema is passed here, so the column
# types are inferred from the data.
df = spark.read.json('dataset/people.json')
df.show()  # print the rows as a text table

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
df.printSchema() # Print the schema as a tree: one line per column with its type and nullability

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [11]:
df.columns # Column names of df, returned as a plain Python list of strings

['age', 'name']

In [13]:
df.describe() # Returns a new DataFrame of summary statistics; call .show() on it to see the values

DataFrame[summary: string, age: string, name: string]

In [15]:
df.describe().show() # Summary statistics per column; count skips nulls (age has 2 values, name has 3)

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [19]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [20]:
# Explicit schema for people.json. Each StructField takes a column name, a
# DataType instance, and a flag saying whether the column may hold nulls.
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

# Wrap the field list in a StructType so it can be handed to the reader.
final_struc = StructType(data_schema)




In [22]:
# Re-read the same file, this time forcing the explicit schema instead of
# letting Spark infer the column types.
df = spark.read.schema(final_struc).json('dataset/people.json')


In [23]:
# Check the schema of the re-read DataFrame: age is now integer instead of the inferred long.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [24]:
# Indexing a DataFrame by column name yields a Column object, not a DataFrame.
type(df['age'])

pyspark.sql.column.Column

In [26]:
df.select('age').show() # select() builds a new DataFrame holding only the named column; show() prints it

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [27]:
# Unlike df['age'], select() returns a DataFrame.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [28]:
# head(2) fetches the first two rows; each element of the result is a Row object.
type(df.head(2)[0])

pyspark.sql.types.Row

In [29]:
# Pass a list of names to select several columns at once.
df.select(['age','name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [31]:
df.withColumn('newage',df['age']).show() # Add a column named newage that is a copy of age

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    30|
|  19| Justin|    19|
+----+-------+------+



In [32]:
df.withColumn('newage',df['age']*2).show() # Add a column named newage holding age * 2; a null age stays null

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [34]:
df.withColumnRenamed('age','my_new_age').show() # Show the data with the age column renamed to my_new_age

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [8]:
"""
A SQLContext can be used create DataFrame, register DataFrame as tables, 
execute SQL over tables, 
cache tables, and read parquet files.
"""

'\nA SQLContext can be used create DataFrame, register DataFrame as tables, \nexecute SQL over tables, \ncache tables, and read parquet files.\n'