In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session (or reuse one that is already running) that every
# later cell reads data through.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [3]:
# Bare expression: asks the notebook to display the session object created above.
spark

In [4]:
# Location of the heart dataset. Kept in one place so the path only has to be
# edited once when the notebook is run on another machine.
HEART_CSV_PATH = 'E:/work/datasets/heart.csv'

# Explicit DDL schema: gives Age a real integer type instead of the default
# all-string columns, and limits df to the three columns used below.
schema = 'Age INTEGER, Sex STRING, ChestPainType STRING'

# Single read of the file. The earlier schemaless read was overwritten
# immediately, so it only cost an extra pass over the file header.
# header=True skips the header row; 'NA' cells are loaded as nulls.
df = spark.read.csv(HEART_CSV_PATH, schema=schema, header=True, nullValue='NA')

In [5]:
# Preview the first five rows to sanity-check the load.
df.show(n=5)

+---+---+-------------+
|Age|Sex|ChestPainType|
+---+---+-------------+
| 40|  M|          ATA|
| 49|  F|          NAP|
| 37|  M|          ATA|
| 48|  F|          ASY|
| 54|  M|          NAP|
+---+---+-------------+
only showing top 5 rows



In [6]:
# List the column names that the explicit schema produced.
df.columns

['Age', 'Sex', 'ChestPainType']

In [7]:
# Print each column's name, Spark type and nullability as a tree.
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)



In [8]:
# Confirm that the reader returned a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [9]:
import pyspark.sql.functions as sqlf

In [14]:
# Project a single column, referring to it by its plain string name.
(
    df
    .select("Sex")
    .show(n=1)
)

+---+
|Sex|
+---+
|  M|
+---+
only showing top 1 row



In [18]:
# Same kind of projection, but the column is passed as a Column object
# built with col() instead of a bare string.
(
    df
    .select(sqlf.col("ChestPainType"))
    .show(n=2)
)

+-------------+
|ChestPainType|
+-------------+
|          ATA|
|          NAP|
+-------------+
only showing top 2 rows



In [21]:
# Project several columns at once by name; output order follows argument order.
(
    df
    .select("Age", "ChestPainType")
    .show(n=1)
)

+---+-------------+
|Age|ChestPainType|
+---+-------------+
| 40|          ATA|
+---+-------------+
only showing top 1 row



In [23]:
# Multi-column projection with Column objects, listed in the reverse order
# to show that select() controls the column order of the result.
(
    df
    .select(
        sqlf.col("ChestPainType"),
        sqlf.col("Age"),
    )
    .show(n=2)
)

+-------------+---+
|ChestPainType|Age|
+-------------+---+
|          ATA| 40|
|          NAP| 49|
+-------------+---+
only showing top 2 rows



In [27]:
# expr() parses the SQL snippet "Sex as Gender"; the chained .alias("Sex")
# then renames that result again. The outer alias is the one that ends up on
# the output: the displayed column is headed "Sex", not "Gender".
df.select(sqlf.expr("Sex as Gender").alias("Sex")).show(3)

+---+
|Sex|
+---+
|  M|
|  F|
|  M|
+---+
only showing top 3 rows



In [31]:
# Project Sex twice with SQL expressions: once renamed to Gender, once under
# its original name, so both headings sit side by side.
print_df = df.selectExpr(
    "Sex as Gender",
    "Sex",
)

In [32]:
# Peek at the renamed copy next to the original column.
print_df.show(n=2)

+------+---+
|Gender|Sex|
+------+---+
|     M|  M|
|     F|  F|
+------+---+
only showing top 2 rows



In [34]:
# Schema of the projection: only the two string columns Gender and Sex.
print_df.printSchema()
# Schema of the source frame: still the original three columns, so the
# selectExpr above produced a new DataFrame rather than altering df.
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Sex: string (nullable = true)

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)



In [43]:
# Aggregate expressions inside selectExpr collapse the frame to one summary
# row: the mean age and the number of distinct chest-pain categories.
avg_df = df.selectExpr(
    "avg(Age)",
    "count(distinct(ChestPainType))",
)
avg_df.show()

+------------------+-----------------------------+
|          avg(Age)|count(DISTINCT ChestPainType)|
+------------------+-----------------------------+
|53.510893246187365|                            4|
+------------------+-----------------------------+



In [44]:
# Side-by-side schemas of the three frames built so far.
print_df.printSchema()
df.printSchema()
# The aggregate frame's columns are named after their expressions; the average
# is a double and the distinct count is a non-nullable long.
avg_df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Sex: string (nullable = true)

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)

root
 |-- avg(Age): double (nullable = true)
 |-- count(DISTINCT ChestPainType): long (nullable = false)

