Build a PySpark Session

In [1]:
from pyspark.sql import SparkSession 

In [2]:
## Get the existing SparkSession, or create one with the app name 'SparkSession1'
spark = (
    SparkSession.builder
    .appName('SparkSession1')
    .getOrCreate()
)

23/05/25 00:48:16 WARN Utils: Your hostname, codespaces-6fb472 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
23/05/25 00:48:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/25 00:48:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
## getActiveSession is a classmethod, so call it on the class itself
SparkSession.getActiveSession()

Reading a Dataset into a DataFrame

In [4]:
## Read the CSV with default options, only telling Spark that the first row is a header
## Without inferSchema every column is loaded as a string (see the schema printed below)
df1 = spark.read.options(header=True).csv('../insurance.csv')

                                                                                

In [5]:
## Check the schema: with default read options every column comes back as string
df1.printSchema()

root
 |-- age: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- children: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: string (nullable = true)



In [6]:
## Read the same file again, this time asking Spark to infer column types from the data
df2 = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('../insurance.csv')
)

In [7]:
## With inferSchema the numeric columns are typed as integer/double instead of string
df2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [8]:
## Pass header and inferSchema together as keyword arguments of csv()
df3 = spark.read.csv(
    '../insurance.csv',
    header=True,
    inferSchema=True,
)

In [9]:
## Schema of df3: column names come from the header row, types are inferred
df3.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [10]:
## The reader returns a pyspark.sql DataFrame
type(df3)

pyspark.sql.dataframe.DataFrame

Note: A DataFrame is a data structure on which you can perform various kinds of operations.

In [11]:
## List the column names as plain Python strings
df3.columns

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [12]:
## First 3 rows, returned as a list of Row objects
df3.head(3)

[Row(age=19, sex='female', bmi=27.9, children=0, smoker='yes', region='southwest', charges=16884.924),
 Row(age=18, sex='male', bmi=33.77, children=1, smoker='no', region='southeast', charges=1725.5523),
 Row(age=28, sex='male', bmi=33.0, children=3, smoker='no', region='southeast', charges=4449.462)]

In [13]:
## Last row, also returned as a list of Row objects
df3.tail(1)

[Row(age=61, sex='female', bmi=29.07, children=0, smoker='yes', region='northwest', charges=29141.3603)]

In [14]:
## Select a single column by name; show() prints only the first 20 rows
df3.select('age').show()

+---+
|age|
+---+
| 19|
| 18|
| 28|
| 33|
| 32|
| 31|
| 46|
| 37|
| 37|
| 60|
| 25|
| 62|
| 23|
| 56|
| 27|
| 19|
| 52|
| 23|
| 56|
| 30|
+---+
only showing top 20 rows



In [15]:
## Select several columns by passing their names as separate arguments
df3.select('age', 'sex').show()

+---+------+
|age|   sex|
+---+------+
| 19|female|
| 18|  male|
| 28|  male|
| 33|  male|
| 32|  male|
| 31|female|
| 46|female|
| 37|female|
| 37|  male|
| 60|female|
| 25|  male|
| 62|female|
| 23|  male|
| 56|female|
| 27|  male|
| 19|  male|
| 52|female|
| 23|  male|
| 56|  male|
| 30|  male|
+---+------+
only showing top 20 rows



In [16]:
## Indexing the DataFrame with column names gives the same result as select()
df3['age', 'sex'].show()

+---+------+
|age|   sex|
+---+------+
| 19|female|
| 18|  male|
| 28|  male|
| 33|  male|
| 32|  male|
| 31|female|
| 46|female|
| 37|female|
| 37|  male|
| 60|female|
| 25|  male|
| 62|female|
| 23|  male|
| 56|female|
| 27|  male|
| 19|  male|
| 52|female|
| 23|  male|
| 56|  male|
| 30|  male|
+---+------+
only showing top 20 rows



Check Data Types

In [17]:
## (column name, type name) pairs for every column
df3.dtypes

[('age', 'int'),
 ('sex', 'string'),
 ('bmi', 'double'),
 ('children', 'int'),
 ('smoker', 'string'),
 ('region', 'string'),
 ('charges', 'double')]

Describe function

In [18]:
## describe() returns another DataFrame (all of its columns are strings); it does not print the statistics
df3.describe()

DataFrame[summary: string, age: string, sex: string, bmi: string, children: string, smoker: string, region: string, charges: string]

In [19]:
## Call show() on the summary DataFrame to print count, mean, stddev, min and max per column
df3.describe().show()

23/05/25 00:49:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 10:>                                                         (0 + 1) / 1]

+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|summary|               age|   sex|               bmi|         children|smoker|   region|           charges|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|  count|              1338|  1338|              1338|             1338|  1338|     1338|              1338|
|   mean| 39.20702541106129|  null|30.663396860986538|  1.0949177877429|  null|     null|13270.422265141257|
| stddev|14.049960379216147|  null| 6.098186911679012|1.205492739781914|  null|     null|12110.011236693992|
|    min|                18|female|             15.96|                0|    no|northeast|         1121.8739|
|    max|                64|  male|             53.13|                5|   yes|southwest|       63770.42801|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+



                                                                                

Manipulating Columns in a DataFrame

In [27]:
## Add a column: withColumn returns a new DataFrame (df4) holding age + 5 in the extra column
df4 = df3.withColumn('age in 5 year', df3.age + 5)

In [25]:
## df3 still has only its original seven columns
df3.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [26]:
## df4 carries the extra 'age in 5 year' column
df4.show()

+---+------+------+--------+------+---------+-----------+-------------+
|age|   sex|   bmi|children|smoker|   region|    charges|age in 5 year|
+---+------+------+--------+------+---------+-----------+-------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|           24|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|           23|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|           33|
| 33|  male|22.705|       0|    no|northwest|21984.47061|           38|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|           37|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|           36|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|           51|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|           42|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|           42|
| 60|female| 25.84|       0|    no|northwest|28923.13692|           65|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|       

In [28]:
## Drop Column

In [31]:
## drop() takes the column name and returns a new DataFrame without that column
df5 = df4.drop('age in 5 year')

In [32]:
## df5 is back to the original seven columns
df5.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [33]:
## Rename a column: 'children' becomes 'dependent'
## The renamed DataFrame is only displayed here, it is not assigned to a variable
(
    df5
    .withColumnRenamed('children', 'dependent')
    .show()
)

+---+------+------+---------+------+---------+-----------+
|age|   sex|   bmi|dependent|smoker|   region|    charges|
+---+------+------+---------+------+---------+-----------+
| 19|female|  27.9|        0|   yes|southwest|  16884.924|
| 18|  male| 33.77|        1|    no|southeast|  1725.5523|
| 28|  male|  33.0|        3|    no|southeast|   4449.462|
| 33|  male|22.705|        0|    no|northwest|21984.47061|
| 32|  male| 28.88|        0|    no|northwest|  3866.8552|
| 31|female| 25.74|        0|    no|southeast|  3756.6216|
| 46|female| 33.44|        1|    no|southeast|  8240.5896|
| 37|female| 27.74|        3|    no|northwest|  7281.5056|
| 37|  male| 29.83|        2|    no|northeast|  6406.4107|
| 60|female| 25.84|        0|    no|northwest|28923.13692|
| 25|  male| 26.22|        0|    no|northeast|  2721.3208|
| 62|female| 26.29|        0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|        0|    no|southwest|   1826.843|
| 56|female| 39.82|        0|    no|southeast| 11090.717