In [19]:
# a SparkSession object can perform the most common data processing tasks
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('test').getOrCreate() # will return existing session if one was created before and was not closed

In [3]:
spark

In [27]:
# read csv, all columns will be of type string
df=spark.read.option('header','true').csv('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv')
df.show()



+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|      140|        289|        0|    Normal|  172|             N|      0|      Up|           0|
| 49|  F|          NAP|      160|        180|        0|    Normal|  156|             N|      1|    Flat|           1|
| 37|  M|          ATA|      130|        283|        0|        ST|   98|             N|      0|      Up|           0|
| 48|  F|          ASY|      138|        214|        0|    Normal|  108|             Y|    1.5|    Flat|           1|
| 54|  M|          NAP|      150|        195|        0|    Normal|  122|             N|      0|      Up|           0|
| 39|  M|          NAP|      120|        339|        0| 

In [28]:
#the type of the columns
schema='Age INTEGER,Sex STRING,ChestPainType STRING'
df=spark.read.csv('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv',schema=schema,header=True)
df.show()


+---+---+-------------+
|Age|Sex|ChestPainType|
+---+---+-------------+
| 40|  M|          ATA|
| 49|  F|          NAP|
| 37|  M|          ATA|
| 48|  F|          ASY|
| 54|  M|          NAP|
| 39|  M|          NAP|
| 45|  F|          ATA|
| 54|  M|          ATA|
| 37|  M|          ASY|
| 48|  F|          ATA|
| 37|  F|          NAP|
| 58|  M|          ATA|
| 39|  M|          ATA|
| 49|  M|          ASY|
| 42|  F|          NAP|
| 54|  F|          ATA|
| 38|  M|          ASY|
| 43|  F|          ATA|
| 60|  M|          ASY|
| 36|  M|          ATA|
+---+---+-------------+
only showing top 20 rows



In [32]:
# let PySpark infer the schema
df=spark.read.csv('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv',inferSchema=True,header=True)
df.show()


+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|      140|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
| 49|  F|          NAP|      160|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
| 37|  M|          ATA|      130|        283|        0|        ST|   98|             N|    0.0|      Up|           0|
| 48|  F|          ASY|      138|        214|        0|    Normal|  108|             Y|    1.5|    Flat|           1|
| 54|  M|          NAP|      150|        195|        0|    Normal|  122|             N|    0.0|      Up|           0|
| 39|  M|          NAP|      120|        339|        0| 

In [31]:
# replace nulls with other value at reading time
#df=spark.read.csv('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv',nullValue='NA')
#df.show()
# save data
#df.write.format('csv').save('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv')

# if you want to overwrite the file
#df.write.format('csv').mode('overwrite').save('C:/Users/sanedunu/OneDrive - Capgemini/Desktop/Work/pyspark/new_notebook/heart.csv')

In [33]:
# show head of table
df.show(3)

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|      140|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
| 49|  F|          NAP|      160|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
| 37|  M|          ATA|      130|        283|        0|        ST|   98|             N|    0.0|      Up|           0|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
only showing top 3 rows



In [34]:
# count number of rows
df.count()

918

In [36]:
# show parts of the table
df.select('Age').show(3)
df.select(['Age','Sex']).show(3)

+---+
|Age|
+---+
| 40|
| 49|
| 37|
+---+
only showing top 3 rows

+---+---+
|Age|Sex|
+---+---+
| 40|  M|
| 49|  F|
| 37|  M|
+---+---+
only showing top 3 rows



In [None]:
#collecting

In [39]:
df.cache()
df.collect()

[Row(Age=40, Sex='M', ChestPainType='ATA', RestingBP=140, Cholesterol=289, FastingBS=0, RestingECG='Normal', MaxHR=172, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=49, Sex='F', ChestPainType='NAP', RestingBP=160, Cholesterol=180, FastingBS=0, RestingECG='Normal', MaxHR=156, ExerciseAngina='N', Oldpeak=1.0, ST_Slope='Flat', HeartDisease=1),
 Row(Age=37, Sex='M', ChestPainType='ATA', RestingBP=130, Cholesterol=283, FastingBS=0, RestingECG='ST', MaxHR=98, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=48, Sex='F', ChestPainType='ASY', RestingBP=138, Cholesterol=214, FastingBS=0, RestingECG='Normal', MaxHR=108, ExerciseAngina='Y', Oldpeak=1.5, ST_Slope='Flat', HeartDisease=1),
 Row(Age=54, Sex='M', ChestPainType='NAP', RestingBP=150, Cholesterol=195, FastingBS=0, RestingECG='Normal', MaxHR=122, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=39, Sex='M', ChestPainType='NAP', RestingBP=120, Cholesterol=339, F

In [41]:
# convert PySpark DataFrame to Pandas DataFrame
df_pd=df.toPandas()
# convert it back
spark_df=spark.createDataFrame(df_pd)

In [42]:
# show first three rows as three row objects, which is how spark represents single rows from a table.
df.head(3)

[Row(Age=40, Sex='M', ChestPainType='ATA', RestingBP=140, Cholesterol=289, FastingBS=0, RestingECG='Normal', MaxHR=172, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=49, Sex='F', ChestPainType='NAP', RestingBP=160, Cholesterol=180, FastingBS=0, RestingECG='Normal', MaxHR=156, ExerciseAngina='N', Oldpeak=1.0, ST_Slope='Flat', HeartDisease=1),
 Row(Age=37, Sex='M', ChestPainType='ATA', RestingBP=130, Cholesterol=283, FastingBS=0, RestingECG='ST', MaxHR=98, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0)]

In [43]:
# type os columns
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)
 |-- RestingBP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: string (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: string (nullable = true)
 |-- Oldpeak: double (nullable = true)
 |-- ST_Slope: string (nullable = true)
 |-- HeartDisease: integer (nullable = true)



In [45]:
# column dtypes as list of tuples
df.dtypes

[('Age', 'int'),
 ('Sex', 'string'),
 ('ChestPainType', 'string'),
 ('RestingBP', 'int'),
 ('Cholesterol', 'int'),
 ('FastingBS', 'int'),
 ('RestingECG', 'string'),
 ('MaxHR', 'int'),
 ('ExerciseAngina', 'string'),
 ('Oldpeak', 'double'),
 ('ST_Slope', 'string'),
 ('HeartDisease', 'int')]

In [48]:
# cast a column from one type to other
from pyspark.sql.types import FloatType
df=df.withColumn('Age',df.Age.cast(FloatType()))
df=df.withColumn('RestingBP',df.Age.cast(FloatType()))

In [50]:
# compute summery statistics
df.select(['Age','RestingBP']).describe().show()

+-------+------------------+------------------+
|summary|               Age|         RestingBP|
+-------+------------------+------------------+
|  count|               918|               918|
|   mean|53.510893246187365|53.510893246187365|
| stddev|  9.43261650673202|  9.43261650673202|
|    min|              28.0|              28.0|
|    max|              77.0|              77.0|
+-------+------------------+------------------+



In [51]:
# add a new column or replace existing one
AgeFixed=df['Age']+1 #select alwayes returns a DataFrame object, and we need a column object
df=df.withColumn('AgeFixed',AgeFixed)

In [52]:
df.select(['AgeFixed','Age']).describe().show()

+-------+------------------+------------------+
|summary|          AgeFixed|               Age|
+-------+------------------+------------------+
|  count|               918|               918|
|   mean|54.510893246187365|53.510893246187365|
| stddev|  9.43261650673202|  9.43261650673202|
|    min|              29.0|              28.0|
|    max|              78.0|              77.0|
+-------+------------------+------------------+



In [54]:
# remove columns
df.drop('AgeFixed').show(2) # add df = to get the new DataFrame into a variable

+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|40.0|  M|          ATA|     40.0|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
|49.0|  F|          NAP|     49.0|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
only showing top 2 rows



In [61]:
# rename a column
df.withColumnRenamed('Age','age').select('age').show(2)
# to rename more than a single column, i would suggest a loop.
name_pairs=[('Age','age'),('Sex','sex')]
for old_name,new_name in name_pairs:
    df=df.withColumnRenamed(old_name,new_name)
df.show(3)

+----+
| age|
+----+
|40.0|
|49.0|
+----+
only showing top 2 rows

+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
| age|sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|AgeFixed|
+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
|40.0|  M|          ATA|     40.0|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|    41.0|
|49.0|  F|          NAP|     49.0|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|    50.0|
|37.0|  M|          ATA|     37.0|        283|        0|        ST|   98|             N|    0.0|      Up|           0|    38.0|
+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
only showing top 3 rows



In [64]:
df.select(['age','sex']).show(3)

+----+---+
| age|sex|
+----+---+
|40.0|  M|
|49.0|  F|
|37.0|  M|
+----+---+
only showing top 3 rows



In [74]:
# drop all rows that contain any NA
df=df.na.drop()
df.count()
# drop all rows where all values are NA
df=df.na.drop(how='all')
df.count()
# drop all rows where more at least 2 values are NOT NA
df=df.na.drop(thresh=2)
df.count()
# drop all rows where any value at specific columns are NAs.
df=df.na.drop(how='any',subset=['age','sex']) # 'any' is the defult
df.count()

918

In [78]:
# fill missing values in a specific column with a '?'
df=df.na.fill(value='?',subset=['sex'])
# replace NAs with mean of column
from pyspark.ml.feature import Imputer # In statistics, imputation is the process of replacing missing data with substituted values
imptr=Imputer(inputCols=['age','RestingBP'],
             outputCols=['age','RestingBP']).setStrategy('mean') # can also be 'median' 
df=imptr.fit(df).transform(df)

In [80]:
# filter to adults only and calculate mean
df.filter('age>18')
df.where('age > 18')# 'where' is an alias to 'filter'
df.where(df['age']>18)# third option
# add another condition ('&' means and, '|' means or)
df.where((df['age']>18) | (df['ChestPainType'] == 'ATA'))
# take every record where the 'ChestPainType' is NOT 'ATA'
df.filter(~(df['ChestPainType'] == 'ATA'))

DataFrame[age: float, sex: string, ChestPainType: string, RestingBP: float, Cholesterol: int, FastingBS: int, RestingECG: string, MaxHR: int, ExerciseAngina: string, Oldpeak: double, ST_Slope: string, HeartDisease: int, AgeFixed: float]

In [82]:
df.filter('age>18').show(5)

+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
| age|sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|AgeFixed|
+----+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
|40.0|  M|          ATA|     40.0|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|    41.0|
|49.0|  F|          NAP|     49.0|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|    50.0|
|37.0|  M|          ATA|     37.0|        283|        0|        ST|   98|             N|    0.0|      Up|           0|    38.0|
|48.0|  F|          ASY|     48.0|        214|        0|    Normal|  108|             Y|    1.5|    Flat|           1|    49.0|
|54.0|  M|          NAP|     54.0|        195|        0|    Normal|  122|             N|    0.0|      Up

In [84]:
# evaluate a string expression into command
from pyspark.sql.functions import expr
exp = 'age + 0.2 * AgeFixed'
df.withColumn('new_col', expr(exp)).select('new_col').show(3)

+-------+
|new_col|
+-------+
|   48.2|
|   59.0|
|   44.6|
+-------+
only showing top 3 rows



In [85]:
# group by age
disease_by_age = df.groupby('age').mean().select(['age','avg(HeartDisease)'])
# sort values in desnding order
from pyspark.sql.functions import desc
disease_by_age.orderBy(desc("age")).show(5)

+----+------------------+
| age| avg(HeartDisease)|
+----+------------------+
|77.0|               1.0|
|76.0|               0.5|
|75.0|0.6666666666666666|
|74.0|0.7142857142857143|
|73.0|               1.0|
+----+------------------+
only showing top 5 rows



In [86]:
from pyspark.sql.functions import asc
disease_by_age = df.groupby('age').mean().select(['age','avg(HeartDisease)'])
disease_by_age.orderBy(desc("age")).show(3)

+----+------------------+
| age| avg(HeartDisease)|
+----+------------------+
|77.0|               1.0|
|76.0|               0.5|
|75.0|0.6666666666666666|
+----+------------------+
only showing top 3 rows



In [87]:
# aggregate to get several statistics for several columns
# the available aggregate functions are avg, max, min, sum, count
from pyspark.sql import functions as F
df.agg(F.min(df['age']),F.max(df['age']),F.avg(df['sex'])).show()

+--------+--------+--------+
|min(age)|max(age)|avg(sex)|
+--------+--------+--------+
|    28.0|    77.0|    null|
+--------+--------+--------+



In [88]:
df.groupby('HeartDisease').agg(F.min(df['age']),F.avg(df['sex'])).show()

+------------+--------+--------+
|HeartDisease|min(age)|avg(sex)|
+------------+--------+--------+
|           1|    31.0|    null|
|           0|    28.0|    null|
+------------+--------+--------+



In [89]:
# run an SQL query on the data
df.createOrReplaceTempView("df") # tell PySpark how the table will be called in the SQL query
spark.sql("""SELECT sex from df""").show(2)

# we also choose columns using SQL sytnx, with a command that combins '.select()' and '.sql()'
df.selectExpr("age >= 40 as older", "age").show(2)

+---+
|sex|
+---+
|  M|
|  F|
+---+
only showing top 2 rows

+-----+----+
|older| age|
+-----+----+
| true|40.0|
| true|49.0|
+-----+----+
only showing top 2 rows



In [91]:
df.groupby('age').pivot('sex', ("M", "F")).count().show(3)

+----+---+---+
| age|  M|  F|
+----+---+---+
|64.0| 16|  6|
|47.0| 15|  4|
|58.0| 35|  7|
+----+---+---+
only showing top 3 rows



In [92]:
# pivot - expensive operation
df.selectExpr("age >= 40 as older", "age",'sex').groupBy("sex")\
                    .pivot("older", ("true", "false")).count().show()

+---+----+-----+
|sex|true|false|
+---+----+-----+
|  F| 174|   19|
|  M| 664|   61|
+---+----+-----+



In [93]:
df.select(['age','MaxHR','Cholesterol']).show(4)

+----+-----+-----------+
| age|MaxHR|Cholesterol|
+----+-----+-----------+
|40.0|  172|        289|
|49.0|  156|        180|
|37.0|   98|        283|
|48.0|  108|        214|
+----+-----+-----------+
only showing top 4 rows

