<h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px; color:#000;" align="center">Cleaning and Exploring Big Data using PySpark</h1>
<p style="width: 100%; text-align: center; margin: 0px; padding: 0px 0px 30px 0px; font-size: 24px; color:#000;" align="center">| Spark - PySpark practice |</p>


## 1. Libraries

In [92]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

## 2. Set up SparkSession

In [2]:
# Create the session named "PoC" on the "local[1]" master, or reuse it if one
# already exists (getOrCreate).
spark = SparkSession.builder.master("local[1]").appName("PoC").getOrCreate()

# Handle on the underlying SparkContext, used by the RDD examples below.
sc = spark.sparkContext

# Last expression of the cell: display the session summary.
spark

## 2.1. Initial testing

In [3]:
rdd_test = sc.parallelize(range(1000))

In [4]:
# Draw 5 elements without replacement. The seed is fixed so that the sample is
# the same on every Restart & Run All instead of changing from run to run.
rdd_test.takeSample(False, 5, seed=42)

[318, 446, 911, 552, 592]

In [5]:
print(type(rdd_test))

<class 'pyspark.rdd.PipelinedRDD'>


In [6]:
# Four 5-field records (a letter followed by four integers), spread over 4 slices.
rdd = sc.parallelize(
    [
        ("C", 85, 76, 87, 91),
        ("B", 85, 76, 87, 91),
        ("A", 85, 78, 96, 92),
        ("A", 92, 76, 89, 96),
    ],
    numSlices=4,
)

In [7]:
print(type(rdd))

<class 'pyspark.rdd.RDD'>


## 3. Using createDataFrame()

In [8]:
# Sample rows used to build a DataFrame in the next cell.
# Field order: (firstname, middlename, lastname, dob, gender, salary) -- it matches COLUMNS below.
# Missing name parts are empty strings, dob is a "YYYY-MM-DD" string, and one salary is -1.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1)
]

In [9]:
# Column names, in the same order as the fields of each tuple in `data`.
COLUMNS = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# Only names are supplied as the schema; no column types are given explicitly.
df = spark.createDataFrame(data, COLUMNS)

In [10]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [11]:
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [12]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


Let's transform the DataFrame into an RDD.

In [13]:
rdd_df = df.rdd

In [14]:
print(type(rdd_df))

<class 'pyspark.rdd.RDD'>


In [15]:
rdd_df.collect()

[Row(firstname='James', middlename='', lastname='Smith', dob='1991-04-01', gender='M', salary=3000),
 Row(firstname='Michael', middlename='Rose', lastname='', dob='2000-05-19', gender='M', salary=4000),
 Row(firstname='Robert', middlename='', lastname='Williams', dob='1978-09-05', gender='M', salary=4000),
 Row(firstname='Maria', middlename='Anne', lastname='Jones', dob='1967-12-01', gender='F', salary=4000),
 Row(firstname='Jen', middlename='Mary', lastname='Brown', dob='1980-02-17', gender='F', salary=-1)]

## 4. Reading csv

In [57]:
# The first CSV line provides the column names. No schema is supplied here, and
# every column comes back as a string (see the printSchema() output below).
df_diabetes = spark.read.option("header", True).csv("diabetes.csv")

In [58]:
print((df_diabetes.count(), len(df_diabetes.columns)))

(768, 9)


In [59]:
df_diabetes.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [60]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



### 4.1 Transforming Spark dataframe to pandas

In [61]:
df_diabetes_pd = df_diabetes.toPandas()

In [62]:
type(df_diabetes_pd)

pandas.core.frame.DataFrame

In [63]:
df_diabetes_pd.value_counts("Outcome")

Outcome
0    500
1    268
dtype: int64

In [64]:
df_diabetes_pd.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## 5. Manipulating DataFrame

In [65]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [66]:
type(df_diabetes)

pyspark.sql.dataframe.DataFrame

### 5.1. Adding columns to the DataFrame

In [67]:
# "Age" was read as a string; the product appears as a double (e.g. 18250.0)
# in the output below.
df_diabetes = df_diabetes.withColumn(
    "Age in days",
    df_diabetes["Age"] * 365,
)

In [68]:
# Same computation as the previous cell, this time referencing the column
# through col() instead of attribute access on the DataFrame.
df_diabetes = df_diabetes.withColumn(
    "Age in days II",
    col("Age") * 365,
)

In [69]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|       18250.0|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|       11315.0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|       11680.0|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|        7665.0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|    12045.0|

#### 5.1.1. Renaming a column

In [84]:
# NOTE(review): the execution count (In [84]) shows this cell was run after
# sections 5.2-5.4, which still refer to the column as "Age in days II".
# On a top-to-bottom run, execute this cell after those sections; otherwise
# they will look for a column name that no longer exists.
df_diabetes = df_diabetes.withColumnRenamed("Age in days II", "Age II")

In [86]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+------+---------------+-----------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age II|Patient country|Patient continent|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+------+---------------+-----------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0| 18250|         France|           Europe|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0| 11315|         France|           Europe|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0| 11680|         France|           Europe|
|          1|     89|           66|           23|     94|28.1|                   0

### 5.2. Casting a column

Let's cast the `Age in days II` column from double to integer.

In [70]:
# The result is only displayed, not assigned, so df_diabetes itself is left
# unchanged: the printSchema() output further down still reports
# "Age in days II" as double.
df_diabetes.withColumn("Age in days II", col("Age in days II").cast("Integer"))

DataFrame[Pregnancies: string, Glucose: string, BloodPressure: string, SkinThickness: string, Insulin: string, BMI: string, DiabetesPedigreeFunction: string, Age: string, Outcome: string, Age in days: double, Age in days II: int]

In [71]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|       18250.0|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|       11315.0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|       11680.0|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|        7665.0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|    12045.0|

In [72]:
df_diabetes.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)
 |-- Age in days: double (nullable = true)
 |-- Age in days II: double (nullable = true)



In [73]:
# Assign the result back so that the integer cast is kept on df_diabetes.
df_diabetes = df_diabetes.withColumn(
    "Age in days II",
    col("Age in days II").cast("Integer"),
)

In [74]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|         18250|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|         11315|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|         11680|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|          7665|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|    12045.0|

In [75]:
df_diabetes.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)
 |-- Age in days: double (nullable = true)
 |-- Age in days II: integer (nullable = true)



### 5.3. Selecting data with select and getItem

In [35]:
# One-row frame with a list column "l" and a dict column "d", used to
# demonstrate getItem() below.
df_temp = spark.createDataFrame(
    [([1, 2], {"key": "value"})],
    ["l", "d"],
)

In [36]:
df_temp.show()

+------+--------------+
|     l|             d|
+------+--------------+
|[1, 2]|{key -> value}|
+------+--------------+



In [37]:
df_temp.select(df_temp.l.getItem(0), df_temp.d.getItem("key")).show()

+----+------+
|l[0]|d[key]|
+----+------+
|   1| value|
+----+------+



In [44]:
# Two rows this time: the lists differ in length and the dicts use different
# keys, so some getItem() lookups below come back as null.
df_temp = spark.createDataFrame(
    [
        ([1, 2], {"key": "value"}),
        ([3, 4, 5], {"name": "math"}),
    ],
    ["l", "d"],
)

In [45]:
df_temp.show()

+---------+--------------+
|        l|             d|
+---------+--------------+
|   [1, 2]|{key -> value}|
|[3, 4, 5]|{name -> math}|
+---------+--------------+



In [46]:
df_temp.select(df_temp.l.getItem(0), df_temp.d.getItem("key")).show()

+----+------+
|l[0]|d[key]|
+----+------+
|   1| value|
|   3|  null|
+----+------+



In [50]:
df_temp.select(df_temp.l.getItem(2), df_temp.d.getItem("name")).show()

+----+-------+
|l[2]|d[name]|
+----+-------+
|null|   null|
|   5|   math|
+----+-------+



### 5.4. Adding a constant value through lit()

In [76]:
# The result is only displayed, not assigned: the show() output of the next
# cell still has no "Patient country" column.
df_diabetes.withColumn("Patient country", lit("France"))

DataFrame[Pregnancies: string, Glucose: string, BloodPressure: string, SkinThickness: string, Insulin: string, BMI: string, DiabetesPedigreeFunction: string, Age: string, Outcome: string, Age in days: double, Age in days II: int, Patient country: string]

In [77]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|         18250|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|         11315|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|         11680|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|          7665|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|    12045.0|

In [78]:
# Assign the result back so the constant "Patient country" column is kept.
df_diabetes = df_diabetes.withColumn(
    "Patient country",
    lit("France"),
)

In [79]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+---------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|Patient country|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+---------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|         18250|         France|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|         11315|         France|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|         11680|         France|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|          7665|         France|

#### 5.4.1. Going beyond with __**lit()**__ and using __**when()**__

In this case, we are going to say that if the patient is under 30, the patient is from South America; otherwise, the patient is from Europe.

To do that, we are going to use **when**

In [82]:
# Conditional constant: "South America" where Age < 30, "Europe" for every other row.
df_diabetes = df_diabetes.withColumn(
    "Patient continent",
    when(col("Age") < 30, lit("South America")).otherwise(lit("Europe")),
)

In [83]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+---------------+-----------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age in days II|Patient country|Patient continent|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+--------------+---------------+-----------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0|         18250|         France|           Europe|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0|         11315|         France|           Europe|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0|         11680|         France|           Europe|
|          1|     89|           66

### 5.5. Drop column

In [87]:
# Remove the "Patient country" column; the result is assigned back so the
# column is gone from df_diabetes in the show() output below.
df_diabetes = df_diabetes.drop("Patient country")

In [88]:
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+------+-----------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Age in days|Age II|Patient continent|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----------+------+-----------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|    18250.0| 18250|           Europe|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|    11315.0| 11315|           Europe|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|    11680.0| 11680|           Europe|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|     7665.0|  7665|    South America|
|          0|    137|           40|      

### 5.6. Selecting columns based on conditions

In [103]:
# Keep three columns, then only the rows with 30 < Age < 35, and print the first 5.
(
    df_diabetes
    .select("Glucose", "BloodPressure", "Age")
    .where((col("Age") > 30) & (col("Age") < 35))
    .show(5)
)

+-------+-------------+---+
|Glucose|BloodPressure|Age|
+-------+-------------+---+
|     85|           66| 31|
|    183|           64| 32|
|    137|           40| 33|
|    168|           74| 34|
|    100|            0| 32|
+-------+-------------+---+
only showing top 5 rows



#### 5.6.1. Selecting columns based on conditions using alias

In [105]:
# Same selection as above, but BloodPressure is exposed under an alias and the
# filter then refers to that alias (alias > 74) in addition to the Age range.
(
    df_diabetes
    .select(
        "Glucose",
        col("BloodPressure").alias("Blood_presure"),
        "Age",
    )
    .where(
        (col("Age") > 30)
        & (col("Age") < 35)
        & (col("Blood_presure") > 74)
    )
    .show(5)
)

+-------+-------------+---+
|Glucose|Blood_presure|Age|
+-------+-------------+---+
|    118|           84| 31|
|    100|           88| 31|
|    123|           80| 34|
|    122|           90| 31|
|    131|           88| 32|
+-------+-------------+---+
only showing top 5 rows



from datetime import date


def age(birthdate, today=None):
    """Return the age in completed years for ``birthdate``.

    Parameters
    ----------
    birthdate : datetime.date
        Date of birth; only its ``year``, ``month`` and ``day`` attributes
        are read.
    today : datetime.date, optional
        Reference date the age is computed against. Defaults to
        ``date.today()``; pass an explicit value for reproducible results.

    Returns
    -------
    int
        ``today.year - birthdate.year``, minus one when the birthday has not
        yet been reached in the reference year.
    """
    if today is None:
        today = date.today()
    # Tuple comparison is True (== 1) while the (month, day) of the birthday
    # is still ahead of the reference date within the year.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - birthday_pending