<h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px; color:#000;" align="center">Cleaning and Exploring Big Data using PySpark</h1>
<p style="width: 100%; text-align: center; margin: 0px; padding: 0px 0px 30px 0px; font-size: 24px; color:#000;" align="center">| Spark - PySpark practice |</p>


## 1. Libraries

In [1]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession

## 2. Set up Spark

In [2]:
# Start a Spark session on the local machine with a single worker thread
# ("local[1]"), or reuse a session that already exists in this kernel.
spark = SparkSession.builder.master("local[1]").appName("PoC").getOrCreate()

# SparkContext of the session; used below for the RDD examples (sc.parallelize).
sc = spark.sparkContext

# Last expression of the cell, so the notebook displays the session object.
spark

## 2. Initial testing

In [15]:
# Distribute the integers 0..999 as an RDD; no partition count is specified.
rdd_test = sc.parallelize(range(1000))

In [4]:
# Draw 5 elements without replacement. No seed is passed, so the elements
# returned may differ on every run.
rdd_test.takeSample(withReplacement=False, num=5)

[854, 668, 477, 13, 691]

In [16]:
# Show the concrete Python class of the RDD that was built from range(1000).
print(type(rdd_test))

<class 'pyspark.rdd.PipelinedRDD'>


In [17]:
# Four 5-field tuples (a letter followed by four integers), distributed
# across 4 partitions.
rdd = sc.parallelize(
    [
        ("C", 85, 76, 87, 91),
        ("B", 85, 76, 87, 91),
        ("A", 85, 78, 96, 92),
        ("A", 92, 76, 89, 96),
    ],
    numSlices=4,
)

In [18]:
# Show the concrete Python class of the RDD that was built from a list.
print(type(rdd))

<class 'pyspark.rdd.RDD'>


## 3. Using createDataFrame()

In [5]:
# Sample records used to build a DataFrame. Each tuple has six fields, in the
# order given by the header comment below. Some name fields are empty strings
# and one salary is -1.
data = [
    # firstname, middlename, lastname,   dob,          gender, salary
    ("James",    "",         "Smith",    "1991-04-01", "M",    3000),
    ("Michael",  "Rose",     "",         "2000-05-19", "M",    4000),
    ("Robert",   "",         "Williams", "1978-09-05", "M",    4000),
    ("Maria",    "Anne",     "Jones",    "1967-12-01", "F",    4000),
    ("Jen",      "Mary",     "Brown",    "1980-02-17", "F",    -1),
]

In [6]:
# Column names, in the same order as the fields of each tuple in `data`.
COLUMNS = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# Only column names are supplied as the schema, so Spark infers each column's
# type from the values in `data`.
df = spark.createDataFrame(data, schema=COLUMNS)

In [7]:
# Print the column names, types and nullability of df.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [23]:
# Render the rows of df as a text table.
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [13]:
# Show the concrete Python class of the object returned by createDataFrame.
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


Let's convert the DataFrame to an RDD.

In [19]:
# Access the DataFrame's content as an RDD (its elements are Row objects, as
# the collect() cell below shows).
rdd_df = df.rdd

In [20]:
# Show the concrete Python class of the RDD obtained from the DataFrame.
print(type(rdd_df))

<class 'pyspark.rdd.RDD'>


In [22]:
# Return every element of the RDD as a Python list. This is acceptable here
# because the dataset has only five rows.
rdd_df.collect()

[Row(firstname='James', middlename='', lastname='Smith', dob='1991-04-01', gender='M', salary=3000),
 Row(firstname='Michael', middlename='Rose', lastname='', dob='2000-05-19', gender='M', salary=4000),
 Row(firstname='Robert', middlename='', lastname='Williams', dob='1978-09-05', gender='M', salary=4000),
 Row(firstname='Maria', middlename='Anne', lastname='Jones', dob='1967-12-01', gender='F', salary=4000),
 Row(firstname='Jen', middlename='Mary', lastname='Brown', dob='1980-02-17', gender='F', salary=-1)]

## 4. Reading a CSV file

In [8]:
# Load diabetes.csv, using its first line as the column names. No schema is
# supplied and schema inference is not enabled, so every column is read as a
# string.
df_diabetes = spark.read.option("header", True).csv("diabetes.csv")

In [9]:
# Print the size of the dataset as a (number of rows, number of columns) tuple.
print((df_diabetes.count(), len(df_diabetes.columns)))

(768, 9)


In [10]:
# Print the column names, types and nullability of df_diabetes.
df_diabetes.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [11]:
# Preview only the first 5 rows instead of printing the whole dataset.
df_diabetes.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows

