# Basic DataFrame Operations

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession entry point: reuse the active session if there is
# one, otherwise create a new session with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's output to confirm it was created.
spark

--------

# 1) Load data into DataFrames: JSON files

In [10]:
# Location of the input data, relative to the notebook's working directory.
data_path = '../Data'
# Full path of the JSON file that holds the server utilization records.
file_path = f'{data_path}/utilization.json'

In [11]:
# Read the JSON file into a DataFrame; the schema is inferred from the data.
# `read.json(path)` is shorthand for `read.format('json').load(path)`.
df = spark.read.json(file_path)

In [12]:
# Print the DataFrame as a text table to check the parsed columns and values
# (show() prints at most the first 20 rows by default).
df.show()

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|0.35|03/05/2019 08:26:14|       0.46|      100|           43|
+----+-------------------+-----------+---------+-------------+



In [13]:
# Count the rows that were loaded from the JSON file.
df.count()

5

--------

# 2) Basic DataFrame Operations
- Sampling
- Sorting

In [15]:
# List the column names of the DataFrame.
df.columns

['cpu', 'event_datetime', 'free_memory', 'server_id', 'session_count']

## Sample or Subset of data
- There are two ways to draw a sample: with replacement or without replacement.
- With replacement, a row that has already been drawn stays in the pool and can be drawn again, so the same row may appear more than once in the sample. Without replacement, each row can appear at most once.


In [19]:
# Draw a random sample WITHOUT replacement (first argument False).
# The fraction (0.2) is the probability that each row is included, so roughly
# 20% of the rows are expected back -- the exact count is not guaranteed.
# A fixed seed makes the same rows come back when the notebook is re-run on
# the same data.
df_sample = df.sample(False, 0.2, seed=42)

In [20]:
# Inspect which rows were drawn; the result varies between runs unless a seed
# is passed to sample().
df_sample.show()

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
+----+-------------------+-----------+---------+-------------+



In [24]:
# Expected number of sampled rows for a fraction of 0.2, for comparison with
# the sample shown above. sample() applies the fraction as a per-row
# probability, so the actual sample size can differ from this value.
df.count() * 0.2

1.0

--------

## Sort the data

In [31]:
# Show the rows in their original order, as a baseline for the sorted result.
df.show()

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|0.35|03/05/2019 08:26:14|       0.46|      100|           43|
+----+-------------------+-----------+---------+-------------+



In [29]:
# Order the rows from the highest session_count to the lowest.
# (orderBy is an alias of sort; rows with equal session_count have no
# guaranteed relative order.)
df_sorted = df.orderBy('session_count', ascending=False)

In [30]:
# Display the sorted DataFrame to verify the descending session_count order.
df_sorted.show()

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|0.35|03/05/2019 08:26:14|       0.46|      100|           43|
+----+-------------------+-----------+---------+-------------+

