# Loading CSV files into DataFrames

# 1) Set up

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Evaluate the session object so the notebook can render its summary.
spark

--------

# 2) Load data into DataFrames: CSV Files

In [5]:
# Base directory for the CSV inputs. This is a relative path, so it presumably
# resolves against the directory the notebook is run from — adjust if needed.
data_path = '../Data'

In [6]:
# Path to the location temperature CSV (this file has a header row).
file_path = data_path + '/location_temp.csv'

In [7]:
# Read the CSV, using its first line as the column names.
# No schema inference is requested, so every column is loaded as a string.
df_location = (
    spark.read
    .format('csv')
    .option('header', True)
    .load(file_path)
)

In [8]:
# Peek at the first 10 records; head(n) returns them as a list of Row objects.
df_location.head(10)

[Row(event_date='03/04/2019 19:48:06', location_id='loc0', temp_celcius='29'),
 Row(event_date='03/04/2019 19:53:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 19:58:06', location_id='loc0', temp_celcius='28'),
 Row(event_date='03/04/2019 20:03:06', location_id='loc0', temp_celcius='30'),
 Row(event_date='03/04/2019 20:08:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:13:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:18:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:23:06', location_id='loc0', temp_celcius='29'),
 Row(event_date='03/04/2019 20:28:06', location_id='loc0', temp_celcius='32'),
 Row(event_date='03/04/2019 20:33:06', location_id='loc0', temp_celcius='35')]

In [9]:
# Print the leading rows as a formatted text table.
df_location.show()

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
|03/04/2019 20:38:06|       loc0|          32|
|03/04/2019 20:43:06|       loc0|          28|
|03/04/2019 20:48:06|       loc0|          28|
|03/04/2019 20:53:06|       loc0|          32|
|03/04/2019 20:58:06|       loc0|          34|
|03/04/2019 21:03:06|       loc0|          33|
|03/04/2019 21:08:06|       loc0|          27|
|03/04/2019 21:13:06|       loc0|          28|
|03/04/2019 2

In [10]:
# display() on a Spark DataFrame shows only its schema here (column names and
# types), not the data. Every column is a string because the read above did
# not request schema inference.
display(df_location)

DataFrame[event_date: string, location_id: string, temp_celcius: string]

In [11]:
# Number of rows loaded from location_temp.csv.
df_location.count()

500000

## Read a file without a header and infer the schema

In [12]:
# utilization.csv has no header row, so Spark assigns the column names itself.
# inferSchema asks Spark to detect each column's type instead of leaving
# everything as strings.
file_path_no_header = data_path + '/utilization.csv'

df_util = (
    spark.read
    .format('csv')
    .option('header', False)
    .option('inferSchema', True)
    .load(file_path_no_header)
)

In [13]:
# Number of rows loaded from utilization.csv.
df_util.count()

500000

In [15]:
# Because the file has no header, Spark auto-generates the column names (_c0, _c1, ...).

df_util.head(5)

[Row(_c0='03/05/2019 08:06:14', _c1=100, _c2=0.57, _c3=0.51, _c4=47),
 Row(_c0='03/05/2019 08:11:14', _c1=100, _c2=0.47, _c3=0.62, _c4=43),
 Row(_c0='03/05/2019 08:16:14', _c1=100, _c2=0.56, _c3=0.57, _c4=62),
 Row(_c0='03/05/2019 08:21:14', _c1=100, _c2=0.57, _c3=0.56, _c4=50),
 Row(_c0='03/05/2019 08:26:14', _c1=100, _c2=0.35, _c3=0.46, _c4=43)]

In [16]:
# Tabular view of the first 10 rows, still under the auto-generated column names.
df_util.show(10)

+-------------------+---+----+----+---+
|                _c0|_c1| _c2| _c3|_c4|
+-------------------+---+----+----+---+
|03/05/2019 08:06:14|100|0.57|0.51| 47|
|03/05/2019 08:11:14|100|0.47|0.62| 43|
|03/05/2019 08:16:14|100|0.56|0.57| 62|
|03/05/2019 08:21:14|100|0.57|0.56| 50|
|03/05/2019 08:26:14|100|0.35|0.46| 43|
|03/05/2019 08:31:14|100|0.41|0.58| 48|
|03/05/2019 08:36:14|100|0.57|0.35| 58|
|03/05/2019 08:41:14|100|0.41| 0.4| 58|
|03/05/2019 08:46:14|100|0.53|0.35| 62|
|03/05/2019 08:51:14|100|0.51| 0.6| 45|
+-------------------+---+----+----+---+
only showing top 10 rows



## Rename columns

In [17]:
# Replace the auto-generated _c0.._c4 names with descriptive ones.
# Wrapping the chain in parentheses lets it span several lines without
# backslash continuation characters.
df_util = (
    df_util
    .withColumnRenamed('_c0', 'event_datetime')
    .withColumnRenamed('_c1', 'server_id')
    .withColumnRenamed('_c2', 'cpu')
    .withColumnRenamed('_c3', 'free_memory')
    .withColumnRenamed('_c4', 'session_count')
)

In [18]:
# Confirm the Row objects now carry the descriptive field names.
df_util.head(5)

[Row(event_datetime='03/05/2019 08:06:14', server_id=100, cpu=0.57, free_memory=0.51, session_count=47),
 Row(event_datetime='03/05/2019 08:11:14', server_id=100, cpu=0.47, free_memory=0.62, session_count=43),
 Row(event_datetime='03/05/2019 08:16:14', server_id=100, cpu=0.56, free_memory=0.57, session_count=62),
 Row(event_datetime='03/05/2019 08:21:14', server_id=100, cpu=0.57, free_memory=0.56, session_count=50),
 Row(event_datetime='03/05/2019 08:26:14', server_id=100, cpu=0.35, free_memory=0.46, session_count=43)]

In [19]:
# Same check in tabular form: the header row shows the renamed columns.
df_util.show(5)

+-------------------+---------+----+-----------+-------------+
|     event_datetime|server_id| cpu|free_memory|session_count|
+-------------------+---------+----+-----------+-------------+
|03/05/2019 08:06:14|      100|0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|0.35|       0.46|           43|
+-------------------+---------+----+-----------+-------------+
only showing top 5 rows

