In [1]:
import pandas as pd

from pyspark.sql import SparkSession

In [2]:
# List the files in data/ to confirm the input CSV is present
! ls data/

T1.csv


In [3]:
# Obtain the Spark session for this notebook (reuses an existing one if present).
# NOTE: despite its name, `sc` holds a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName('Cheatsheet')
    .getOrCreate()
)

21/07/24 20:45:21 WARN Utils: Your hostname, sid-HP-240-G3-Notebook-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.100 instead (on interface wlp9s0f0)
21/07/24 20:45:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/07/24 20:45:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/07/24 20:45:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Reading CSV

In [5]:
# pandas: read the CSV with default settings
df = pd.read_csv('data/T1.csv')

# PySpark: read the same CSV, using its first row as the column names
spark_df = (
    sc.read
    .option("header", True)
    .csv("data/T1.csv")
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [6]:
# Summarise the pandas frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50530 entries, 0 to 50529
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Date/Time                      50530 non-null  object 
 1   LV ActivePower (kW)            50530 non-null  float64
 2   Wind Speed (m/s)               50530 non-null  float64
 3   Theoretical_Power_Curve (KWh)  50530 non-null  float64
 4   Wind Direction (°)             50530 non-null  float64
dtypes: float64(4), object(1)
memory usage: 1.9+ MB


In [7]:
# Show the schema Spark assigned; read without inferSchema, every column comes back as string
spark_df.printSchema()

root
 |-- Date/Time: string (nullable = true)
 |-- LV ActivePower (kW): string (nullable = true)
 |-- Wind Speed (m/s): string (nullable = true)
 |-- Theoretical_Power_Curve (KWh): string (nullable = true)
 |-- Wind Direction (°): string (nullable = true)



### Nerdy Tip

* **pandas** infers column dtypes automatically by default, whereas in **PySpark** `inferSchema` defaults to `False`, so every column is read as a string unless you enable it.


* When reading a CSV in pandas, pass the `parse_dates` parameter to convert the listed columns to `datetime`.

## Infer dtypes while reading CSV

In [8]:
# Re-read the CSV, asking pandas to parse the Date/Time column as datetimes
df = pd.read_csv(
    'data/T1.csv',
    parse_dates=['Date/Time'],
)

# Inspect the dtypes again to see the effect on Date/Time
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50530 entries, 0 to 50529
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Date/Time                      50530 non-null  datetime64[ns]
 1   LV ActivePower (kW)            50530 non-null  float64       
 2   Wind Speed (m/s)               50530 non-null  float64       
 3   Theoretical_Power_Curve (KWh)  50530 non-null  float64       
 4   Wind Direction (°)             50530 non-null  float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 1.9 MB


In [10]:
# Re-read the CSV in PySpark with a header row and schema inference enabled
spark_df = (
    sc.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/T1.csv")
)

                                                                                

In [11]:
# Re-check the schema after reading with inferSchema=True:
# the numeric columns are now double, while Date/Time is still a string
spark_df.printSchema()

root
 |-- Date/Time: string (nullable = true)
 |-- LV ActivePower (kW): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Theoretical_Power_Curve (KWh): double (nullable = true)
 |-- Wind Direction (°): double (nullable = true)

