In [1]:
import pandas as pd

from pyspark.sql import SparkSession

In [2]:
# Shell escape: list the files in the data directory that the cells below read from.
! ls ../data/

T1.csv


In [3]:
# Start (or reuse) the Spark session used by every PySpark cell below.
# Note: `sc` holds a SparkSession here, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("Cheatsheet")
    .getOrCreate()
)

## Reading CSV

### pandas

In [4]:
# Load T1.csv into a pandas DataFrame; pandas infers the column dtypes while reading.
df = pd.read_csv("../data/T1.csv")

### PySpark

In [5]:
# Load the same CSV with Spark, treating the first line as the column names.
# No schema is supplied or inferred here, so every column is read as a string.
spark_df = sc.read.csv("../data/T1.csv", header=True)

## Shape of the DataFrame

### pandas

In [6]:
# `shape` is an attribute (no parentheses) holding a (rows, columns) tuple.
df.shape

(50530, 5)

### PySpark

In [7]:
# Build the (rows, columns) pair by hand: the row count comes from count(),
# the column count from the length of the column-name list.
(
    spark_df.count(),
    len(spark_df.columns),
)

(50530, 5)

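**Note:** PySpark DataFrames have no `shape` attribute (in pandas, `shape` is an attribute, not a method). Combine `count()` with `len(spark_df.columns)`, as shown above, to get the same information.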

## Inspecting head and tail of the dataframe

### pandas

In [8]:
# First rows of the DataFrame (5 by default).
df.head()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789
3,01 01 2018 00:30,419.645905,5.659674,516.127569,271.258087
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286


In [9]:
# Last rows of the DataFrame (5 by default).
df.tail()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
50525,31 12 2018 23:10,2963.980957,11.40403,3397.190793,80.502724
50526,31 12 2018 23:20,1684.353027,7.332648,1173.055771,84.062599
50527,31 12 2018 23:30,2201.106934,8.435358,1788.284755,84.7425
50528,31 12 2018 23:40,2515.694092,9.421366,2418.382503,84.297913
50529,31 12 2018 23:50,2820.466064,9.979332,2779.184096,82.27462


### PySpark

In [10]:
# Print the first rows as a text table (20 rows by default).
spark_df.show()

+----------------+-------------------+----------------+-----------------------------+------------------+
|       Date/Time|LV ActivePower (kW)|Wind Speed (m/s)|Theoretical_Power_Curve (KWh)|Wind Direction (°)|
+----------------+-------------------+----------------+-----------------------------+------------------+
|01 01 2018 00:00|   380.047790527343|5.31133604049682|             416.328907824861|  259.994903564453|
|01 01 2018 00:10|    453.76919555664|5.67216682434082|             519.917511061494|   268.64111328125|
|01 01 2018 00:20|   306.376586914062|5.21603679656982|             390.900015810951|  272.564788818359|
|01 01 2018 00:30|   419.645904541015|5.65967416763305|             516.127568975674|  271.258087158203|
|01 01 2018 00:40|   380.650695800781|5.57794094085693|             491.702971953588|  265.674285888671|
|01 01 2018 00:50|   402.391998291015|5.60405206680297|             499.436385024805|   264.57861328125|
|01 01 2018 01:00|   447.605712890625|5.79300785064697|

In [11]:
# Last 5 rows, returned as a Python list of Row objects rather than a printed table.
spark_df.tail(5)

[Row(Date/Time='31 12 2018 23:10', LV ActivePower (kW)='2963.98095703125', Wind Speed (m/s)='11.4040298461914', Theoretical_Power_Curve (KWh)='3397.19079251158', Wind Direction (°)='80.5027236938476'),
 Row(Date/Time='31 12 2018 23:20', LV ActivePower (kW)='1684.35302734375', Wind Speed (m/s)='7.33264780044555', Theoretical_Power_Curve (KWh)='1173.05577118814', Wind Direction (°)='84.0625991821289'),
 Row(Date/Time='31 12 2018 23:30', LV ActivePower (kW)='2201.10693359375', Wind Speed (m/s)='8.43535804748535', Theoretical_Power_Curve (KWh)='1788.28475526396', Wind Direction (°)='84.7425003051757'),
 Row(Date/Time='31 12 2018 23:40', LV ActivePower (kW)='2515.69409179687', Wind Speed (m/s)='9.42136573791503', Theoretical_Power_Curve (KWh)='2418.38250336009', Wind Direction (°)='84.2979125976562'),
 Row(Date/Time='31 12 2018 23:50', LV ActivePower (kW)='2820.46606445312', Wind Speed (m/s)='9.97933197021484', Theoretical_Power_Curve (KWh)='2779.18409628274', Wind Direction (°)='82.2746200

### Nerdy Tip

By default, `head()` in **pandas** shows the first 5 records, while `show()` in **PySpark** shows the first 20. Pass a number (e.g. `df.head(10)` or `spark_df.show(5)`) to change how many rows are displayed.

## Inspecting datatype of the columns

### pandas

In [12]:
# Summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50530 entries, 0 to 50529
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Date/Time                      50530 non-null  object 
 1   LV ActivePower (kW)            50530 non-null  float64
 2   Wind Speed (m/s)               50530 non-null  float64
 3   Theoretical_Power_Curve (KWh)  50530 non-null  float64
 4   Wind Direction (°)             50530 non-null  float64
dtypes: float64(4), object(1)
memory usage: 1.9+ MB


In [13]:
# Per-column dtypes only, as a Series indexed by column name.
df.dtypes

Date/Time                         object
LV ActivePower (kW)              float64
Wind Speed (m/s)                 float64
Theoretical_Power_Curve (KWh)    float64
Wind Direction (°)               float64
dtype: object

### PySpark

In [14]:
# Print the schema as a tree: column name, data type, and nullability.
spark_df.printSchema()

root
 |-- Date/Time: string (nullable = true)
 |-- LV ActivePower (kW): string (nullable = true)
 |-- Wind Speed (m/s): string (nullable = true)
 |-- Theoretical_Power_Curve (KWh): string (nullable = true)
 |-- Wind Direction (°): string (nullable = true)



In [15]:
# Column types as a list of (column name, type string) tuples.
spark_df.dtypes

[('Date/Time', 'string'),
 ('LV ActivePower (kW)', 'string'),
 ('Wind Speed (m/s)', 'string'),
 ('Theoretical_Power_Curve (KWh)', 'string'),
 ('Wind Direction (°)', 'string')]