In [1]:
# Read csv in pandas
import pandas as pd
pdf = pd.read_csv("/dbfs/FileStore/tables/Iris.csv")

In [2]:
# Read csv in Spark
sdf = spark.read \
  .options(infer_schema = True, header = True) \
  .csv("/FileStore/tables/Iris.csv")

In [3]:
# Print dataframe using pandas
pdf.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Print dataframe using spark
sdf.show(5)

In [5]:
# Get column names and datatypes using pandas
pdf.columns

In [6]:
# Get column names and datatypes using pandas
sdf.columns

In [7]:
# Get column datatypes in pandas
pdf.dtypes

In [8]:
# Get column datatypes in spark
sdf.dtypes

In [9]:
# Renaming a column in pandas
pdf.rename(columns = {"Species":"Species_new"}).head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_new
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
# Renaming a column in spark
sdf.withColumnRenamed("Species", "Species_new").show(5)

In [11]:
# Get shape of the table using pandas
pdf.shape

In [12]:
# Get shape of the table using spark
sdf.count(), len(sdf.columns)

In [13]:
# Filter columns in pandas
pdf_filtered = pdf[pdf.SepalLengthCm > 2]
pdf_filtered.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [14]:
# Filter columns in spark
sdf_filtered = sdf.filter(sdf.SepalLengthCm > 2)
sdf_filtered.show(5)

In [15]:
# Add column in pandas
pdf["SepalLengthCm_new"] = pdf["SepalLengthCm"]*10
pdf.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,SepalLengthCm_new
0,1,5.1,3.5,1.4,0.2,Iris-setosa,51.0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,49.0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,47.0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,46.0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,50.0


In [16]:
# Add column in spark
import pyspark.sql.functions as f
sdf.withColumn("SepalLengthCm_new", sdf.SepalLengthCm * 10).show(5)

In [17]:
# Get unique values of a column in pandas
pdf['Species'].unique()

In [18]:
# Get unique values of a column in spark
sdf.select('Species').distinct().show()

In [19]:
# Fill NaNs with pandas
pdf = pdf.fillna(0)

In [20]:
# Fill NaNs with spark
sdf = sdf.fillna(0)

In [21]:
# Drop a column in pandas
pdf.drop(["SepalLengthCm_new"], axis=1)
pdf.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,SepalLengthCm_new
0,1,5.1,3.5,1.4,0.2,Iris-setosa,51.0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,49.0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,47.0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,46.0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,50.0


In [22]:
# Drop a column in spark
sdf.drop("SepalLengthCm_new").columns

In [23]:
# Describe using pandas
pdf.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,SepalLengthCm_new
count,150.0,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667,58.433333
std,43.445368,0.828066,0.433594,1.76442,0.763161,8.280661
min,1.0,4.3,2.0,1.0,0.1,43.0
25%,38.25,5.1,2.8,1.6,0.3,51.0
50%,75.5,5.8,3.0,4.35,1.3,58.0
75%,112.75,6.4,3.3,5.1,1.8,64.0
max,150.0,7.9,4.4,6.9,2.5,79.0


In [24]:
# Describe using spark
sdf.describe().show()