In [1]:
%load_ext watermark
%load_ext lab_black

In [2]:
import pyspark
import pandas as pd
import seaborn as sns

In [3]:
%watermark -iv -v

Python implementation: CPython
Python version       : 3.8.5
IPython version      : 7.23.1

pandas : 1.2.4
seaborn: 0.11.1
pyspark: 3.1.1



#### Loading the dataset and exploring it

In [4]:
%%time
# Load seaborn's "tips" example dataset; the result is a pandas DataFrame.
df = sns.load_dataset("tips")

CPU times: user 11.5 ms, sys: 11.2 ms, total: 22.6 ms
Wall time: 48.7 ms


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Save the pandas frame as tips.csv (without the index column) so that the
# same data can be read back with Spark further down.
df.to_csv("tips.csv", index=False)

In [8]:
# Import SparkSession, which is needed to create the Spark session
from pyspark.sql import SparkSession

In [9]:
# Build a session for the application named "Exploring-spark"; getOrCreate()
# reuses an already running session if there is one instead of starting another.
spark = SparkSession.builder.appName("Exploring-spark").getOrCreate()

In [10]:
spark

In [11]:
%%time
# Read the CSV with default options: the header row is treated as ordinary data
# and the columns get the generated names _c0.._c6, all typed as string.
df_pyspark = spark.read.csv('tips.csv')

CPU times: user 5.69 ms, sys: 0 ns, total: 5.69 ms
Wall time: 6.23 s


In [12]:
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string]

In [13]:
df_pyspark.show()

+----------+----+------+------+---+------+----+
|       _c0| _c1|   _c2|   _c3|_c4|   _c5| _c6|
+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinne

In [14]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [15]:
# Read the CSV again with the "header" option enabled, so the first row is used
# for the column names instead of being treated as data.
# The columns are still all typed as string at this point.
df_pyspark = spark.read.option("header", "true").csv("tips.csv")
df_pyspark

DataFrame[total_bill: string, tip: string, sex: string, smoker: string, day: string, time: string, size: string]

In [16]:
df_pyspark.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [17]:
# check the schema of df
df_pyspark.printSchema()

root
 |-- total_bill: string (nullable = true)
 |-- tip: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: string (nullable = true)



In [31]:
# By default every column is read as a string.
# Passing header=True takes the column names from the first row, and
# inferSchema=True lets Spark detect the column types from the data.
df_pyspark = spark.read.csv("tips.csv", header=True, inferSchema=True)
df_pyspark.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [32]:
# Because inferSchema=True was passed while reading, the numeric columns are
# now reported with numeric types (double / integer) instead of string.
df_pyspark.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [33]:
# As in pandas, head() can be used in Spark too; with no argument it returns
# only the first row, as a Row object rather than a formatted table.
df_pyspark.head()

Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2)

In [34]:
# show() prints 20 rows by default; pass a number to choose how many rows to print.
df_pyspark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [35]:
# check the dtypes as in pandas
df_pyspark.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'int')]

In [36]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# String columns have no mean/stddev, so those cells are shown as null.
df_pyspark.describe().show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950824|2.9982786885245902|  null|  null|null|  null| 2.569672131147541|
| stddev| 8.902411954856857|1.3836381890011815|  null|  null|null|  null|0.9510998047322347|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



In [37]:
# Add a column to the DataFrame: "Increased tip" is the tip plus 2.
# The result of withColumn is assigned back to df_pyspark to keep the new column.
df_pyspark = df_pyspark.withColumn("Increased tip", df_pyspark["tip"] + 2)

In [38]:
df_pyspark.show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     Increased tip|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|              3.01|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|              3.66|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|               5.5|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|5.3100000000000005|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 5.609999999999999|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|              6.71|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|               4.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|              5.12|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|              3.96|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|              5.23|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|              3.71|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|               

In [39]:
# Drop the "Increased tip" column from the DataFrame again and show the
# first 3 rows to confirm it is gone.
df_pyspark = df_pyspark.drop("Increased tip")
df_pyspark.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [40]:
# Rename a column: "total_bill" becomes "total_amount".
df_pyspark = df_pyspark.withColumnRenamed("total_bill", "total_amount")

In [41]:
df_pyspark.show(3)

+------------+----+------+------+---+------+----+
|total_amount| tip|   sex|smoker|day|  time|size|
+------------+----+------+------+---+------+----+
|       16.99|1.01|Female|    No|Sun|Dinner|   2|
|       10.34|1.66|  Male|    No|Sun|Dinner|   3|
|       21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+------------+----+------+------+---+------+----+
only showing top 3 rows



### Conclusion
- A `SparkSession` needs to be created before exploring data with PySpark
- pyspark dataframe is similar to pandas but has its own methods
- `head` in pandas is roughly the same as `show` in PySpark (we can use `head` in PySpark too, but it returns `Row` objects instead of printing an easy-to-read table)
- Adding, dropping and renaming columns can all be done, but PySpark has no `inplace` option: each operation returns a new DataFrame that has to be assigned to a name. This is probably because Spark DataFrames are immutable, which avoids problems when large data is processed in parallel.
- Reading a CSV file is as easy as in pandas, but the syntax is a little different (e.g. `header=True` and `inferSchema=True`).
- `dtypes` is similar to pandas (in PySpark it returns a list of `(column, type)` pairs)
                                                 