## Importing the Libraries

In [38]:
from pyspark.sql import SparkSession

In [39]:
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [40]:
spark

## Reading the Dataset

In [41]:
df_pyspark = spark.read.option('header', 'true').csv('Food Demand.csv', inferSchema = True)

In [42]:
df_pyspark.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0|       271|
|1000005| 101|       65|   1754|        291.03|    290.03|                    0|                0|       541|
|1000006| 

In [43]:
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- center_id: integer (nullable = true)
 |-- meal_id: integer (nullable = true)
 |-- checkout_price: double (nullable = true)
 |-- base_price: double (nullable = true)
 |-- emailer_for_promotion: integer (nullable = true)
 |-- homepage_featured: integer (nullable = true)
 |-- num_orders: integer (nullable = true)



Another Simple Way:

In [44]:
df_pyspark = spark.read.csv("Food demand.csv", header=True, inferSchema=True)

In [45]:
df_pyspark.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0|       271|
|1000005| 101|       65|   1754|        291.03|    290.03|                    0|                0|       541|
|1000006| 

In [46]:
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- center_id: integer (nullable = true)
 |-- meal_id: integer (nullable = true)
 |-- checkout_price: double (nullable = true)
 |-- base_price: double (nullable = true)
 |-- emailer_for_promotion: integer (nullable = true)
 |-- homepage_featured: integer (nullable = true)
 |-- num_orders: integer (nullable = true)



In [47]:
df_pyspark.columns

['id',
 'week',
 'center_id',
 'meal_id',
 'checkout_price',
 'base_price',
 'emailer_for_promotion',
 'homepage_featured',
 'num_orders']

In [48]:
df_pyspark.head(5)

[Row(id=1000000, week=3, center_id=157, meal_id=2760, checkout_price=233.83, base_price=231.83, emailer_for_promotion=0, homepage_featured=0, num_orders=149),
 Row(id=1000001, week=100, center_id=104, meal_id=2956, checkout_price=486.03, base_price=583.03, emailer_for_promotion=0, homepage_featured=0, num_orders=161),
 Row(id=1000002, week=143, center_id=75, meal_id=1971, checkout_price=328.86, base_price=327.86, emailer_for_promotion=0, homepage_featured=0, num_orders=149),
 Row(id=1000003, week=41, center_id=24, meal_id=2539, checkout_price=145.53, base_price=145.53, emailer_for_promotion=0, homepage_featured=0, num_orders=540),
 Row(id=1000004, week=45, center_id=83, meal_id=2539, checkout_price=95.06, base_price=120.34, emailer_for_promotion=0, homepage_featured=0, num_orders=271)]

## Viewing a single column

In [49]:
df_pyspark.select('week')

DataFrame[week: int]

In [50]:
df_pyspark.select('week').show()

+----+
|week|
+----+
|   3|
| 100|
| 143|
|  41|
|  45|
| 101|
| 107|
|  11|
| 114|
|  68|
|  10|
|  18|
|  33|
| 140|
| 105|
| 112|
| 131|
|   9|
| 101|
|  18|
+----+
only showing top 20 rows



In [51]:
df_pyspark.select(['week', 'meal_id']).show()

+----+-------+
|week|meal_id|
+----+-------+
|   3|   2760|
| 100|   2956|
| 143|   1971|
|  41|   2539|
|  45|   2539|
| 101|   1754|
| 107|   2126|
|  11|   1062|
| 114|   1962|
|  68|   1216|
|  10|   2760|
|  18|   2867|
|  33|   2494|
| 140|   1571|
| 105|   2139|
| 112|   2290|
| 131|   2290|
|   9|   2826|
| 101|   1445|
|  18|   1311|
+----+-------+
only showing top 20 rows



## Checking Data types

In [52]:
df_pyspark.dtypes

[('id', 'int'),
 ('week', 'int'),
 ('center_id', 'int'),
 ('meal_id', 'int'),
 ('checkout_price', 'double'),
 ('base_price', 'double'),
 ('emailer_for_promotion', 'int'),
 ('homepage_featured', 'int'),
 ('num_orders', 'int')]

In [53]:
df_pyspark.describe().show()

+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+---------------------+-------------------+------------------+
|summary|               id|             week|         center_id|           meal_id|    checkout_price|        base_price|emailer_for_promotion|  homepage_featured|        num_orders|
+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+---------------------+-------------------+------------------+
|  count|             1999|             1999|              1999|              1999|              1999|              1999|                 1999|               1999|              1999|
|   mean|1001093.100050025|75.39319659829916| 81.64982491245622|2010.1235617808904|327.30259629815237|347.97286643321996|  0.07653826913456728|0.10505252626313157|258.34017008504253|
| stddev|632.3493185379757|41.74380215978922|46.139172749208925| 554.6865253289483|15

## Adding a Column

In [54]:
df_pyspark = df_pyspark.withColumn('base_price_USD', df_pyspark["checkout_price"]/80)

In [55]:
df_pyspark.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+------------------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|    base_price_USD|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+------------------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|2.9228750000000003|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161| 6.075374999999999|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|           4.11075|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|          1.819125|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0

## Dropping a Column

In [56]:
df_pyspark = df_pyspark.drop('base_price_USD')

In [57]:
df_pyspark.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0|       271|
|1000005| 101|       65|   1754|        291.03|    290.03|                    0|                0|       541|
|1000006| 

## Renaming a column

In [58]:
df_pyspark.withColumnRenamed('base_price', 'BASE_PRICE').show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|meal_id|checkout_price|BASE_PRICE|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0|       271|
|1000005| 101|       65|   1754|        291.03|    290.03|                    0|                0|       541|
|1000006| 