In [1]:
import os
os.environ['SPARK_HOME'] = r'C:\Users\Marcos\Documents\Spark'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [51]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [3]:
spark = SparkSession.builder.appName('DataFrame-Operations').getOrCreate()

In [4]:
file_path = 'data/walmart-sales-dataset-of-45stores.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [5]:
df.printSchema()
df.show(5)

root
 |-- Store: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Weekly_Sales: double (nullable = true)
 |-- Holiday_Flag: integer (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- CPI: double (nullable = true)
 |-- Unemployment: double (nullable = true)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|    1|05-02-2010|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|12-02-2010|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|19-02-2010|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|26-02-2010|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|05-03-201

In [10]:
selected_columns = df.select('store', 'Weekly_Sales', 'Fuel_Price')
selected_columns.show(5)

+-----+------------+----------+
|store|Weekly_Sales|Fuel_Price|
+-----+------------+----------+
|    1|   1643690.9|     2.572|
|    1|  1641957.44|     2.548|
|    1|  1611968.17|     2.514|
|    1|  1409727.59|     2.561|
|    1|  1554806.68|     2.625|
+-----+------------+----------+
only showing top 5 rows



In [20]:
filter_data = df.filter(df.Store >= 25)
filter_data.show(5)
filter_data.count()

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|   25|05-02-2010|   677231.63|           0|       21.1|     2.784|204.2471935|       8.187|
|   25|12-02-2010|   583364.02|           1|      19.64|     2.773|204.3857472|       8.187|
|   25|19-02-2010|   676260.67|           0|      24.16|     2.745|204.4321004|       8.187|
|   25|26-02-2010|   628516.57|           0|      29.16|     2.754|204.4630869|       8.187|
|   25|05-03-2010|   665750.06|           0|      29.45|     2.777|204.4940734|       8.187|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



3003

In [21]:
grouped_data = df.groupBy('Store').agg({'CPI':'mean', 'Holiday_Flag':'sum'})
grouped_data.show(5)

+-----+-----------------+------------------+
|Store|sum(Holiday_Flag)|          avg(CPI)|
+-----+-----------------+------------------+
|   31|               10|215.64631062727264|
|   34|               10| 128.6796694608392|
|   28|               10| 128.6796694608392|
|   26|               10|135.09260732587418|
|   27|               10| 139.0112835083916|
+-----+-----------------+------------------+
only showing top 5 rows



In [28]:
df1 = df.select('Store', 'Weekly_Sales')
df2 = df.select('Store', 'Temperature')

joined_data = df1.join(df2, 'Store', 'inner')
joined_data.show(5)

+-----+------------+-----------+
|Store|Weekly_Sales|Temperature|
+-----+------------+-----------+
|    1|   1643690.9|      69.16|
|    1|   1643690.9|      67.97|
|    1|   1643690.9|      62.99|
|    1|   1643690.9|      68.55|
|    1|   1643690.9|      76.08|
+-----+------------+-----------+
only showing top 5 rows



In [30]:
sorted_data = df.orderBy('Weekly_Sales')
sorted_data.show(5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|   33|03-12-2010|   209986.25|           0|      52.82|     3.041|126.7313333|       9.265|
|   33|29-10-2010|   213538.32|           0|      71.34|      3.13|126.4364194|       9.265|
|   33|30-12-2011|   215359.21|           1|       51.6|     3.428|130.0710323|        8.01|
|   33|31-12-2010|   219804.85|           1|      52.91|     3.148|127.0876774|       9.265|
|   33|02-12-2011|   220060.35|           0|      59.12|     3.701|129.8459667|        8.01|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



In [35]:
sorted_data = df.orderBy(col('Weekly_Sales'), col('Fuel_Price').desc())
sorted_data.show(5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|   33|03-12-2010|   209986.25|           0|      52.82|     3.041|126.7313333|       9.265|
|   33|29-10-2010|   213538.32|           0|      71.34|      3.13|126.4364194|       9.265|
|   33|30-12-2011|   215359.21|           1|       51.6|     3.428|130.0710323|        8.01|
|   33|31-12-2010|   219804.85|           1|      52.91|     3.148|127.0876774|       9.265|
|   33|02-12-2011|   220060.35|           0|      59.12|     3.701|129.8459667|        8.01|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



In [38]:
distinc_data = df.select('Store').distinct().orderBy(col('Store').desc())
distinc_data.show()

+-----+
|Store|
+-----+
|   45|
|   44|
|   43|
|   42|
|   41|
|   40|
|   39|
|   38|
|   37|
|   36|
|   35|
|   34|
|   33|
|   32|
|   31|
|   30|
|   29|
|   28|
|   27|
|   26|
+-----+
only showing top 20 rows



In [40]:
df_dropped = df.drop('Date')
df_dropped.show(5)

+-----+------------+------------+-----------+----------+-----------+------------+
|Store|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+------------+------------+-----------+----------+-----------+------------+
|    1|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|  1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|
+-----+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



In [53]:
df_new_col = df.withColumn('WT', df.Weekly_Sales / df.Temperature)
df_new_col.show(5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+------------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|                WT|
+-----+----------+------------+------------+-----------+----------+-----------+------------+------------------+
|    1|05-02-2010|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|  38848.7567950839|
|    1|12-02-2010|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106| 42637.17060503765|
|    1|19-02-2010|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|40369.851490107685|
|    1|26-02-2010|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|30232.202230323826|
|    1|05-03-2010|  1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|33436.702795698926|
+-----+----------+------------+------------+-----------+----------+-----------+------------+------------

In [54]:
df_alias = df.withColumnRenamed('Weekly_Sales', 'Sales')
df_alias.show(5)

+-----+----------+----------+------------+-----------+----------+-----------+------------+
|Store|      Date|     Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+----------+------------+-----------+----------+-----------+------------+
|    1|05-02-2010| 1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|12-02-2010|1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|19-02-2010|1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|26-02-2010|1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|05-03-2010|1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|
+-----+----------+----------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



In [55]:
spark.stop()