In [1]:
!pip install pyspark dask[dataframe] --quiet

import pandas as pd
import dask.dataframe as dd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("SalesDataAnalysis")
    .getOrCreate()
)


In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset can be read from it
drive.mount('/content/drive')

#  1. DataFrame Creation and Inspection
# Load the sales CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/Sales_Dataset__500_Records_.csv")
)

# Print the inferred schema to verify it
df.printSchema()

Mounted at /content/drive
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- CustomerSince: date (nullable = true)



In [5]:
# First 5 records
df.show(5)

# "Last 5" preview: show() only prints rows from the top, so sort by OrderID
# in descending order and print the first 5 (i.e. the five highest OrderIDs).
# Note: df.tail(5) does exist in PySpark, but it returns a list of Rows to the
# driver instead of printing a table, so calling it bare displays nothing.
df.orderBy("OrderID", ascending=False).show(5)

# Column names with their Spark types, as a list of (name, type) tuples
df.dtypes


+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

[('OrderID', 'int'),
 ('CustomerName', 'string'),
 ('ProductCategory', 'string'),
 ('Amount', 'double'),
 ('OrderDate', 'date'),
 ('DeliveryStatus', 'string'),
 ('Discount', 'double'),
 ('City', 'string'),
 ('PaymentMode', 'string'),
 ('CustomerSince', 'date')]

In [9]:
# Task 2: Selection, Renaming, Filtering
# Project the frame down to the order id, customer name and amount columns
df_selected = df.select(["OrderID", "CustomerName", "Amount"])
df_selected.show(5)

+-------+--------------+------+
|OrderID|  CustomerName|Amount|
+-------+--------------+------+
|   2824| Donald Walker|783.04|
|   7912|  Brandon Hall| 905.0|
|   4611|  Donald Booth|657.96|
|   3547|Phillip Garcia|606.89|
|   8527|  Valerie Gray| 77.87|
+-------+--------------+------+
only showing top 5 rows



In [10]:
# Expose the Amount column under the name OrderAmount
df_renamed = df_selected.withColumnRenamed(
    existing="Amount",
    new="OrderAmount",
)
df_renamed.show(5)

+-------+--------------+-----------+
|OrderID|  CustomerName|OrderAmount|
+-------+--------------+-----------+
|   2824| Donald Walker|     783.04|
|   7912|  Brandon Hall|      905.0|
|   4611|  Donald Booth|     657.96|
|   3547|Phillip Garcia|     606.89|
|   8527|  Valerie Gray|      77.87|
+-------+--------------+-----------+
only showing top 5 rows



In [12]:
# Keep only the orders whose Amount is strictly greater than 500
df_filtered_amount = df.where(df["Amount"] > 500)
df_filtered_amount.show(5)

+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   6155|Jonathan Wilkerson|        Fashion|882.68|2024-10-14|     Cancelled|    0.27|    

In [15]:
# Keep only the orders placed in the city "Lake Joyside"
df_filtered_city = df.where(df["City"] == "Lake Joyside")
df_filtered_city.show(5)

+-------+-------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+
|OrderID| CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|        City|PaymentMode|CustomerSince|
+-------+-------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+
|   2824|Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|Lake Joyside|Credit Card|   2020-10-15|
+-------+-------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+



In [18]:
# Task 3: Data Manipulation
# Produce a copy of the frame without the CustomerSince column
df_dropped = df.drop(col("CustomerSince"))
df_dropped.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----

In [19]:
# Add FinalAmount, the amount after the discount is taken off:
#   FinalAmount = Amount - (Amount * Discount)
# df is rebound here, so every later cell sees the FinalAmount column.
df = df.withColumn(
    "FinalAmount",
    df["Amount"] - (df["Amount"] * df["Discount"]),
)
df.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|         651.3804|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|         51

In [21]:
# Order the rows from the largest FinalAmount to the smallest
df_sorted = df.sort("FinalAmount", ascending=False)
df_sorted.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|        City|PaymentMode|CustomerSince|      FinalAmount|
+-------+--------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+-----------------+
|   5573|Jordan Frazier|          Books|981.05|2025-03-19|     Cancelled|    0.02| Sheilaville|       Cash|   2021-07-12|          961.429|
|   8474|   Heidi Brown|    Electronics|968.91|2023-11-23|     Cancelled|    0.02|  Riverafort|       Cash|   2023-03-19|         949.5318|
|   8889|   Karen Garza|          Books| 998.3|2024-10-17|     Cancelled|    0.06|  Johnsonton|Credit Card|   2020-12-17|938.4019999999999|
|   2127|  Jaclyn Moore|      Groceries|933.32|2025-03-11|      Returned|    0.01| Cherylhaven|       Cash|   2020-06-14|         923.9868|
|   9806| Samantha G

In [24]:
# Rewrite the DeliveryStatus value "Cancelled" as "Order Cancelled";
# every other status is passed through unchanged.
from pyspark.sql.functions import (
    col,
    when,
    year,
    month,
    current_date,
    datediff,
    to_date,
    expr,
    lit,
)

df_replaced = df.withColumn(
    "DeliveryStatus",
    when(df["DeliveryStatus"] == "Cancelled", lit("Order Cancelled"))
    .otherwise(df["DeliveryStatus"]),
)
df_replaced.show(5)

+-------+--------------+---------------+------+----------+---------------+--------+----------------+-----------+-------------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate| DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|
+-------+--------------+---------------+------+----------+---------------+--------+----------------+-----------+-------------+-----------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|       Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|Order Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|       Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|         651.3804|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|       Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|    

In [25]:
# Task 4: Aggregations and GroupBy
# Number of orders in each DeliveryStatus
(
    df.groupby("DeliveryStatus")
    .count()
    .show()
)

+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+



In [26]:
# Mean order Amount within each ProductCategory
(
    df.groupBy("ProductCategory")
    .agg({"Amount": "avg"})
    .show()
)

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+



In [27]:
# Sum Amount per City and present the total under the name TotalSales
(
    df.groupBy("City")
    .agg({"Amount": "sum"})
    .withColumnRenamed("sum(Amount)", "TotalSales")
    .show()
)

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
| Lake Jerrymouth|    404.01|
|       Perezfort|    917.55|
|Port Nicoleshire|    133.78|
|  South Samantha|    229.46|
|     Port Willie|    788.13|
|     Waltersfort|    552.81|
|       Youngbury|    372.95|
+----------------+----------+
only showing top 20 rows



In [30]:
# Task 5: Null Handling & Update
# Inject nulls randomly into City (for simulation): each row's City is
# replaced by null when its random draw falls below 0.1.
from pyspark.sql.functions import rand

# A fixed seed is passed to rand(): an unseeded rand() picks a new seed every
# time this cell is executed, so the set of nulled rows (and everything
# derived from it below) would differ between runs of the notebook.
df_with_nulls = df.withColumn(
    "City",
    when(rand(seed=42) < 0.1, None).otherwise(col("City")),
)
df_with_nulls.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|           665.584|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|            877.85|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|          651.3804|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melaniev

In [31]:
# Handle nulls: fillna and dropna
# show() prints the table and returns None, so the DataFrame is assigned first
# and displayed in a separate statement; chaining .show() onto the assignment
# would bind the variable to None.

# Replace null City values with the placeholder "Unknown"
df_fillna = df_with_nulls.fillna({"City": "Unknown"})
df_fillna.show()

# Alternatively, discard the rows whose City is null
df_dropna = df_with_nulls.dropna(subset=["City"])
df_dropna.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|           665.584|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|            877.85|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|          651.3804|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melaniev

In [32]:
# Tag high-value customers (Amount > 800): CustomerType is "High-Value" when
# Amount exceeds 800 and "Regular" otherwise.
# The DataFrame is assigned before it is shown because show() returns None;
# chaining .show(5) onto the assignment would leave df_tagged bound to None.
df_tagged = df.withColumn(
    "CustomerType",
    when(col("Amount") > 800, "High-Value").otherwise("Regular"),
)
df_tagged.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|CustomerType|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|     Regular|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|  High-Value|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|         651.3804|     Regular|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|

In [33]:
# Task 6: Date & Time Functions
# Parse OrderDate and CustomerSince with the yyyy-MM-dd pattern so that both
# columns are of date type; df is rebound to the converted frame.
df = (
    df
    .withColumn("OrderDate", to_date(df["OrderDate"], "yyyy-MM-dd"))
    .withColumn("CustomerSince", to_date(df["CustomerSince"], "yyyy-MM-dd"))
)

In [34]:
# Derive OrderYear and OrderMonth from the OrderDate column
df_date = (
    df
    .withColumn("OrderYear", year("OrderDate"))
    .withColumn("OrderMonth", month("OrderDate"))
)
df_date.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+---------+----------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|OrderYear|OrderMonth|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+---------+----------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|     2024|        12|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|     2024|         9|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|         651.3804|     2025|         1|
|   3547|P

In [35]:
# Calculate loyalty in years: days between CustomerSince and today, divided by
# 365 and truncated to an integer (dividing by 365 ignores leap days, so the
# value is an approximation of whole years).
# The DataFrame is assigned before it is shown because show() returns None;
# chaining .show(5) onto the assignment would leave df_loyalty bound to None.
df_loyalty = df_date.withColumn(
    "LoyaltyYears",
    (datediff(current_date(), col("CustomerSince")) / 365).cast("int"),
)
df_loyalty.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+---------+----------+------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|OrderYear|OrderMonth|LoyaltyYears|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+---------+----------+------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|     2024|        12|           4|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|     2024|         9|           3|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet

In [36]:
# Task 7: Joins and Unions
# Small lookup table that assigns a Region to three of the cities
region_data = [
    ("Lake Joyside", "East"),
    ("New Jamesside", "West"),
    ("Lake Roberto", "Central"),
]
region_df = spark.createDataFrame(region_data, schema=["City", "Region"])
region_df.show()

+-------------+-------+
|         City| Region|
+-------------+-------+
| Lake Joyside|   East|
|New Jamesside|   West|
| Lake Roberto|Central|
+-------------+-------+



In [38]:
# Inner join on City: only orders whose city has a region mapping are kept
df_inner = df.join(region_df, ["City"], "inner")
df_inner.show(5)

+-------------+-------+-------------+---------------+------+----------+--------------+--------+-----------+-------------+-----------+-------+
|         City|OrderID| CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|FinalAmount| Region|
+-------------+-------+-------------+---------------+------+----------+--------------+--------+-----------+-------------+-----------+-------+
| Lake Joyside|   2824|Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|Credit Card|   2020-10-15|    665.584|   East|
|New Jamesside|   7912| Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|     Wallet|   2022-03-15|     877.85|   West|
| Lake Roberto|   4611| Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|   651.3804|Central|
+-------------+-------+-------------+---------------+------+----------+--------------+--------+-----------+-------------+-----------+-------+



In [39]:
# Left join on City: every order is kept; Region is null where no mapping exists
df_left = df.join(region_df, ["City"], "left")
df_left.show(5)

+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+-----------+-------+
|            City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|FinalAmount| Region|
+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+-----------+-------+
| Port Jesseville|   4150|   Amber Perez|          Books|352.37|2024-01-13|     Cancelled|    0.24|       Cash|   2022-01-13|   267.8012|   NULL|
|    Lake Joyside|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|Credit Card|   2020-10-15|    665.584|   East|
|    Lake Roberto|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|   651.3804|Central|
|West Melanieview|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|     Wallet|   2020-08-08

In [40]:
# Stack the orders placed in 2023 on top of the orders placed in 2024
df_2023 = df.where(year(col("OrderDate")) == 2023)
df_2024 = df.where(year(col("OrderDate")) == 2024)

# union() matches columns by position; both inputs come from df, so they align
df_union = df_2023.union(df_2024)
df_union.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|
+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+
|   2169|Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|10.567499999999999|
|   6313|   Patty Perez|      Groceries| 79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|           70.2504|
|   2040| Kyle Mcdonald|           Toys|327.52|2023-12-15|      Returned|    0.06|Lake Jenniferside|     Wallet|   2021-07-21|307.86879999999996|
|   6038| David Bradley|        Fashion|348.51|2023-08-03|      Returned|    0.23|    Lake Toddland|        UPI|   2022-09-0

In [41]:
# Task 8: Complex JSON Simulation
# Serialize every row of df into a JSON document (one string per row)
df_json = df.toJSON(use_unicode=True)

In [42]:
# Parse the JSON strings back into a DataFrame (the schema is inferred from the JSON)
df_loaded = spark.read.json(path=df_json)

In [43]:
# The reloaded rows are flat (no nested structure), so nothing needs exploding;
# just display a few of the round-tripped columns.
df_loaded.select(["CustomerName", "Amount", "DeliveryStatus"]).show()

+------------------+------+--------------+
|      CustomerName|Amount|DeliveryStatus|
+------------------+------+--------------+
|     Donald Walker|783.04|      Returned|
|      Brandon Hall| 905.0|     Cancelled|
|      Donald Booth|657.96|      Returned|
|    Phillip Garcia|606.89|      Returned|
|      Valerie Gray| 77.87|     Delivered|
|       Amber Perez|352.37|     Cancelled|
|        Roy Martin|148.33|     Cancelled|
|    Carolyn Daniel| 14.09|     Delivered|
|       Patty Perez| 79.83|     Cancelled|
|Jonathan Wilkerson|882.68|     Cancelled|
|       Kevin Hurst|870.55|     Delivered|
| Anthony Rodriguez|921.73|     Cancelled|
|     Kyle Mcdonald|327.52|      Returned|
|    Jeffrey Chavez|676.02|     Cancelled|
|  Elizabeth Fowler| 47.06|     Delivered|
|     Tammy Sellers| 46.15|     Cancelled|
|     David Bradley|348.51|      Returned|
|       John Pierce|362.09|      Returned|
|   Jennifer Powers|684.26|     Cancelled|
|    George Chapman|251.89|       Pending|
+----------

In [44]:
# Task 9: Applying Functions (UDF)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Function to tag orders
def order_tag(amount):
    """Classify an order amount into a size label.

    Args:
        amount: Numeric order amount, or None when the value is missing.

    Returns:
        "Big" when amount > 800, "Medium" when amount > 300, "Small"
        otherwise; None when amount is None.
    """
    # A null Amount reaches a Python UDF as None, and `None > 800` raises
    # TypeError; propagate the missing value instead of failing the job.
    if amount is None:
        return None
    if amount > 800:
        return "Big"
    if amount > 300:
        return "Medium"
    return "Small"

# Wrap order_tag as a Spark UDF that returns a string
order_tag_udf = udf(order_tag, returnType=StringType())

# Add an OrderSize column computed from Amount and display the result
df_tagged_orders = df.withColumn("OrderSize", order_tag_udf(df["Amount"]))
df_tagged_orders.select(["OrderID", "Amount", "OrderSize"]).show()


+-------+------+---------+
|OrderID|Amount|OrderSize|
+-------+------+---------+
|   2824|783.04|   Medium|
|   7912| 905.0|      Big|
|   4611|657.96|   Medium|
|   3547|606.89|   Medium|
|   8527| 77.87|    Small|
|   4150|352.37|   Medium|
|   5554|148.33|    Small|
|   2169| 14.09|    Small|
|   6313| 79.83|    Small|
|   6155|882.68|      Big|
|   9830|870.55|      Big|
|   9085|921.73|      Big|
|   2040|327.52|   Medium|
|   6573|676.02|   Medium|
|   2743| 47.06|    Small|
|   9837| 46.15|    Small|
|   6038|348.51|   Medium|
|   3060|362.09|   Medium|
|   4295|684.26|   Medium|
|   5061|251.89|    Small|
+-------+------+---------+
only showing top 20 rows

