## Import Required Libraries

In [30]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *
spark = SparkSession.builder.getOrCreate()



In [31]:
from google.colab import drive

# Mount Google Drive with a longer timeout
drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

Mounted at /content/drive


In [39]:
# File Path
path_customer_orders="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/customer-orders.csv"
path_order_items="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/order_items.csv"
path_orders="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/orders.csv"
path_products="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/products.csv"
path_stores="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/stores.csv"
path_customers="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/customers.csv"

path_advw_customer="/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/AdventureWorks Customer Lookup.csv"
path_advw_product_catagory="/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/AdventureWorks Product Categories Lookup.csv"



In [63]:
ls drive/MyDrive/'Colab Notebooks'/dataSet/AdventureWorksRawData/SalesData/

'AdventureWorks Sales Data 2020.csv'  'AdventureWorks Sales Data 2022.csv'
'AdventureWorks Sales Data 2021.csv'


## Define the Schema

In [33]:
df_product = spark.read.csv(path="/content/drive/MyDrive/Colab Notebooks/dataSet/customerOrder/products.csv", header=True, inferSchema=True)
df_product.printSchema()

root
 |-- PRODUCT_ID: integer (nullable = true)
 |-- PRODUCT_NAME: string (nullable = true)
 |-- UNIT_PRICE: double (nullable = true)



In [None]:
df_customer = spark.read.csv(path="/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/AdventureWorks Customer Lookup.csv", header=True, inferSchema=True)
df_customer.printSchema()
df_customer.show(5)

In [53]:
#Define the schema for the CSV file]

custom_schema = StructType([
    StructField("CustomerKey", IntegerType(), True),
    StructField("Prefix", StringType(), True),
    StructField("FirstName", StringType(), True),
    StructField("LastName", StringType(), True),
    StructField("BirthDate", DateType(), True),
    StructField("MaritalStatus", StringType(), True),
    StructField("Gender", StringType(), True),
    StructField("EmailAddress", StringType(), True),
    StructField("AnnualIncome", FloatType(), True),
    StructField("TotalChildren", IntegerType(), True),
    StructField("EducationLevel", StringType(), True),
    StructField("Occupation", StringType(), True),
    StructField("HomeOwner", StringType(), True)
])
# Read the CSV file with the custom schema
df_customers_data = spark.read.csv(path="/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/AdventureWorks Customer Lookup.csv", header=True, schema=custom_schema)
df_customers_data.show(5)

+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|
+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|      11000|   MR.|      JON|    YANG|1966-04-08|            M|     M|jon24@adventure-w...|     90000.0|            2|     Bachelors|Professional|        Y|
|      11001|   MR.|   EUGENE|   HUANG|1965-05-14|            S|     M|eugene10@adventur...|     60000.0|            3|     Bachelors|Professional|        N|
|      11002|   MR.|    RUBEN|  TORRES|1965-08-12|            M|     M|ruben35@adventure...|     60000.0|            3|     Bachelors|Professional|        Y|
|      11003|   MS.|  CHRISTY|     ZHU|1968-02-15|  

In [74]:
# Load Multiple CSV Files

path_sales_2020 = "/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/SalesData/AdventureWorks Sales Data 2020.csv"
path_sales_2022 = "/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/SalesData/AdventureWorks Sales Data 2022.csv"
path_sales_2021 = "/content/drive/MyDrive/Colab Notebooks/dataSet/AdventureWorksRawData/SalesData/AdventureWorks Sales Data 2021.csv"

file_paths = [path_sales_2020, path_sales_2022,path_sales_2021]
df_multi_file = spark.read.csv(file_paths, header=True, inferSchema=True)
df_multi_file.show()
# Print the schema of the DataFrame
df_multi_file.printSchema()


+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
| OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
|2022-01-01|2021-12-13|    SO61285|       529|      23791|           1|            2|            2|
|2022-01-01|2021-09-24|    SO61285|       214|      23791|           1|            3|            1|
|2022-01-01|2021-09-04|    SO61285|       540|      23791|           1|            1|            1|
|2022-01-01|2021-09-28|    SO61301|       529|      16747|           1|            2|            2|
|2022-01-01|2021-10-21|    SO61301|       377|      16747|           1|            1|            1|
|2022-01-01|2021-10-23|    SO61301|       540|      16747|           1|            3|            1|
|2022-01-01|2021-09-04|    SO61269|       215|      11792|           4|            1|            1|


DataFrame[OrderDate: date, StockDate: date, OrderNumber: string, ProductKey: int, CustomerKey: int, TerritoryKey: int, OrderLineItem: int, OrderQuantity: int]

In [75]:
# Display the DataFrame in a tabular format
display(df_multi_file)

DataFrame[OrderDate: date, StockDate: date, OrderNumber: string, ProductKey: int, CustomerKey: int, TerritoryKey: int, OrderLineItem: int, OrderQuantity: int]