In [106]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import abs, col, to_timestamp, when
from pyspark.sql.types import DoubleType

In [107]:
# Obtain the Spark session for this notebook (created on first use, reused afterwards),
# registered under the application name 'supplyChain'.
spark = (
    SparkSession.builder
    .appName('supplyChain')
    .getOrCreate()
)

In [108]:
# Load the five source CSV files, using the first row of each file as the column names.
# No schema is supplied here; the numeric and date columns are converted in later cells.
orders, categories, customers, departments, shipment = (
    spark.read.csv(csv_path, header=True)
    for csv_path in (
        'orders.csv',
        'category.csv',
        'customers.csv',
        'departments.csv',
        'shipment.csv',
    )
)

In [109]:
# Preview the first five rows of a dataframe (swap in another frame to inspect it).
shipment.show(n=5)

+---+--------+------------------------+-----------------------------+----------------+------------------+--------------------------+--------------+
|_c0|Order Id|Days for shipping (real)|Days for shipment (scheduled)| Delivery Status|Late_delivery_risk|shipping date (DateOrders)| Shipping Mode|
+---+--------+------------------------+-----------------------------+----------------+------------------+--------------------------+--------------+
|  0|   77202|                       3|                            4|Advance shipping|                 0|          02/03/2018 22:56|Standard Class|
|  1|   75939|                       5|                            4|   Late delivery|                 1|           1/18/2018 12:27|Standard Class|
|  2|   75938|                       4|                            4|Shipping on time|                 0|           1/17/2018 12:06|Standard Class|
|  3|   75937|                       3|                            4|Advance shipping|                 0|       

In [110]:
# Display every distinct value of a column so the values can be cross-checked by eye.
# The same check can be run on other frames by swapping the frame and column name, e.g.
#   orders / "Market", categories / "Category Name",
#   customers / "Customer State", departments / "Department Name".
#
# show() only prints and returns None, so the distinct-values DataFrame is stored first
# and displayed in a separate step; the variable then actually holds the unique values.
unique_values_shipment = shipment.select("Shipping Mode").distinct()

# Size the display by the number of distinct rows instead of counting the whole table,
# and disable truncation so long values are printed in full.
unique_values_shipment.show(unique_values_shipment.count(), truncate=False)

+--------------+
|Shipping Mode |
+--------------+
|First Class   |
|Same Day      |
|Second Class  |
|Standard Class|
+--------------+



In [111]:
# Replace the profit-ratio and benefit columns of `orders` with their absolute values,
# then preview two rows of the result.
# NOTE(review): abs() keeps the rows and flips negative figures to positive, so losses
# become indistinguishable from gains -- confirm this is the intended cleaning step.
orders = (
    orders
    .withColumn("Order Item Profit Ratio", abs(col("Order Item Profit Ratio")))
    .withColumn("Benefit per order", abs(col("Benefit per order")))
)
orders.show(n=2)

+---+--------+------------+-------------+------+----------------+-------------------+------------------------+-----------------------+-------------------+------------+-----------------+--------+-----------------------+------------+-----------+
|_c0|Order Id|Product Name|Order Item Id| Sales|Order Item Total|Order Item Discount|Order Item Discount Rate|Order Item Profit Ratio|Order Item Quantity|Order Status|Benefit per order|    Type|order date (DateOrders)|      Market|Category Id|
+---+--------+------------+-------------+------+----------------+-------------------+------------------------+-----------------------+-------------------+------------+-----------------+--------+-----------------------+------------+-----------+
|  0|   77202|Smart watch |       180517|327.75|     314.6400146|        13.10999966|             0.039999999|            0.289999992|                  1|    COMPLETE|            91.25|   DEBIT|        1/31/2018 22:56|Pacific Asia|         73|
|  1|   75939|Smart watc

In [112]:
# Some 'Customer State' entries hold a numeric code instead of a state abbreviation
# (presumably a zip code that slipped into the wrong column -- confirm against the data).
# Replace those codes with 'CA'; every other value is kept unchanged.
#
# The codes are written as strings so they are compared as text with the string column.
# Comparing against integers makes Spark implicitly cast every state value to a number,
# which produces NULL for values such as 'CA' and can fail outright when ANSI mode is on.
old_value = '95758'
new_value = 'CA'
old_value1 = '91732'
new_value1 = 'CA'

# Both replacements are applied in a single pass over the column.
customers = customers.withColumn(
    'Customer State',
    when(col('Customer State') == old_value, new_value)
    .when(col('Customer State') == old_value1, new_value1)
    .otherwise(col('Customer State')),
)

In [113]:
# Store customer zip codes as integers rather than float-formatted values.
customers = customers.withColumn(
    'Customer Zipcode',
    customers['Customer Zipcode'].cast('int'),
)

In [114]:
# Give the non-string columns of `orders` their real data types.
# Numeric columns are converted with a plain cast.
columns_to_cast = ['Order Id', 'Order Item Id', 'Sales', 'Order Item Total', 'Order Item Discount',
                   'Order Item Discount Rate', 'Order Item Profit Ratio', 'Order Item Quantity',
                   'Benefit per order', 'Category Id']
new_data_types = ['int', 'int', 'double', 'double', 'double', 'double', 'double', 'int', 'double', 'int']

for col_name, new_data_type in zip(columns_to_cast, new_data_types):
    orders = orders.withColumn(col_name, col(col_name).cast(new_data_type))

# The order date is text such as '1/31/2018 22:56'. A plain cast('timestamp') only
# understands ISO-style strings (yyyy-MM-dd ...), so it would report a timestamp dtype
# while turning the values into NULL. Parse with an explicit pattern instead.
# Single-letter fields (M, d, H) accept both padded and unpadded numbers.
# Assumes every row follows month/day/year hour:minute like the previewed rows -- TODO confirm.
# The dtype guard keeps the cell safe to re-run: an already converted column is left alone.
if dict(orders.dtypes)['order date (DateOrders)'] == 'string':
    orders = orders.withColumn(
        'order date (DateOrders)',
        to_timestamp(col('order date (DateOrders)'), 'M/d/yyyy H:mm'),
    )
orders.dtypes

[('_c0', 'string'),
 ('Order Id', 'int'),
 ('Product Name', 'string'),
 ('Order Item Id', 'int'),
 ('Sales', 'double'),
 ('Order Item Total', 'double'),
 ('Order Item Discount', 'double'),
 ('Order Item Discount Rate', 'double'),
 ('Order Item Profit Ratio', 'double'),
 ('Order Item Quantity', 'int'),
 ('Order Status', 'string'),
 ('Benefit per order', 'double'),
 ('Type', 'string'),
 ('order date (DateOrders)', 'timestamp'),
 ('Market', 'string'),
 ('Category Id', 'int')]

In [115]:
# Convert the id and order-count columns of `categories` to integers,
# then list the resulting column types.
columns_to_cast = ['Category Id', 'Orders']
new_data_types = ['int', 'int']
for col_name, new_data_type in zip(columns_to_cast, new_data_types):
    converted_column = categories[col_name].cast(new_data_type)
    categories = categories.withColumn(col_name, converted_column)
categories.dtypes

[('_c0', 'string'),
 ('Category Id', 'int'),
 ('Category Name', 'string'),
 ('Orders', 'int')]

In [116]:
# Convert the id and zip-code columns of `customers` to integers,
# then list the resulting column types.
columns_to_cast = ['Customer Id', 'Customer Zipcode', 'Order Id']
new_data_types = ['int', 'int', 'int']
for position, col_name in enumerate(columns_to_cast):
    new_data_type = new_data_types[position]
    customers = customers.withColumn(col_name, customers[col_name].cast(new_data_type))
customers.dtypes

[('_c0', 'string'),
 ('Customer Id', 'int'),
 ('Customer Fname', 'string'),
 ('Customer Lname', 'string'),
 ('Customer City', 'string'),
 ('Customer Country', 'string'),
 ('Customer Segment', 'string'),
 ('Customer State', 'string'),
 ('Customer Street', 'string'),
 ('Customer Zipcode', 'int'),
 ('Order Id', 'int')]

In [117]:
# Convert the id columns of `departments` to integers and the coordinates to doubles,
# then list the resulting column types.
columns_to_cast = ['Order Id', 'Department Id', 'Latitude', 'Longitude']
new_data_types = ['int', 'int', DoubleType(), DoubleType()]
for col_name, new_data_type in zip(columns_to_cast, new_data_types):
    departments = departments.withColumn(
        col_name,
        departments[col_name].cast(new_data_type),
    )
departments.dtypes

[('_c0', 'string'),
 ('Order Id', 'int'),
 ('Department Id', 'int'),
 ('Department Name', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double')]

In [118]:
# Give the non-string columns of `shipment` their real data types.
# Numeric columns are converted with a plain cast.
columns_to_cast = ['Order Id', 'Days for shipping (real)', 'Days for shipment (scheduled)',
                   'Late_delivery_risk']
new_data_types = ['int', 'int', 'int', 'int']
for col_name, new_data_type in zip(columns_to_cast, new_data_types):
    shipment = shipment.withColumn(col_name, col(col_name).cast(new_data_type))

# Shipping dates are text such as '1/18/2018 12:27' or '02/03/2018 22:56'. A plain
# cast('timestamp') only understands ISO-style strings (yyyy-MM-dd ...), so it would
# report a timestamp dtype while turning the values into NULL. Parse with an explicit
# pattern instead; single-letter fields (M, d, H) accept padded and unpadded numbers.
# Assumes every row follows month/day/year hour:minute like the previewed rows -- TODO confirm.
# The dtype guard keeps the cell safe to re-run: an already converted column is left alone.
if dict(shipment.dtypes)['shipping date (DateOrders)'] == 'string':
    shipment = shipment.withColumn(
        'shipping date (DateOrders)',
        to_timestamp(col('shipping date (DateOrders)'), 'M/d/yyyy H:mm'),
    )
shipment.dtypes

[('_c0', 'string'),
 ('Order Id', 'int'),
 ('Days for shipping (real)', 'int'),
 ('Days for shipment (scheduled)', 'int'),
 ('Delivery Status', 'string'),
 ('Late_delivery_risk', 'int'),
 ('shipping date (DateOrders)', 'timestamp'),
 ('Shipping Mode', 'string')]