In [203]:
# import libraries
# read files
# check datatypes
# check for categorical value misspell
# Standardize categorical values (spelling, capitalization etc)
# Check for missing values
# Crosscheck all numerical values
# Check and modify column data types
# check for duplicates if necessary
# Use visualization for outliers
# Remove irrelevant columns
# Convert units to a consistent format

In [204]:
# import libraries
import pandas as pd

In [205]:
# Read all csv files
categories = pd.read_csv('category.csv')
customers = pd.read_csv('customers.csv')
departments = pd.read_csv('departments.csv')
orders = pd.read_csv('orders.csv')
shipment = pd.read_csv('shipment.csv')

In [206]:
# Sample of categories
categories.head()

Unnamed: 0.1,Unnamed: 0,Category Id,Category Name,Orders
0,46,2,Soccer,138
1,19,3,Baseball & Softball,632
2,49,4,Basketball,67
3,38,5,Lacrosse,343
4,39,6,Tennis & Racquet,328


In [207]:
# Create new column from old Unnamed: 0 column
categories['Rating by total orders'] = categories['Unnamed: 0']

In [208]:
# Check Category Name unique values
categories['Category Name'].unique()

array(['Soccer', 'Baseball & Softball', 'Basketball', 'Lacrosse',
       'Tennis & Racquet', 'Hockey', 'Cardio Equipment',
       'Strength Training', 'Fitness Accessories', 'Boxing & MMA',
       'Electronics', 'As Seen on  TV!', 'Cleats', "Men's Footwear",
       "Women's Apparel", "Girls' Apparel", 'Shop By Sport',
       "Men's Golf Clubs", "Women's Golf Clubs", 'Golf Apparel',
       'Golf Shoes', 'Golf Bags & Carts', 'Golf Gloves', 'Golf Balls',
       "Kids' Golf Clubs", 'Accessories', 'Trade-In', 'Camping & Hiking',
       'Hunting & Shooting', 'Fishing', 'Indoor/Outdoor Games',
       'Water Sports', 'Books ', 'Baby ', 'CDs ', 'Cameras ',
       "Children's Clothing", 'Computers', 'Consumer Electronics',
       'Crafts', 'DVDs', 'Garden', 'Health and Beauty', "Men's Clothing",
       'Music', 'Pet Supplies', 'Sporting Goods', 'Toys', 'Video Games',
       "Women's Clothing"], dtype=object)

In [209]:
# Check categories columns datatypes
categories.dtypes

Unnamed: 0                 int64
Category Id                int64
Category Name             object
Orders                     int64
Rating by total orders     int64
dtype: object

In [210]:
# Sample of customers
customers.head()

Unnamed: 0.1,Unnamed: 0,Customer Id,Customer Fname,Customer Lname,Customer City,Customer Country,Customer Segment,Customer State,Customer Street,Customer Zipcode,Order Id
0,0,20755,Cally,Holloway,Caguas,Puerto Rico,Consumer,PR,5365 Noble Nectar Island,725.0,77202
1,1,19492,Irene,Luna,Caguas,Puerto Rico,Consumer,PR,2679 Rustic Loop,725.0,75939
2,2,19491,Gillian,Maldonado,San Jose,EE. UU.,Consumer,CA,8510 Round Bear Gate,95125.0,75938
3,3,19490,Tana,Tate,Los Angeles,EE. UU.,Home Office,CA,3200 Amber Bend,90027.0,75937
4,4,19489,Orli,Hendricks,Caguas,Puerto Rico,Corporate,PR,8671 Iron Anchor Corners,725.0,75936


In [211]:
# Check customers columns datatype
customers.dtypes

Unnamed: 0            int64
Customer Id           int64
Customer Fname       object
Customer Lname       object
Customer City        object
Customer Country     object
Customer Segment     object
Customer State       object
Customer Street      object
Customer Zipcode    float64
Order Id              int64
dtype: object

In [212]:
# Check zipcode column for any similarity
trial = customers[customers['Customer Zipcode'] < 1000]
values = trial['Customer Country'].unique()
values 

array(['Puerto Rico'], dtype=object)

In [213]:
# Remove lagging '.0' from all values and add leading '00' to all values less than a 1000 in Customer Zipcode column
customers['Customer Zipcode'] = customers['Customer Zipcode'].astype(str)
numeric_values = pd.to_numeric(customers['Customer Zipcode'], errors='coerce')
condition = (numeric_values < 1000) & (~numeric_values.isna()) # values less than 100 and values that are not NA
customers.loc[condition,'Customer Zipcode'] = '00' + customers['Customer Zipcode']
customers['Customer Zipcode'] = customers['Customer Zipcode'].str.rstrip('.0')
customers['Customer Zipcode'].astype('object')
customers.head()

Unnamed: 0.1,Unnamed: 0,Customer Id,Customer Fname,Customer Lname,Customer City,Customer Country,Customer Segment,Customer State,Customer Street,Customer Zipcode,Order Id
0,0,20755,Cally,Holloway,Caguas,Puerto Rico,Consumer,PR,5365 Noble Nectar Island,725,77202
1,1,19492,Irene,Luna,Caguas,Puerto Rico,Consumer,PR,2679 Rustic Loop,725,75939
2,2,19491,Gillian,Maldonado,San Jose,EE. UU.,Consumer,CA,8510 Round Bear Gate,95125,75938
3,3,19490,Tana,Tate,Los Angeles,EE. UU.,Home Office,CA,3200 Amber Bend,90027,75937
4,4,19489,Orli,Hendricks,Caguas,Puerto Rico,Corporate,PR,8671 Iron Anchor Corners,725,75936


In [214]:
# Check customer state categorical values
customers['Customer State'].unique()

array(['PR', 'CA', 'NY', 'FL', 'MA', 'IL', 'MT', 'PA', 'MI', 'TX', 'DE',
       'GA', 'MD', 'OH', 'HI', 'NJ', 'WI', 'AZ', 'CO', 'MN', 'NC', 'NM',
       'OR', 'SC', 'VA', 'UT', 'WA', 'KY', 'WV', 'RI', 'CT', 'LA', 'TN',
       'DC', 'ND', 'MO', 'IN', 'ID', 'NV', 'KS', 'AR', 'OK', 'AL', 'IA',
       '95758', '91732'], dtype=object)

In [215]:
# Replace numerical value state and ZIP code
customers.loc[customers['Customer State'] == '91732', 'Customer State'] = 'CA'
customers.loc[customers['Customer State'] == '91732', 'Customer Zipcode'] = 91732

customers.loc[customers['Customer State'] == '95758', 'Customer State'] = 'CA'
customers.loc[customers['Customer State'] == '95758', 'Customer Zipcode'] = 95758


In [216]:
# Replace CA customer city categorical value 'CA'
customers[customers['Customer City'] == 'CA'] # Use to crosscheck values
customers.loc[customers['Customer City'] == 'CA', 'Customer Zipcode'] = '95758'
customers.loc[customers['Customer City'] == 'CA', 'Customer City'] = 'Elk Grove'

In [217]:
# Preview the departments table
departments.head()

Unnamed: 0.1,Unnamed: 0,Order Id,Department Id,Department Name,Latitude,Longitude
0,0,77202,2,Fitness,18.251453,-66.037056
1,1,75939,2,Fitness,18.279451,-66.037064
2,2,75938,2,Fitness,37.292233,-121.881279
3,3,75937,2,Fitness,34.125946,-118.291016
4,4,75936,2,Fitness,18.253769,-66.037048


In [218]:
departments.dtypes

Unnamed: 0           int64
Order Id             int64
Department Id        int64
Department Name     object
Latitude           float64
Longitude          float64
dtype: object

In [219]:
# Check categorical values for any errors
departments['Department Name'].unique()

array(['Fitness', 'Apparel', 'Golf', 'Footwear', 'Outdoors', 'Fan Shop',
       'Technology', 'Book Shop', 'Discs Shop', 'Pet Shop',
       'Health and Beauty '], dtype=object)

In [220]:
# Ensure all latitude values are in range
departments[(departments['Latitude'] < -90) | (departments['Latitude'] > 90)]

Unnamed: 0.1,Unnamed: 0,Order Id,Department Id,Department Name,Latitude,Longitude


In [221]:
# Ensure all longitude values are in range
departments[(departments['Longitude'] < -180) | (departments['Longitude'] > 180)]

Unnamed: 0.1,Unnamed: 0,Order Id,Department Id,Department Name,Latitude,Longitude


In [222]:
# Preview orders table
orders.head()

Unnamed: 0.1,Unnamed: 0,Order Id,Product Name,Order Item Id,Sales,Order Item Total,Order Item Discount,Order Item Discount Rate,Order Item Profit Ratio,Order Item Quantity,Order Status,Benefit per order,Type,order date (DateOrders),Market,Category Id
0,0,77202,Smart watch,180517,327.75,314.640015,13.11,0.04,0.29,1,COMPLETE,91.25,DEBIT,1/31/2018 22:56,Pacific Asia,73
1,1,75939,Smart watch,179254,327.75,311.359985,16.389999,0.05,-0.8,1,PENDING,-249.089996,TRANSFER,1/13/2018 12:27,Pacific Asia,73
2,2,75938,Smart watch,179253,327.75,309.720001,18.030001,0.06,-0.8,1,CLOSED,-247.779999,CASH,1/13/2018 12:06,Pacific Asia,73
3,3,75937,Smart watch,179252,327.75,304.809998,22.940001,0.07,0.08,1,COMPLETE,22.860001,DEBIT,1/13/2018 11:45,Pacific Asia,73
4,4,75936,Smart watch,179251,327.75,298.25,29.5,0.09,0.45,1,PENDING_PAYMENT,134.210007,PAYMENT,1/13/2018 11:24,Pacific Asia,73


In [223]:
# Check unique categorical values
orders['Type'].unique()

array(['DEBIT', 'TRANSFER', 'CASH', 'PAYMENT'], dtype=object)

In [224]:
# Check unique categorical values
orders['Order Status'].unique()

array(['COMPLETE', 'PENDING', 'CLOSED', 'PENDING_PAYMENT', 'CANCELED',
       'PROCESSING', 'SUSPECTED_FRAUD', 'ON_HOLD', 'PAYMENT_REVIEW'],
      dtype=object)

In [225]:
# Check unique categorical values
orders['Market'].unique()

array(['Pacific Asia', 'USCA', 'Africa', 'Europe', 'LATAM'], dtype=object)

In [226]:
# Check Data types
orders.dtypes

Unnamed: 0                    int64
Order Id                      int64
Product Name                 object
Order Item Id                 int64
Sales                       float64
Order Item Total            float64
Order Item Discount         float64
Order Item Discount Rate    float64
Order Item Profit Ratio     float64
Order Item Quantity           int64
Order Status                 object
Benefit per order           float64
Type                         object
order date (DateOrders)      object
Market                       object
Category Id                   int64
dtype: object

In [227]:
# Preview shipment table
shipment.head()

Unnamed: 0.1,Unnamed: 0,Order Id,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,shipping date (DateOrders),Shipping Mode
0,0,77202,3,4,Advance shipping,0,02/03/2018 22:56,Standard Class
1,1,75939,5,4,Late delivery,1,1/18/2018 12:27,Standard Class
2,2,75938,4,4,Shipping on time,0,1/17/2018 12:06,Standard Class
3,3,75937,3,4,Advance shipping,0,1/16/2018 11:45,Standard Class
4,4,75936,2,4,Advance shipping,0,1/15/2018 11:24,Standard Class


In [228]:
# Check unique categorical column values
shipment['Delivery Status'].unique()

array(['Advance shipping', 'Late delivery', 'Shipping on time',
       'Shipping canceled'], dtype=object)

In [229]:
# Check unique categorical column values
shipment['Shipping Mode'].unique()

array(['Standard Class', 'First Class', 'Second Class', 'Same Day'],
      dtype=object)

In [230]:
# Check column Datatypes
shipment.dtypes

Unnamed: 0                        int64
Order Id                          int64
Days for shipping (real)          int64
Days for shipment (scheduled)     int64
Delivery Status                  object
Late_delivery_risk                int64
shipping date (DateOrders)       object
Shipping Mode                    object
dtype: object

In [231]:
# Drop Unnamed: 0 column from every table
categories = categories.drop(columns='Unnamed: 0')
orders = orders.drop(columns='Unnamed: 0')
customers = customers.drop(columns='Unnamed: 0')
departments = departments.drop(columns='Unnamed: 0')
shipment = shipment.drop(columns='Unnamed: 0')

In [237]:
# Final check on a random column to verify
shipment.head()

Unnamed: 0,Order Id,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,shipping date (DateOrders),Shipping Mode
0,77202,3,4,Advance shipping,0,02/03/2018 22:56,Standard Class
1,75939,5,4,Late delivery,1,1/18/2018 12:27,Standard Class
2,75938,4,4,Shipping on time,0,1/17/2018 12:06,Standard Class
3,75937,3,4,Advance shipping,0,1/16/2018 11:45,Standard Class
4,75936,2,4,Advance shipping,0,1/15/2018 11:24,Standard Class


In [238]:
# Create new csv files for eda
categories.to_csv('clean_categories.csv')
orders.to_csv('clean_orders.csv')
customers.to_csv('clean_customers.csv')
departments.to_csv('clean_departments.csv')
shipment.to_csv('clean_shipment.csv')