## E-Commerce Sales Data Cleaning.

#### Importing Python libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Opt in to pandas' future behaviour in which operations such as
# replace/fillna no longer silently downcast the result dtype.
pd.set_option('future.no_silent_downcasting', True)

#### Importing the e-commerce sales data into the Jupyter notebook.

In [None]:
# Load the raw sales workbook into the `orders` dataframe; the path is
# relative, so the file must sit in the notebook's working directory.
orders = pd.read_excel("E-Commerce Sales Data.xlsx")

#### Reading the orders dataframe

In [None]:
# Display the dataframe for a first visual check of the loaded data.
orders

#### Getting information (shape & data types) on the columns of the orders dataframe.

In [None]:
# Print column names, non-null counts and dtypes.
orders.info()

#### Renaming orders columns.

In [None]:
# Shorten the verbose source column names; columns not listed here keep
# their original names.
orders = orders.rename(
    columns={
        "Category Name": "Category",
        "Customer City": "City",
        "Customer Country": "Country",
        "Customer Fname": "Customer Name",
        "Customer Segment": "Segment",
        "Customer State": "State",
        "Customer Zipcode": "ZIP Code",
        "Order Region": "Region",
        "Order Quantity": "Quantity",
        "Profit Per Order": "Profit",
    }
)

#### Getting new column names.

In [None]:
# Confirm the column names after renaming.
orders.columns

### 1. Removal of duplicated rows.

In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): no drop_duplicates() call follows in this section -
# presumably the count was 0; confirm before relying on it.
orders.duplicated().sum()

### 2. Data formatting and standardisation.

In [None]:
# Inspect the distinct values of "Additional Order items" in sorted order.
orders["Additional Order items"].sort_values().unique()

In [None]:
# Inspect the distinct City values in sorted order to spot odd entries.
orders["City"].sort_values().unique()

In [None]:
# Show the rows whose City is the literal "CA"; these rows are handled in
# the imputation section further down.
orders[orders["City"] == "CA"]

In [None]:
# Inspect the distinct Country values in sorted order.
orders["Country"].sort_values().unique()

In [None]:
# Inspect the distinct Customer Id values in sorted order.
orders["Customer Id"].sort_values().unique()

In [None]:
# Inspect the distinct Market values in sorted order.
orders["Market"].sort_values().unique()

In [None]:
# Inspect the distinct Order Date values before converting the column.
orders["Order Date"].sort_values().unique()

In [None]:
# Convert Order Date to datetime, parsing with an explicit
# day-month-year ("%d-%m-%Y") format rather than letting pandas guess.
orders["Order Date"] = pd.to_datetime(orders["Order Date"], format="%d-%m-%Y")

In [None]:
# Inspect the distinct Order Id values in sorted order.
orders["Order Id"].sort_values().unique()

In [None]:
# Inspect the distinct Region values in sorted order.
orders["Region"].sort_values().unique()

In [None]:
# Relabel the "US Center" region as "Centre of USA"; only cells that
# equal "US Center" exactly are changed.
orders["Region"] = orders["Region"].replace("US Center", "Centre of USA")

In [None]:
# List each distinct (Region, Market) combination, ordered by Market,
# to see which regions belong to which market.
orders[["Region","Market"]].sort_values(by="Market").drop_duplicates()

In [None]:
# Spell out the abbreviated market labels; Market values not listed in
# the mapping are left as they are.
orders["Market"] = orders["Market"].replace(
    {
        "LATAM": "Latin America",
        "Pacific Asia": "Asia",
        "USCA": "North America",
    }
)

In [None]:
# Inspect the distinct Order Item Total values in sorted order.
orders["Order Item Total"].sort_values().unique()

In [None]:
# Inspect the distinct Quantity values in sorted order.
orders["Quantity"].sort_values().unique()

In [None]:
# Inspect the distinct Product Price values in sorted order.
orders["Product Price"].sort_values().unique()

In [None]:
# Inspect the distinct Profit Margin values before converting the column.
orders["Profit Margin"].sort_values().unique()

In [None]:
# Turn the percentage strings into numeric fractions: keep the text in
# front of the "%" sign, parse it as a float, then divide by 100
# (a value such as "12%" would become 0.12).
orders["Profit Margin"] = (
    orders["Profit Margin"]
    .str.split("%")
    .str[0]
    .astype(float)
    / 100
)

In [None]:
# Inspect the distinct Profit values in sorted order.
orders["Profit"].sort_values().unique()

In [None]:
# Inspect the distinct Sales values in sorted order.
orders["Sales"].sort_values().unique()

### 3. Imputation of blank/null values.

In [None]:
# Count missing values per column to decide what needs imputing.
orders.isna().sum()

In [None]:
# List the distinct "Additional Order items" values (sorted), keeping the
# original row index of each first occurrence.
orders["Additional Order items"].sort_values().drop_duplicates()

In [None]:
# Compare "Additional Order items" with Category side by side: one row per
# distinct pair, ordered by Category.
orders[["Additional Order items", "Category"]].sort_values(by="Category").drop_duplicates()

In [None]:
# Fill missing "Additional Order items" with the Category of the same row
# (fillna with a Series aligns on the row index).
orders["Additional Order items"] = orders["Additional Order items"].fillna(orders["Category"])

In [None]:
# Re-display the rows whose City is "CA" before repairing them.
orders[orders["City"].isin(["CA"])]

In [None]:
# Fill missing ZIP Codes with the State value of the same row.
# NOTE(review): presumably the affected rows hold the ZIP code in State
# (see the 91732 / 95758 check below) - confirm against the data.
orders["ZIP Code"] = orders["ZIP Code"].fillna(orders["State"])

In [None]:
# Cast ZIP Code to integers; this raises if any value is still missing
# or cannot be interpreted as a number.
orders["ZIP Code"] = orders["ZIP Code"].astype(int)

In [None]:
# Show the rows whose State holds 91732 or 95758, the same two values
# that are treated as ZIP Codes further down.
orders[orders["State"].isin([91732, 95758])]

In [None]:
# Blank out the two ZIP-like values (91732, 95758) found in the State column
# so they can be refilled afterwards.
# A single .loc assignment replaces the chained form
# orders["State"][mask] = ..., which raises SettingWithCopyWarning and does
# not modify `orders` at all when pandas Copy-on-Write is enabled.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
orders.loc[orders["State"].isin([91732, 95758]), "State"] = np.nan

In [None]:
# Check the "CA" city rows again after clearing their State values.
orders[orders["City"] == "CA"]

In [None]:
# Fill missing State values with the City value of the same row.
# NOTE(review): presumably this moves the misplaced "CA" from City into
# State; any other row with a missing State would also receive its City.
orders["State"] = orders["State"].fillna(orders["City"])

In [None]:
# Remove the literal "CA" from the City column, leaving those cells missing.
# A single .loc assignment replaces the chained form
# orders["City"][mask] = ..., which raises SettingWithCopyWarning and does
# not modify `orders` at all when pandas Copy-on-Write is enabled.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
orders.loc[orders["City"] == "CA", "City"] = np.nan

In [None]:
# Show the rows that now have no City.
orders[orders["City"].isna()]

In [None]:
# For ZIP Codes 91732 and 95758, list the distinct (City, ZIP Code) pairs
# to see which city names other rows record for those ZIP Codes.
orders[["City","ZIP Code"]][orders["ZIP Code"].isin([91732,95758])].drop_duplicates()

In [None]:
# Work on just the rows whose ZIP Code is 91732 or 95758.
new_orders = orders[orders["ZIP Code"].isin([91732,95758])]

In [None]:
# Self-join the (City, ZIP Code) pairs on ZIP Code so that every City
# (City_x) is paired with every City recorded for the same ZIP Code
# (City_y), then drop repeated pairings.
new_orders = new_orders[["City", "ZIP Code"]].merge(new_orders[["City", "ZIP Code"]], how="inner", on="ZIP Code").drop_duplicates()

In [None]:
# Keep only the pairings where the partner city (City_y) is present.
new_orders = new_orders[~new_orders["City_y"].isna()]

In [None]:
# Build a ZIP Code -> City lookup from the self-merge result before
# `new_orders` is overwritten below. City_y is non-null here because the
# previous cell filtered out missing values. If a ZIP Code is paired with
# more than one city, the first pairing encountered is used.
zip_to_city = new_orders.drop_duplicates(subset="ZIP Code").set_index("ZIP Code")["City_y"]

# Write the imputed city back into `orders`. Previously the fill was only
# computed on `new_orders`, so the missing City values stayed in `orders`.
# Only rows with a missing City whose ZIP Code is in the lookup change.
orders["City"] = orders["City"].fillna(orders["ZIP Code"].map(zip_to_city))

# Keep the original result: City_x with its gaps filled from City_y.
new_orders = new_orders["City_x"].fillna(new_orders["City_y"])

In [None]:
# Display the filled city values.
new_orders

### 4. Removal of redundant/irrelevant columns

In [None]:
# Remove the "Additional Order items" column from the dataframe.
orders = orders.drop(columns="Additional Order items")

In [None]:
# List the distinct (Customer Id, Order Customer Id) pairs, ordered by
# Customer Id, to compare the two columns by eye.
orders[["Customer Id", "Order Customer Id"]].sort_values(by="Customer Id").drop_duplicates()

In [None]:
# True if at least one row has a Customer Id that differs from its
# Order Customer Id; False means the two columns are identical row by row.
orders["Customer Id"].ne(orders["Order Customer Id"]).any()

In [None]:
# Remove the "Order Customer Id" column, which was compared against
# Customer Id in the cells above.
orders = orders.drop(columns="Order Customer Id")

## E-Commerce Sales Data Analysis.

#### 1. Key Performance Indicators