## Superstore Sales Data Cleaning.

#### Importing Python libraries for data analysis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Let pandas print up to 1000 rows before truncating displayed output.
pd.options.display.max_rows = 1000

#### Importing Superstore sales data into Python.

In [None]:
# Load the three Superstore workbooks (orders, people, returns) in one pass.
orders, people, returns = map(
    pd.read_excel,
    (
        'Superstore Sales Orders Data.xlsx',
        'Superstore Sales People Data.xlsx',
        'Superstore Sales Returns Data.xlsx',
    ),
)

#### Reading Superstore Sales orders data.

In [None]:
# Display the orders DataFrame as loaded from the Excel workbook.
orders

#### Getting information (shape & data types) on the orders data.

In [None]:
# Print the index range, column dtypes, non-null counts and memory usage.
orders.info()

#### Getting column names.

In [None]:
# List the column labels of the orders DataFrame.
orders.columns

### 1. Removal of duplicated rows.

#### Identifying duplicated rows.

In [None]:
# Count rows that repeat an earlier row on every column (all columns are
# compared here, 'Row ID' included).
orders.duplicated().sum()

In [None]:
# Number of distinct values in each column.
orders.nunique()

In [None]:
# Row ID holds only unique values, so it is left out of the columns used to detect duplicates.

In [None]:
# Columns compared when looking for duplicates ('Row ID' is not in the list).
dedup_columns = [
    'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
    'Customer ID', 'Customer Name', 'Segment', 'Country/Region', 'City',
    'State/Province', 'Postal Code', 'Region', 'Product ID', 'Category',
    'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
    'Profit',
]

# Show the rows that repeat an earlier row on all of the listed columns.
orders.loc[orders.duplicated(subset=dedup_columns)]

In [None]:
# Show every row belonging to two specific orders.
order_ids_to_inspect = ['US-2019-150119', 'CA-2019-153623']
orders.loc[orders['Order ID'].isin(order_ids_to_inspect)]

In [None]:
# Columns compared when looking for duplicates ('Row ID' is not in the list).
dedup_columns = [
    'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
    'Customer ID', 'Customer Name', 'Segment', 'Country/Region', 'City',
    'State/Province', 'Postal Code', 'Region', 'Product ID', 'Category',
    'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
    'Profit',
]

# Keep the first occurrence of each duplicated row and discard the repeats.
orders = orders.drop_duplicates(subset=dedup_columns, keep='first')

### 2. Data formatting & standardisation.

In [None]:
# Count the distinct 'Row ID' values.
orders['Row ID'].nunique()

In [None]:
# Distinct order IDs in ascending order.
(
    orders['Order ID']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct order dates in ascending order.
(
    orders['Order Date']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Convert 'Order Date' to datetime values using the month/day/year format.
orders['Order Date'] = pd.to_datetime(
    orders["Order Date"],
    format="%m/%d/%Y",
)

In [None]:
# Distinct ship dates in ascending order.
(
    orders['Ship Date']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Convert 'Ship Date' to datetime values using the month/day/year format.
orders['Ship Date'] = pd.to_datetime(
    orders["Ship Date"],
    format="%m/%d/%Y",
)

In [None]:
# Distinct ship modes in ascending order.
(
    orders['Ship Mode']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct states/provinces in ascending order.
(
    orders['State/Province']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct postal codes, in order of first appearance (not sorted).
orders['Postal Code'].unique()

In [None]:
# Distinct regions in ascending order.
(
    orders['Region']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct product IDs in ascending order.
(
    orders['Product ID']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct categories in ascending order.
(
    orders['Category']
    .sort_values()
    .drop_duplicates()
)

In [None]:
def left(x, n=3):
    """Return the first ``n`` characters of every string in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Series of strings (any object exposing the ``.str`` accessor).
    n : int, default 3
        Number of leading characters to keep. The default preserves the
        original three-character behaviour, so ``DataFrame.apply(left)``
        keeps working unchanged.

    Returns
    -------
    pandas.Series
        Series with the same index as ``x`` holding the leading characters;
        strings shorter than ``n`` are returned whole.
    """
    return x.str[:n]

# First three characters of each product ID and of its category, column by
# column (presumably to compare the ID prefix with the category name).
orders[['Product ID', 'Category']].apply(left, axis=0)

In [None]:
# Distinct sub-categories in ascending order.
(
    orders['Sub-Category']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct product names in ascending order.
(
    orders['Product Name']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct sales amounts in ascending order.
(
    orders['Sales']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct quantities in ascending order.
(
    orders['Quantity']
    .sort_values()
    .drop_duplicates()
)

In [None]:
# Distinct discount values in ascending order.
(
    orders['Discount']
    .sort_values()
    .drop_duplicates()
)

### 3. Imputation of blank or null values.

In [None]:
# Count the missing (NaN/NaT/None) values in each column.
orders.isna().sum()

In [None]:
# No blank or null values were found, so no imputation is needed.

### 4. Removal of duplicated and unusable columns.

In [None]:
# Remove the 'Row ID' column from the orders DataFrame.
orders = orders.drop(columns='Row ID')

### 5. Filtering the DataFrame.

In [None]:
# Keep only the orders placed in the United States.
# Take an explicit copy: new columns are assigned to this frame further down,
# and assigning to a boolean-filtered selection can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
orders = orders[orders['Country/Region'] == 'United States'].copy()

In [None]:
# Display the orders DataFrame in its current state.
orders

## Superstore Sales Data Analysis.

#### 1. Total Sales by Order Year.

In [None]:
# Calendar year of each order date.
orders["Order Year"] = orders["Order Date"].dt.year

# Sum of sales per order year, rounded to whole units.
(
    orders
    .groupby("Order Year")["Sales"]
    .sum()
    .round(0)
)

#### 2. Total Quantity by Order Year.

In [None]:
# Calendar year of each order date.
orders["Order Year"] = orders["Order Date"].dt.year

# Sum of quantities per order year.
(
    orders
    .groupby("Order Year")["Quantity"]
    .sum()
)

#### 3. Total Profit by Order Year.

In [None]:
# Calendar year of each order date.
orders["Order Year"] = orders["Order Date"].dt.year

# Sum of profit per order year, rounded to whole units.
# (This cell previously summed "Sales", repeating section 1 instead of
# reporting profit.)
orders.groupby("Order Year")["Profit"].sum().round(0)

#### 4. Total Orders by Order Year.

In [None]:
# Calendar year of each order date.
orders["Order Year"] = orders["Order Date"].dt.year

# Number of distinct order IDs per order year.
(
    orders
    .groupby("Order Year")["Order ID"]
    .nunique()
)

#### 5. Total Customers by Order Year.

In [None]:
# Calendar year of each order date.
orders["Order Year"] = orders["Order Date"].dt.year

# Number of distinct customer IDs per order year.
(
    orders
    .groupby("Order Year")["Customer ID"]
    .nunique()
)

#### 6. Total Sales by Order Month & Year.

In [None]:
# Month name (e.g. "January") of each order date.
orders["Order Month"] = orders["Order Date"].dt.month_name()

# Sum of sales per (year, month), rounded to whole units.
# Grouping on month names with the default sort lists them alphabetically
# (April, August, ...). Sorting the rows by date and grouping with sort=False
# keeps the groups in order of first appearance, i.e. chronologically.
(
    orders.sort_values("Order Date")
    .groupby(["Order Year", "Order Month"], sort=False)["Sales"]
    .sum()
    .round(0)
)

#### 7. Total Quantity by Order Month & Year.

In [None]:
# Month name (e.g. "January") of each order date.
orders["Order Month"] = orders["Order Date"].dt.month_name()

# Sum of quantities per (year, month).
# Grouping on month names with the default sort lists them alphabetically
# (April, August, ...). Sorting the rows by date and grouping with sort=False
# keeps the groups in order of first appearance, i.e. chronologically.
(
    orders.sort_values("Order Date")
    .groupby(["Order Year", "Order Month"], sort=False)["Quantity"]
    .sum()
)

#### 8. Total Profit by Order Month & Year.

In [None]:
# Month name (e.g. "January") of each order date.
orders["Order Month"] = orders["Order Date"].dt.month_name()

# Sum of profit per (year, month), rounded to whole units.
# Grouping on month names with the default sort lists them alphabetically
# (April, August, ...). Sorting the rows by date and grouping with sort=False
# keeps the groups in order of first appearance, i.e. chronologically.
(
    orders.sort_values("Order Date")
    .groupby(["Order Year", "Order Month"], sort=False)["Profit"]
    .sum()
    .round(0)
)

#### 9. Total Orders by Order Month & Year.

In [None]:
# Month name (e.g. "January") of each order date.
orders["Order Month"] = orders["Order Date"].dt.month_name()

# Number of distinct order IDs per (year, month).
# Grouping on month names with the default sort lists them alphabetically
# (April, August, ...). Sorting the rows by date and grouping with sort=False
# keeps the groups in order of first appearance, i.e. chronologically.
(
    orders.sort_values("Order Date")
    .groupby(["Order Year", "Order Month"], sort=False)["Order ID"]
    .nunique()
)

#### 10. Total Customers by Order Month & Year.

In [None]:
# Month name (e.g. "January") of each order date.
orders["Order Month"] = orders["Order Date"].dt.month_name()

# Number of distinct customer IDs per (year, month).
# Grouping on month names with the default sort lists them alphabetically
# (April, August, ...). Sorting the rows by date and grouping with sort=False
# keeps the groups in order of first appearance, i.e. chronologically.
(
    orders.sort_values("Order Date")
    .groupby(["Order Year", "Order Month"], sort=False)["Customer ID"]
    .nunique()
)