# Descriptive Analysis
___

In [2]:
import pandas as pd

In [3]:
# Read the pre-processed training data and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact location exists.
train_csv_path = r'D:\PycharmProjects\Pacmann\Probability\Retail_Sales_Analysis\data\train_processed.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,2017-11-08,2017-11-11,second class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,south,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,2017-11-08,2017-11-11,second class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,south,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,2017-06-12,2017-06-16,second class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,west,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,2016-10-11,2016-10-18,standard class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,south,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,2016-10-11,2016-10-18,standard class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,south,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


### 1. Set and Counting Analysis

Identify Best-Selling Products

* Grouping:

Group the data by Product ID or Name.

* Counting:

Sum the sales amount for each product (the dataset has no quantity column, so total sales value is the measure).

* Ranking:

Sort the products by total sales, from highest to lowest, to determine the best-selling products.

In [21]:
# Best-selling products: total sales amount per product name
product_sales = (
    data.groupby('Product Name')['Sales']
    .sum()
    .reset_index()
)

# Order from highest to lowest total, then attach a 1-based rank and round
# the totals to two decimals for display
product_sales_sorted = (
    product_sales
    .sort_values(by='Sales', ascending=False)
    .reset_index(drop=True)
    .assign(
        Rank=lambda ranked: ranked.index + 1,
        Sales=lambda ranked: ranked['Sales'].round(2),
    )
)

# Keep the ten highest-ranked products; the last expression renders the table
top_10_products = product_sales_sorted.head(10)
top_10_products

Unnamed: 0,Product Name,Sales,Rank
0,Canon imageCLASS 2200 Advanced Copier,61599.82,1
1,Fellowes PB500 Electric Punch Plastic Comb Bin...,27453.38,2
2,Cisco TelePresence System EX90 Videoconferenci...,22638.48,3
3,HON 5400 Series Task Chairs for Big and Tall,21870.58,4
4,GBC DocuBind TL300 Electric Binding System,19823.48,5
5,GBC Ibimaster 500 Manual ProClick Binding System,19024.5,6
6,Hewlett Packard LaserJet 3310 Copier,18839.69,7
7,HP Designjet T520 Inkjet Large Format Printer ...,18374.9,8
8,GBC DocuBind P400 Electric Binding System,17965.07,9
9,High Speed Automatic Electric Letter Opener,17030.31,10


Determining Peak Sales Period

* Time Grouping: Group sales data based on time periods (e.g., daily, weekly, monthly, yearly).

* Counting: Calculate the total sales for each period.

* Peak Identification: Determine the period with the highest sales.








In [14]:
# Parse 'Order Date' as datetime so the .dt accessors below are available
data['Order Date'] = pd.to_datetime(data['Order Date'])
order_date = data['Order Date']

# Derive the period columns used for grouping (year, month, week)
data['Year'] = order_date.dt.year
data['Month'] = order_date.dt.to_period('M')
data['Week'] = order_date.dt.to_period('W')

# Total sales per day, week, month and year
sales_daily = data.groupby(order_date.dt.date)['Sales'].sum()
sales_weekly = data.groupby('Week')['Sales'].sum()
sales_monthly = data.groupby('Month')['Sales'].sum()
sales_yearly = data.groupby('Year')['Sales'].sum()


def _peak(period_totals):
    """Return a (period label, total sales) tuple for the highest-selling period."""
    return period_totals.idxmax(), period_totals.max()


# Highest-selling period at each granularity, as (period, total) tuples
peak_daily = _peak(sales_daily)
peak_weekly = _peak(sales_weekly)
peak_monthly = _peak(sales_monthly)
peak_yearly = _peak(sales_yearly)

In [15]:
# Collect the (period, total) tuples in the same order as their labels
peaks = [peak_daily, peak_weekly, peak_monthly, peak_yearly]

# One row per granularity: its label, the peak period, and that period's total
peak_sales_df = pd.DataFrame({
    'Period': ['Daily', 'Weekly', 'Monthly', 'Yearly'],
    'Peak Period': [period for period, _total in peaks],
    'Total Sales': [total for _period, total in peaks],
})

In [17]:
# Round the peak totals to two decimals for display
rounded_totals = peak_sales_df['Total Sales'].round(2)
peak_sales_df['Total Sales'] = rounded_totals

In [18]:
# Display the peak period and its total sales for each time granularity
peak_sales_df

Unnamed: 0,Period,Peak Period,Total Sales
0,Daily,2015-03-18,28106.72
1,Weekly,2015-03-16/2015-03-22,37703.66
2,Monthly,2018-11,117938.16
3,Yearly,2018,722052.02


* Customer Segmentation:

Sales analysis based on customer segments.

* Geographical Analysis:

Identify geographic areas with the highest sales.








In [23]:
def _as_sales_table(totals, key_label):
    """Turn a Series of sales totals into a two-column frame.

    The index becomes a column named `key_label`; the values become a
    'Total Sales' column rounded to two decimals.
    """
    table = totals.reset_index()
    table.columns = [key_label, 'Total Sales']
    table['Total Sales'] = table['Total Sales'].round(2)
    return table


# Total sales per customer segment, highest first
segment_sales = (
    data.groupby('Segment')['Sales']
    .sum()
    .sort_values(ascending=False)
)

# Total sales per state, highest first
geographic_sales = (
    data.groupby('State')['Sales']
    .sum()
    .sort_values(ascending=False)
)

# Tabular versions for display
segment_sales_df = _as_sales_table(segment_sales, 'Customer Segment')
geographic_sales_df = _as_sales_table(geographic_sales, 'State')

In [24]:
# Display total sales per customer segment, highest first
segment_sales_df

Unnamed: 0,Customer Segment,Total Sales
0,Consumer,1148060.53
1,Corporate,688494.07
2,Home Office,424982.18


In [25]:
# Display total sales per state, highest first
geographic_sales_df

Unnamed: 0,State,Total Sales
0,California,446306.46
1,New York,306361.15
2,Texas,168572.53
3,Washington,135206.85
4,Pennsylvania,116276.65
5,Florida,88436.53
6,Illinois,79236.52
7,Michigan,76136.07
8,Ohio,75130.35
9,Virginia,70636.72


### 2. Descriptive Statistics

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Sales column
data['Sales'].describe()

count     9800.000000
mean       230.769059
std        626.651875
min          0.444000
25%         17.248000
50%         54.490000
75%        210.605000
max      22638.480000
Name: Sales, dtype: float64

In [28]:
# Mean (expectation) of the per-row sales amounts
sales_values = data['Sales']
mean_sales = sales_values.mean()

# Variance as the mean of squared deviations from the mean. Taking the mean
# divides by the number of observations, so this is the population variance,
# not the n-1 sample variance.
squared_deviations = (sales_values - mean_sales).pow(2)
variance_sales = squared_deviations.mean()

In [29]:
# Display the variance of Sales computed above
variance_sales

np.float64(392652.50156835176)