# Shopping Mall Analysis

#### Import Packages

In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import plotly.figure_factory as ff

#### Read in the data

In [2]:
df = pd.read_csv('customer_shopping_data.csv')

df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon


## 1. Overall sales and revenue:

#### The total sales for each shopping mall:

In [3]:
sales_by_mall = df.groupby("shopping_mall")["price"].sum().reset_index()

fig = px.bar(sales_by_mall, x="shopping_mall", y="price",
             title="Total Sales by Shopping Mall",
             labels={"shopping_mall": "Shopping Mall", "price": "Total Sales"})
fig.show()

#### The total sales for each shopping mall, with the bars stacked by category:

In [4]:
sales_by_mall = df.groupby(["shopping_mall", "category"])["price"].sum().reset_index()
fig = px.bar(sales_by_mall, x="shopping_mall", y="price", color="category",
             title="Total Sales by Shopping Mall and Category",
             labels={"shopping_mall": "Shopping Mall", "price": "Total Sales", "category": "Category"},
             barmode="stack")
fig.show()

#### The total sales for each category and shopping mall:

In [5]:
sales_by_category_mall = df.groupby(["category", "shopping_mall"])["price"].sum().reset_index()

fig = px.bar(sales_by_category_mall, x="category", y="price", color="shopping_mall",
             title="Total Sales by Category and Shopping Mall",
             labels={"category": "Category", "price": "Total Sales", "shopping_mall": "Shopping Mall"})
fig.show()

#### The total sales by category and shopping mall:

In [6]:
sales_by_category_mall = df.groupby(["category", "shopping_mall"])["price"].sum().reset_index()

fig = px.imshow(sales_by_category_mall.pivot("category", "shopping_mall", "price"),
                color_continuous_scale="YlGnBu",
                title="Total Sales by Category and Shopping Mall",
                labels={"x": "Shopping Mall", "y": "Category", "color": "Total Sales"})
fig.show()


In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.



#### The total sales by month:

In [7]:
df["invoice_date"] = pd.to_datetime(df["invoice_date"])
monthly_sales = df.groupby(pd.Grouper(key="invoice_date", freq="M"))["price"].sum().reset_index()
fig = px.line(monthly_sales, x="invoice_date", y="price",
              title="Total Sales by Month",
              labels={"invoice_date": "Month", "price": "Total Sales"})
fig.show()


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



#### The monthly revenue by Shopping Mall:

In [8]:
df["invoice_date"] = pd.to_datetime(df["invoice_date"])  # Convert "invoice_date" column to datetime type

monthly_revenue_by_mall = df.groupby([pd.Grouper(key='invoice_date', freq='M'), "shopping_mall"])["price"].sum().reset_index()

fig = px.line(monthly_revenue_by_mall, x="invoice_date", y="price", color="shopping_mall",
              title="Monthly Revenue by Shopping Mall",
              labels={"invoice_date": "Date", "price": "Revenue", "shopping_mall": "Shopping Mall"})

fig.show()

## 2. Sales breakdown by category:

#### Breakdown of sales by category, broken down by gender:

In [9]:
sales_by_category_gender = df.groupby(["category", "gender"])["price"].sum().reset_index()

fig = px.sunburst(sales_by_category_gender, path=["gender", "category"], values="price",
                  title="Breakdown of Sales by Category and Gender",
                  labels={"gender": "Gender", "category": "Category", "price": "Total Sales"})

fig.update_layout(width=1000, height=800)

fig.show()

#### The total sales for each category, with the bars colored by payment method:

In [10]:
sales_by_category = df.groupby(["category", "payment_method"])["price"].sum().reset_index()

fig = px.bar(sales_by_category, x="category", y="price", color="payment_method",
             title="Total Sales by Category and Payment Method",
             labels={"category": "Category", "price": "Total Sales", "payment_method": "Payment Method"})
fig.show()

#### The distribution of price by category:


In [11]:
fig = px.box(df, x="category", y="price", color="category",
             title="Distribution of Price by Category",
             labels={"category": "Category", "price": "Price"})
fig.show()

#### Quantity sold by category:

> In this context, density refers to the probability density function of the distribution of quantity sold by category. It is a measure of how the probability mass is spread over the range of values for the quantity sold.


In [12]:
hist_data = [df[df["category"]=="Clothing"]["quantity"], df[df["category"]=="Technology"]["quantity"], df[df["category"]=="Shoes"]["quantity"], df[df["category"]=="Books"]["quantity"]]

group_labels = ["Clothing", "Technology", "Shoes", "Books"]

fig = ff.create_distplot(hist_data, group_labels, show_hist=False, show_rug=False)

fig.update_layout(title="Distribution of Quantity Sold by Category",
xaxis_title="Quantity",
yaxis_title="Density")

fig.show()

#### Sales by category and day of the week:

In [13]:
df["invoice_date"] = pd.to_datetime(df["invoice_date"])
df["day_of_week"] = df["invoice_date"].dt.day_name()

sales_by_category_day = df.groupby(["category", "day_of_week"])["price"].sum().reset_index()

fig = px.line(sales_by_category_day, x="day_of_week", y="price", color="category",
              title="Total Sales by Category and Day of the Week",
              labels={"day_of_week": "Day of the Week", "price": "Total Sales", "category": "Category"})

fig.show()


#### Average price per unit by category and shopping mall:

In [14]:
avg_price_by_category_mall = df.groupby(["category", "shopping_mall"])["price"].mean().reset_index()

fig = px.bar(avg_price_by_category_mall, x="category", y="price", color="shopping_mall",
             title="Average Price per Unit by Category and Shopping Mall",
             labels={"category": "Category", "price": "Average Price per Unit", "shopping_mall": "Shopping Mall"})

fig.show()

#### Average price per unit by category and payment method:

In [15]:
avg_price_by_category_payment = df.groupby(["category", "payment_method"])["price"].mean().reset_index()

fig = px.bar(avg_price_by_category_payment, x="category", y="price", color="payment_method",
             title="Average Price per Unit by Category and Payment Method",
             labels={"category": "Category", "price": "Average Price per Unit", "payment_method": "Payment Method"})

fig.show()


## 3. Customer demographics:

#### Customer ages, colored by gender:

In [16]:
fig = px.histogram(df, x="age", color="gender",
                   title="Age Distribution by Gender",
                   labels={"age": "Age", "gender": "Gender", "count": "Count"})
fig.show()


#### The distribution of customer ages:

In [17]:
fig = px.histogram(df, x="age", nbins=20,
                   title="Distribution of Customer Ages",
                   labels={"age": "Age", "count": "Number of Customers"})
fig.show()

#### Sales by age group and gender:

In [18]:
bins = [0, 18, 25, 35, 50, 65, np.inf]
labels = ["Under 18", "18-25", "25-35", "35-50", "50-65", "65+"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels)

sales_by_age_gender = df.groupby(["age_group", "gender"])["price"].sum().reset_index()

fig = px.bar(sales_by_age_gender, x="age_group", y="price", color="gender",
             title="Total Sales by Age Group and Gender",
             labels={"age_group": "Age Group", "price": "Total Sales", "gender": "Gender"})

fig.show()


## 4. Payment methods

#### The distribution of payment methods:

In [19]:
payment_methods = df["payment_method"].value_counts().reset_index()
payment_methods.columns = ["method", "count"]
fig = px.pie(payment_methods, values="count", names="method",
             title="Payment Method Distribution",
             labels={"count": "Count", "method": "Payment Method"})
fig.show()