In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read by path.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Location of the Superstore workbook on the mounted Google Drive.
SUPERSTORE_XLS_PATH = "/content/drive/MyDrive/GENAI/Week2/Day5/US Superstore data.xls"

# Read the workbook, then wrap the result in a second DataFrame object that
# the rest of the notebook works on.
superstore_file = pd.read_excel(SUPERSTORE_XLS_PATH)
df_superstore = pd.DataFrame(superstore_file)
df_superstore.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_superstore.info()

In [None]:
# Discard rows that are exact duplicates of an earlier row.
df_superstore = df_superstore.drop_duplicates()

In [None]:
# Collect every row that falls outside the 1.5 * IQR fences of at least one
# numeric column. Each collected row is tagged (column "outliers") with the
# name of the column that flagged it.
# NOTE(review): every numeric column is screened, which may include
# identifier-like columns — confirm that is intended.
columns = df_superstore.select_dtypes(include='number').columns

outlier_frames = []

for col in columns:
    Q1 = df_superstore[col].quantile(0.25)
    Q3 = df_superstore[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (df_superstore[col] < lower_bound) | (df_superstore[col] > upper_bound)
    # .assign returns a new frame, so the tag is never written into a slice
    # of df_superstore (the original assignment raised SettingWithCopyWarning).
    outlier = df_superstore[is_outlier].assign(outliers=col)
    outlier_frames.append(outlier)

# Concatenate once instead of growing a frame inside the loop; fall back to an
# empty frame when there are no numeric columns at all.
outliers = pd.concat(outlier_frames) if outlier_frames else pd.DataFrame()
outliers = outliers.drop_duplicates()
outliers

In [None]:
# Remove every row flagged as an outlier. errors="ignore" keeps this cell
# re-runnable: on a second run the labels are already gone, and the default
# behaviour of drop() would raise KeyError.
df_superstore = df_superstore.drop(index=outliers.index, errors="ignore")
df_superstore.head()

In [None]:
# Snapshot the de-duplicated, outlier-free frame under its own name so later
# steps work on an independent copy.
df_superstore_cleaned = df_superstore.copy()

In [None]:
# Identifier columns are dropped before the analysis steps.
id_columns = ["Order ID", "Customer ID", "Product ID"]
df_superstore_reduced = df_superstore_cleaned.drop(columns=id_columns)
df_superstore_reduced.head()

In [None]:
# Derived features: profit margin in percent, plus the calendar year and month
# of the order date and of the ship date.
order_dates = df_superstore_reduced["Order Date"]
ship_dates = df_superstore_reduced["Ship Date"]

df_superstore_reduced = df_superstore_reduced.assign(**{
    "Profit Margin": df_superstore_reduced["Profit"] / df_superstore_reduced["Sales"] * 100,
    "Order Year": order_dates.dt.year,
    "Order Month": order_dates.dt.month,
    "Ship Year": ship_dates.dt.year,
    "Ship Month": ship_dates.dt.month,
})

df_superstore_reduced.head()

In [None]:
# The raw date columns are dropped now that year and month have been extracted.
date_columns = ["Order Date", "Ship Date"]
df = df_superstore_reduced.drop(columns=date_columns)
df.head()

Which states have the most sales?

In [None]:
# Total sales per state, ranked from largest to smallest; keep the leader only.
sales_by_state = df.groupby("State")["Sales"].sum()
most_sales = sales_by_state.sort_values(ascending=False).head(1)
most_sales

What is the difference between New York and California in terms of sales and profit?

In [None]:
# Keep only the two states being compared, then total sales and profit for each.
states_to_compare = ["New York", "California"]
filtered_df = df[df["State"].isin(states_to_compare)]
state_sales = filtered_df.groupby("State")[["Sales", "Profit"]].sum()
state_sales

Who is an outstanding customer in New York?

In [None]:
# Rank New York customers by their total sales and keep the first one.
is_new_york = df["State"] == "New York"
filtered_df = df[is_new_york]
ny_sales_per_customer = filtered_df.groupby("Customer Name")["Sales"].sum()
top_customer = ny_sales_per_customer.sort_values(ascending=False).head(1)
top_customer

Are there any differences among states in profitability?

In [None]:
import matplotlib.pyplot as plt
import ipywidgets as widgets


# Selector over the distinct values of the Category column; it drives the
# interactive chart further down in this cell.
category_dropdown = widgets.Dropdown(
    description='Category:',
    options=df['Category'].unique(),
)

def update_chart(selected_category):
    """Draw a bar chart of total profit per state for one product category.

    Parameters
    ----------
    selected_category
        Value compared against the ``Category`` column of the notebook-level
        ``df``; only matching rows are charted.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    filtered_df = df[df['Category'] == selected_category]

    grouped = filtered_df.groupby('State')['Profit'].sum().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    # label= gives the legend an entry; without a labelled artist the legend
    # call warns and draws nothing useful.
    ax.bar(grouped["State"], grouped["Profit"], label='Total Profit')
    # The plotted quantity is Profit, so the title and y-axis say so.
    ax.set_title(f'Profit per state for {selected_category}')
    ax.set_xlabel('State')
    ax.set_ylabel('Total Profit')
    ax.tick_params(axis='x', labelrotation=90)
    ax.legend(title=selected_category)
    plt.show()


# Bind the dropdown to update_chart; as the last expression of the cell, the
# resulting interactive widget is what gets displayed.
widgets.interactive(update_chart, selected_category=category_dropdown)

Can we apply the Pareto principle to customers and profit?

In [None]:
# Total profit per customer, most profitable customers first.
profit_per_customer = df.groupby('Customer Name')['Profit'].sum()
grouped = profit_per_customer.reset_index().sort_values(by='Profit', ascending=False)
grouped

In [None]:
# Renumber rows 0..n-1 so the index reflects each customer's rank.
grouped = grouped.reset_index(drop=True)

In [None]:
import numpy as np

# Cumulative profit and the matching share of customers, row by row.
n_customer = len(grouped)
# NOTE: despite its name, this holds the total *profit* in this cell.
total_sales = grouped['Profit'].sum()

running_profit = grouped["Profit"].cumsum()
grouped = grouped.assign(**{
    'cum_Profit': running_profit,
    # Column name (including its spelling) is read by the next cell.
    'cum_Pofit_%': running_profit / total_sales * 100,
    "customer_%": (grouped.index + 1) / n_customer * 100,
})
grouped

In [None]:
# First row at which the cumulative profit share reaches 80%; its customer_%
# value is the share of customers needed to get there.
reached_80_pct = grouped['cum_Pofit_%'] >= 80
grouped = grouped.loc[reached_80_pct].head(1)
grouped

No: the top 20% of customers do not account for 80% of the profit.

What are the top 20 cities by sales?

In [None]:
# Twenty cities with the highest total sales, renumbered 0..19.
sales_per_city = df.groupby('City')['Sales'].sum().reset_index()
top_sales_cities = sales_per_city.sort_values(by='Sales', ascending=False).head(20)
grouped = top_sales_cities.reset_index(drop=True)
grouped

In [None]:
# Twenty cities with the highest total profit, renumbered 0..19.
profit_per_city = df.groupby('City')['Profit'].sum().reset_index()
top_profit_cities = profit_per_city.sort_values(by='Profit', ascending=False).head(20)
grouped2 = top_profit_cities.reset_index(drop=True)
grouped2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Two stacked panels: the top-20 cities by sales (upper) and by profit (lower).
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 9))

ax1.scatter(x='City', y='Sales', data=grouped)
ax1.set_title('Sales by City')
ax1.set_ylabel('Sales')
# axis='x' limits the rotation to the city names; the default (both axes)
# also turned the numeric y tick labels sideways.
ax1.tick_params(axis='x', labelrotation=90)

ax2.scatter(x='City', y='Profit', data=grouped2)
ax2.set_title('Profit by City')
ax2.set_ylabel('Profit')
ax2.tick_params(axis='x', labelrotation=90)

plt.subplots_adjust(hspace=0.5)
plt.show()

These plots show that although Los Angeles generates the highest sales, it ranks only second in terms of profit, behind New York City. This pattern is observed in several other cities as well, where higher sales do not necessarily translate into higher profits — likely due to lower profit margins.

What are the Top 20 customers by Sales?

In [None]:
# Twenty customers with the highest total sales, largest first.
sales_by_customer = df.groupby("Customer Name")["Sales"].sum()
most_sales = sales_by_customer.sort_values(ascending=False).head(20)
most_sales

Plot the cumulative curve of sales by customer. Can we apply the Pareto principle to customers and sales?

In [None]:
# Cumulative share of sales over ALL customers, ranked from the largest to the
# smallest. The previous version plotted the raw (non-cumulative) totals of
# only the top 20 customers, which did not match the title.
ranked_customer_sales = df.groupby("Customer Name")["Sales"].sum().sort_values(ascending=False)
n_ranked_customers = len(ranked_customer_sales)
cum_sales_share = ranked_customer_sales.cumsum() / ranked_customer_sales.sum() * 100
customer_share = pd.RangeIndex(1, n_ranked_customers + 1) / n_ranked_customers * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(customer_share, cum_sales_share.to_numpy())
# Guide lines for the 80/20 rule: the curve passes through their crossing
# point only if 20% of customers generate 80% of sales.
ax.axhline(80, linestyle='--', color='grey')
ax.axvline(20, linestyle='--', color='grey')
ax.set_title('Cumulative curve in Sales by Customers')
ax.set_xlabel('Customers (%)')
ax.set_ylabel('Cumulative Sales (%)')
plt.show()

In [None]:
# Total sales per customer, biggest customers first.
sales_per_customer = df.groupby('Customer Name')['Sales'].sum()
grouped = sales_per_customer.reset_index().sort_values(by='Sales', ascending=False)
grouped

In [None]:
# Renumber rows 0..n-1 so the index reflects each customer's rank.
grouped = grouped.reset_index(drop=True)

In [None]:

# Cumulative sales and the matching share of customers, row by row.
n_customer = len(grouped)
total_sales = grouped['Sales'].sum()

running_sales = grouped["Sales"].cumsum()
grouped = grouped.assign(**{
    'cum_sales': running_sales,
    'cum_sales_%': running_sales / total_sales * 100,
    "customer_%": (grouped.index + 1) / n_customer * 100,
})
grouped

In [None]:
# First row at which the cumulative sales share reaches 80%; its customer_%
# value is the share of customers needed to get there.
reached_80_pct = grouped['cum_sales_%'] >= 80
grouped = grouped.loc[reached_80_pct].head(1)
grouped

No: the top 20% of customers do not account for 80% of the sales.

Based on the analysis, make decisions on which states and cities to prioritize for marketing strategies.

According to the analysis, it would be more effective from a marketing perspective to focus on New York City, Los Angeles, San Francisco, and Seattle, as these cities generate the highest sales and profits.