In [None]:
# Title : Analyzing Sales Data from Multiple File Formats

In [None]:
import json

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the CSV sales sample into a DataFrame; the file is decoded as Windows-1252 (cp1252).
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that were ever imported here.
csv = pd.read_csv("./datasets/sales_data_sample.csv", encoding="cp1252")

In [None]:
# Load the Excel sales sample into a DataFrame (read_excel reads the first sheet by default).
ed = pd.read_excel("./datasets/Sample-Sales-Data.xlsx")

In [None]:
# Parse the customers JSON file into plain Python objects (dicts / lists).
# NOTE(review): `json_data` is not referenced by any later cell visible in this notebook.
with open("./datasets/customers.json", "r") as json_file:
    json_data = json.loads(json_file.read())

In [None]:
# Preview the last five rows of the CSV frame (tail() defaults to n=5).
csv.tail()

In [None]:
# Print the CSV frame's column names, non-null counts, dtypes and memory usage.
csv.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric CSV columns.
csv.describe()

In [None]:
# Display a copy of the CSV frame with every row that contains a NaN removed.
# NOTE(review): the result is not assigned, so `csv` itself still holds all rows;
# every later cell therefore works on the uncleaned data — confirm this is intended.
csv.dropna()

In [None]:
# Display a copy of the CSV frame with fully duplicated rows removed.
# NOTE(review): the result is not assigned, so `csv` is left unchanged and any
# duplicate rows still feed the later aggregations — confirm this is intended.
csv.drop_duplicates()

In [None]:
# Preview the first five rows of the Excel frame (head() defaults to n=5).
ed.head()

In [None]:
# Preview the last five rows of the Excel frame (tail() defaults to n=5).
ed.tail()

In [None]:
# Print the Excel frame's column names, non-null counts, dtypes and memory usage.
ed.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric Excel columns.
ed.describe()

In [None]:
# Stack the CSV and Excel frames row-wise into one frame; ignore_index=True renumbers the rows.
# NOTE(review): later cells read SALES / ORDERNUMBER / PRODUCTLINE and Value / Sales_Rep_Name
# from this frame. Any column present in only one source is NaN for the other source's rows —
# confirm the two schemas are really meant to be stacked like this.
unified_data = pd.concat([csv, ed], ignore_index=True)

In [None]:
# Grand total of the SALES column over the combined frame.
# sum() skips NaN, so rows that carry no SALES value contribute nothing.
total_sales = unified_data.loc[:, 'SALES'].sum()
print("Total Sales:", total_sales)

In [None]:
# Mean SALES per ORDERNUMBER over the combined frame.
# NOTE(review): despite the name `category_sales`, the grouping key is the order number.
category_sales = (
    unified_data
    .groupby('ORDERNUMBER')['SALES']
    .mean()
)
category_sales

In [None]:
# Number of rows per product line, drawn as one bar per category.
# The chart is titled and labelled as a product-category distribution, so the
# counts are taken over PRODUCTLINE rather than over the numeric SALES amounts
# (which would produce one bar per distinct sale value).
# value_counts() skips NaN, so rows without a PRODUCTLINE value are left out.
category_counts = unified_data['PRODUCTLINE'].value_counts()

ax = category_counts.plot(kind='bar')
ax.set_title('Product Category Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Sum of the 'Value' column over the combined frame (NaN entries are skipped by sum()).
total_sales_value = unified_data['Value'].sum()
print("Total Sales:", total_sales_value)

# Mean 'Value' per sales representative.
# NOTE(review): this is only assigned — nothing in this cell displays or uses it,
# and it replaces the per-order `category_sales` computed in an earlier cell.
category_sales = unified_data.groupby('Sales_Rep_Name')['Value'].mean()

# How often each distinct 'Value' occurs, drawn as one bar per distinct value.
category_counts = unified_data['Value'].value_counts()
value_axes = category_counts.plot(kind='bar')
value_axes.set_title('Sales Value Distribution')
value_axes.set_xlabel('Sales Value')
value_axes.set_ylabel('Count')
plt.show()

In [None]:
# Total SALES per MONTH_ID from the CSV data, shown as a bar chart.
# Rows are grouped on MONTH_ID alone, so each bar pools every row sharing that month id.
monthly_sales = csv.groupby('MONTH_ID')['SALES'].sum()

# Title and axis labels so the figure can be read on its own.
ax = monthly_sales.plot(kind='bar')
ax.set_title('Total Sales by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales')
plt.show()


In [None]:
# Total SALES per PRODUCTLINE from the CSV data, as a two-column frame for seaborn.
# (pyplot and seaborn are imported once, in the notebook's top import cell.)
sales_by_product_line = csv.groupby('PRODUCTLINE')['SALES'].sum().reset_index()

# One explicit figure/axes pair so every call below targets the same plot.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=sales_by_product_line,
    x='PRODUCTLINE',
    y='SALES',
    hue='PRODUCTLINE',   # colour each bar by its own category so the palette applies
    palette='viridis',
    legend=False,        # hue repeats the x axis, so a legend would add nothing
    ax=ax,
)

ax.set_title('Total Sales by Product Line')
ax.set_xlabel('Product Line')
ax.set_ylabel('Total Sales')

# Slant the category names so long product-line labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()


In [None]:
# --- Figure 1: total SALES per PRODUCTLINE (CSV data only) ---
sales_by_product_line = csv.groupby('PRODUCTLINE')['SALES'].sum().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=sales_by_product_line, x='PRODUCTLINE', y='SALES', hue='PRODUCTLINE', palette='viridis', legend=False)

plt.title('Total Sales by Product Line')
plt.xlabel('Product Line')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Figure 2: spread of individual SALES values per year ---
# A box plot needs the row-level data (not the per-product-line totals above),
# grouped on the year column, so that the figure matches its title and axis labels.
# NOTE(review): assumes the CSV has a `YEAR_ID` column alongside `MONTH_ID` —
# confirm against `csv.columns` before relying on this cell.
plt.figure(figsize=(12, 6))
sns.boxplot(data=csv, x='YEAR_ID', y='SALES')

plt.title('Sales Distribution by Year')
plt.xlabel('Year')
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:

# Total SALES per PRODUCTLINE over the combined frame (index = product line).
sales_distribution = unified_data.groupby('PRODUCTLINE')['SALES'].sum()

# Square figure; grab its axes so the pie and its title are set on the same object.
pie_axes = plt.figure(figsize=(8, 8)).gca()
pie_axes.pie(
    sales_distribution,
    labels=sales_distribution.index,
    autopct='%1.1f%%',   # one decimal place on each wedge's percentage
    startangle=140,
)
pie_axes.set_title('Sales Distribution by Product Line')
pie_axes.axis('equal')   # equal aspect ratio keeps the pie circular
plt.show()


In [None]:
# List every column name of the combined frame (the union of the CSV and Excel columns).
print(unified_data.columns)


In [None]:
# Print the per-product-line SALES totals that back the pie chart.
print(sales_distribution)

In [None]:
# Here's a line-by-line explanation of the code provided:

# 1. **`csv.describe()`**  
#    - Outputs descriptive statistics for each numeric column of the `csv` DataFrame: count, mean, standard deviation, min, max, and the 25%/50%/75% quartiles (the 50% value is the median).

# 2. **`csv.dropna()`**  
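#    - Returns a copy of the `csv` DataFrame with every row that contains a missing value (NaN) removed. The result is only displayed, not assigned, so `csv` itself is left unchanged.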

# 3. **`ed.head()` and `ed.tail()`**  
#    - `ed.head()` displays the first five rows of the `ed` DataFrame, while `ed.tail()` displays the last five rows.

# 4. **`ed.info()`**  
#    - Shows information about the `ed` DataFrame, including the number of entries, column names, data types, and memory usage.

# 5. **`ed.describe()`**  
#    - Similar to `csv.describe()`, this outputs summary statistics for each numeric column in `ed`.

# 6. **`unified_data = pd.concat([csv, ed], ignore_index=True)`**  
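#    - Concatenates the `csv` and `ed` DataFrames row-wise (stacking one below the other) into a single DataFrame `unified_data`; `ignore_index=True` renumbers the rows so the index is unique across the combined data. Columns that exist in only one of the two frames are filled with NaN for the other frame's rows.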

# 7. **`total_sales = unified_data['SALES'].sum()`**  
#    - Calculates the total sum of the 'SALES' column in `unified_data`.

# 8. **`print("Total Sales:", total_sales)`**  
#    - Prints the total sales amount.

# 9. **`category_sales = unified_data.groupby('ORDERNUMBER')['SALES'].mean()`**  
#    - Groups `unified_data` by `ORDERNUMBER`, calculates the mean of `SALES` for each order, and stores it in `category_sales`.

# 10. **`category_counts = unified_data['SALES'].value_counts()`**  
#     - Counts occurrences of each unique sales value in the 'SALES' column and stores it in `category_counts`.

# 11. **`category_counts.plot(kind='bar')`**  
#     - Plots `category_counts` as a bar chart.

# 12. **`plt.title('Product Category Distribution')`**  
#     - Sets the title of the bar chart to "Product Category Distribution".

# 13. **`plt.xlabel('Category')` and `plt.ylabel('Count')`**  
#     - Labels the x-axis as "Category" and the y-axis as "Count".

# 14. **`plt.show()`**  
#     - Displays the bar chart.

# 15. **`total_sales_value = unified_data['Value'].sum()`**  
#     - Calculates the total sum of the 'Value' column in `unified_data`.

# 16. **`print("Total Sales:", total_sales_value)`**  
#     - Prints the total of the 'Value' column (note that it reuses the "Total Sales:" label).

# 17. **`category_sales = unified_data.groupby('Sales_Rep_Name')['Value'].mean()`**  
#     - Groups `unified_data` by `Sales_Rep_Name` and calculates the mean of 'Value' for each sales representative.

# 18. **`category_counts = unified_data['Value'].value_counts()`**  
#     - Counts occurrences of each unique value in the 'Value' column.

# 19. **`category_counts.plot(kind='bar')`**  
#     - Plots `category_counts` as a bar chart.

# 20. **`plt.title('Sales Value Distribution')`**  
#     - Sets the title to "Sales Value Distribution".

# 21. **`plt.xlabel('Sales Value')` and `plt.ylabel('Count')`**  
#     - Labels the x-axis as "Sales Value" and the y-axis as "Count".

# 22. **`plt.show()`**  
#     - Displays the bar chart.

# 23. **`csv.groupby('MONTH_ID')['SALES'].sum().plot(kind='bar')`**  
#     - Groups `csv` by 'MONTH_ID' and calculates the sum of 'SALES' for each month, then plots the result as a bar chart.

# 24. **`plt.show()`**  
#     - Displays the monthly sales bar chart.

# 25. **`import matplotlib.pyplot as plt` and `import seaborn as sns`**  
#     - Imports the matplotlib and seaborn libraries for plotting.

# 26. **`sales_by_product_line = csv.groupby('PRODUCTLINE')['SALES'].sum().reset_index()`**  
#     - Groups `csv` by 'PRODUCTLINE' and calculates the total sales for each product line, then resets the index for easier plotting.

# 27. **`plt.figure(figsize=(10, 6))`**  
#     - Creates a figure with dimensions 10x6 inches.

# 28. **`sns.barplot(data=sales_by_product_line, x='PRODUCTLINE', y='SALES', palette='viridis', hue='PRODUCTLINE')`**  
#     - Creates a bar plot of total sales by product line with the 'viridis' color palette, using product line names as hues for color distinction.

# 29. **`plt.title('Total Sales by Product Line')`**  
#     - Sets the title to "Total Sales by Product Line".

# 30. **`plt.xlabel('Product Line')` and `plt.ylabel('Total Sales')`**  
#     - Labels the x-axis as "Product Line" and the y-axis as "Total Sales".

# 31. **`plt.xticks(rotation=45)`**  
#     - Rotates the x-axis labels by 45 degrees for readability.

# 32. **`plt.tight_layout()`**  
#     - Adjusts layout to prevent label overlap.

# 33. **`plt.show()`**  
#     - Displays the bar plot.

# 34. **`sales_distribution = unified_data.groupby('PRODUCTLINE')['SALES'].sum()`**  
#     - Groups `unified_data` by 'PRODUCTLINE' and calculates the sum of 'SALES' for each product line.

# 35. **`plt.figure(figsize=(8, 8))`**  
#     - Creates a square figure for the pie chart.

# 36. **`plt.pie(sales_distribution, labels=sales_distribution.index, autopct='%1.1f%%', startangle=140)`**  
#     - Plots `sales_distribution` as a pie chart, showing percentages with one decimal place, starting the first wedge at 140 degrees.

# 37. **`plt.title('Sales Distribution by Product Line')`**  
#     - Sets the title to "Sales Distribution by Product Line".

# 38. **`plt.axis('equal')`**  
#     - Ensures the pie chart is a circle.

# 39. **`plt.show()`**  
#     - Displays the pie chart.

# 40. **`print(unified_data.columns)`**  
#     - Prints the column names of `unified_data`.

# 41. **`print(sales_distribution)`**  
#     - Prints the `sales_distribution` values for each product line.