In [7]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('cafe_sales.csv')

# Data Cleaning
# 1. Handle missing values
data['Item'] = data['Item'].fillna('Unknown')
data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
data['Price Per Unit'] = pd.to_numeric(data['Price Per Unit'], errors='coerce')
data['Total Spent'] = pd.to_numeric(data['Total Spent'], errors='coerce')
data['Payment Method'] = data['Payment Method'].fillna('Unknown')
data['Location'] = data['Location'].fillna('Unknown')
data['Transaction Date'] = pd.to_datetime(data['Transaction Date'], errors='coerce')

# Drop rows where essential numeric columns are still null
data.dropna(subset=['Quantity', 'Price Per Unit', 'Total Spent', 'Transaction Date'], inplace=True)

# 2. Fix invalid values
data = data[data['Total Spent'] > 0]  # Removing invalid Total Spent values
data = data[data['Quantity'] > 0]    # Removing invalid Quantity values

# 3. Remove duplicates
data.drop_duplicates(inplace=True)

# Descriptive Analysis
descriptive_stats = data[['Quantity', 'Price Per Unit', 'Total Spent']].describe()

# Sales Analysis
total_sales = data['Total Spent'].sum()
average_order_value = data['Total Spent'].mean()
top_items = data['Item'].value_counts().head(10)

# Trend Analysis
data['Month'] = data['Transaction Date'].dt.to_period('M')
sales_trends = data.groupby('Month')['Total Spent'].sum()

# Payment Preferences
payment_preferences = data['Payment Method'].value_counts()

# Location Insights
location_sales = data.groupby('Location')['Total Spent'].sum()

# Output results
print("Descriptive Statistics:\n", descriptive_stats)
print("\nTotal Sales:", total_sales)
print("Average Order Value:", average_order_value)
print("\nTop 10 Items:\n", top_items)
print("\nSales Trends:\n", sales_trends)
print("\nPayment Preferences:\n", payment_preferences)
print("\nLocation Sales:\n", location_sales)

# Save cleaned dataset
cleaned_file_path = 'cafe_sales_cleaned.csv'
data.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to {cleaned_file_path}")


Descriptive Statistics:
           Quantity  Price Per Unit  Total Spent
count  8159.000000     8159.000000  8159.000000
mean      3.018507        2.948707     8.910957
std       1.417945        1.276822     5.993071
min       1.000000        1.000000     1.000000
25%       2.000000        2.000000     4.000000
50%       3.000000        3.000000     8.000000
75%       4.000000        4.000000    12.000000
max       5.000000        5.000000    25.000000

Total Sales: 72704.5
Average Order Value: 8.91095722515014

Top 10 Items:
 Item
Juice       972
Coffee      969
Cake        943
Salad       937
Sandwich    915
Smoothie    888
Cookie      881
Tea         877
UNKNOWN     270
Unknown     266
Name: count, dtype: int64

Sales Trends:
 Month
2023-01    6055.5
2023-02    5725.5
2023-03    6089.5
2023-04    6048.0
2023-05    5923.5
2023-06    6399.5
2023-07    6063.0
2023-08    6279.5
2023-09    5865.0
2023-10    6250.5
2023-11    5993.5
2023-12    6011.5
Freq: M, Name: Total Spent, dtype: flo