# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset: read the CSV at the relative path below, decoding it as Latin-1
df = pd.read_csv(
    filepath_or_buffer='../data/ecommerce_data.csv',
    encoding='latin1',
)

# Display the first few rows as a quick look at the loaded table
print(df.head())

# Data cleaning, expressed as a single chained pipeline:
#   1. dropna() discards every row that has a missing value in any column;
#   2. InvoiceDate is converted to datetime values;
#   3. TotalSales is derived per row as Quantity * UnitPrice.
# Each lambda receives the already-filtered frame, so the steps run in that order.
df = df.dropna().assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
    TotalSales=lambda frame: frame['Quantity'] * frame['UnitPrice'],
)

# Exploratory Data Analysis (EDA)
# 1. Sales over time
# Index the frame by invoice timestamp so it can be resampled by calendar period.
# From here on df is indexed by InvoiceDate rather than carrying it as a column.
df = df.set_index('InvoiceDate')

# Bucket the rows by month ('M') and add up TotalSales within each bucket.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME' --
# confirm the target pandas version before switching.
monthly_sales = df['TotalSales'].resample('M').sum()

# Draw the monthly totals on a dedicated 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_sales.plot(ax=ax)
ax.set(title='Monthly Sales Over Time', xlabel='Date', ylabel='Total Sales')
plt.show()

# 2. Top-selling products
# Sum Quantity per product Description, order the totals from largest to
# smallest, and keep the first ten entries.
top_products = (
    df.groupby('Description')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Bar chart of those ten totals on a dedicated 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_products.plot.bar(ax=ax)
ax.set(title='Top 10 Selling Products', xlabel='Product', ylabel='Quantity Sold')
plt.show()

# 3. Customer segmentation by country
# Sum TotalSales per Country, order the totals from largest to smallest, and
# keep the first ten entries.
country_sales = (
    df.groupby('Country')['TotalSales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Bar chart of those ten totals on a dedicated 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
country_sales.plot.bar(ax=ax)
ax.set(title='Top 10 Countries by Sales', xlabel='Country', ylabel='Total Sales')
plt.show()