In [None]:
# Importing libraries
import numpy as np               #NumPy :Array operations
import pandas as pd              #Pandas : Data manipulation
import matplotlib.pyplot as plt  #Matplotlib : Visualization  
import seaborn as sns            #Seaborn : Statistical Visualization
import mysql.connector           #MySQL Connector : Database interaction
 

In [None]:
# Load datasets
# Each CSV is read from the notebook's working directory into its own DataFrame.
stores_data = pd.read_csv('stores.csv')
sales_data = pd.read_csv('sales.csv')
products_data = pd.read_csv('products.csv')
# customers.csv is decoded as latin1 instead of the default utf-8
customer_data = pd.read_csv('customers.csv', encoding='latin1')
exchange_rates_data = pd.read_csv('exchange_rates.csv')

# Inspect datasets
# The dict must point at the frames loaded above; the previous short names
# (stores, sales, ...) were never defined and raised a NameError.
datasets = {
    'Stores': stores_data,
    'Sales': sales_data,
    'Products': products_data,
    'Customers': customer_data,
    'Exchange Rates': exchange_rates_data,
}

for name, data in datasets.items():
    print(f"Dataset: {name}")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly rather than wrapped in print() (which added a "None" line).
    data.info()
    print(data.head(), "\n")


In [None]:
# DATA CLEANING

In [None]:
# sales_data
# Parse 'Order Date' (month/day/4-digit-year strings) into real datetime64
# values. Unparseable entries become NaT because of errors='coerce'.
# The column is kept as datetime64 (not converted to Python date objects) so
# that the vectorised .dt accessors and date arithmetic remain available.
sales_data['Order Date'] = pd.to_datetime(
    sales_data['Order Date'], format="%m/%d/%Y", errors='coerce'
)

# Missing delivery dates are kept as NaT instead of being filled with the
# integer 0, which would mix numbers and dates in one column.
# NOTE(review): the format of 'Delivery Date' is not shown in this notebook;
# it is presumably the same as 'Order Date' - confirm against sales.csv.
sales_data['Delivery Date'] = pd.to_datetime(sales_data['Delivery Date'], errors='coerce')

sales_data.info()

In [None]:
# customer_data

# Customers with no 'State Code' get the explicit placeholder 'Unknown'
state_codes = customer_data['State Code']
customer_data['State Code'] = state_codes.fillna('Unknown')

# Parse 'Birthday' (month/day/4-digit-year) into datetime values
birthdays = customer_data['Birthday']
customer_data['Birthday'] = pd.to_datetime(birthdays, format='%m/%d/%Y')

# Report the number of nulls remaining in every column
remaining_nulls = customer_data.isnull().sum()
print(remaining_nulls)


In [None]:
#products_data
import re

# Matches a literal dollar sign or a thousands separator.
# Inside a character class "$" is literal; as a bare regex (the previous
# {"$": ""} mapping with regex=True) it is the end-of-string anchor, so the
# currency symbol was never removed and the float conversion failed.
CURRENCY_NOISE = re.compile(r"[$,]")

# Strip currency formatting from the price columns and convert them to float.
# astype(str) first keeps the cell safe to re-run after the columns are
# already numeric (the .str accessor only works on string data).
for price_column in ("Unit Cost USD", "Unit Price USD"):
    products_data[price_column] = (
        products_data[price_column]
        .astype(str)
        .str.replace(CURRENCY_NOISE, "", regex=True)
        .str.strip()
        .astype(float)
    )

# Optionally, check for missing values
print(products_data.isnull().sum())


In [None]:
#stores_data
# Fill missing 'Square Meters' values with the mean of the known values.
mean_square_meters = stores_data['Square Meters'].mean()
stores_data['Square Meters'] = stores_data['Square Meters'].fillna(mean_square_meters)

# Convert 'Open Date' to datetime (month-first parsing is the pandas default,
# so no dayfirst argument is needed).
stores_data['Open Date'] = pd.to_datetime(stores_data['Open Date'])

# Check data types and missing values after handling.
# info() prints its own report and returns None, so it is not wrapped in
# print(), which would append a stray "None" line to the output.
stores_data.info()

In [None]:
#Exchange_rates_data
# Rename the 'Currency' column to 'Currency Code' for consistency
exchange_rates_data.rename(columns={"Currency": "Currency Code"}, inplace=True)

# Convert 'Date' to datetime; unparseable entries become NaT (errors='coerce').
# The column is kept as datetime64 rather than being formatted back into an
# 'mm/dd/YYYY' string: string dates sort lexically and cannot be compared or
# joined with the datetime columns of the other tables. Apply strftime only
# at display/export time if a specific text format is required.
exchange_rates_data['Date'] = pd.to_datetime(exchange_rates_data['Date'], errors='coerce')

# Check data types and missing values after handling.
# info() prints its report itself and returns None, so it is called directly.
exchange_rates_data.info()

In [None]:
# Data Distribution & Summary Statistics

# Print describe() for each table, prefixed with the table's label
labelled_frames = [
    ("stores_data:", stores_data),
    ("sales_data:", sales_data),
    ("products_data:", products_data),
    ("customer_data:", customer_data),
    ("exchange_rates_data:", exchange_rates_data),
]
for label, frame in labelled_frames:
    print(label, frame.describe())


In [None]:
#Missing Data Handling

# Print the per-column null count of every table, in load order
for frame in (stores_data, sales_data, products_data, customer_data, exchange_rates_data):
    null_counts = frame.isnull().sum()
    print(null_counts)


In [None]:
#Convert Data Types
# 'Open Date' in stores_data -> datetime
open_dates = pd.to_datetime(stores_data['Open Date'])
stores_data['Open Date'] = open_dates

# 'Birthday' in customer_data -> datetime
birthdays_parsed = pd.to_datetime(customer_data['Birthday'])
customer_data['Birthday'] = birthdays_parsed

# 'Exchange' rate in exchange_rates_data -> float
exchange_as_float = exchange_rates_data['Exchange'].astype(float)
exchange_rates_data['Exchange'] = exchange_as_float


In [None]:
#Merge Datasets for Analysis
# Attach product details to every sale; the inner join keeps only sales
# whose 'ProductKey' exists in products_data.
sales_product_data = pd.merge(
    sales_data,
    products_data,
    on='ProductKey',
    how='inner',
)

# Preview the merged frame
sales_product_data.head()


In [None]:
# Attach customer details to the sales+product rows; the inner join keeps
# only rows whose 'CustomerKey' exists in customer_data.
full_sales_data = pd.merge(
    sales_product_data,
    customer_data,
    on='CustomerKey',
    how='inner',
)

# Preview the merged frame
full_sales_data.head()


In [None]:
# Attach store details so performance can be analysed per store; the inner
# join keeps only rows whose 'StoreKey' exists in stores_data.
sales_store_data = pd.merge(
    full_sales_data,
    stores_data,
    on='StoreKey',
    how='inner',
)

# Preview the merged frame
sales_store_data.head()


In [None]:
# Build the analysis frame from the fully merged table (sales + products +
# customers + stores). `full_data` was previously used without ever being
# defined, which raised a NameError on a fresh kernel. A copy is taken so the
# merged frame itself is left untouched and this cell can be re-run safely.
full_data = sales_store_data.copy()

# Compute SalesAmount as Quantity * Unit Price USD
full_data['SalesAmount'] = full_data['Quantity'] * full_data['Unit Price USD']

# Verify that the SalesAmount column is now present
print(full_data.columns)


In [None]:
# Sum SalesAmount per customer and rank customers from highest to lowest spend
total_sales_per_customer = (
    full_data
    .groupby('CustomerKey')['SalesAmount']
    .sum()
    .reset_index()
    .sort_values(by='SalesAmount', ascending=False)
)

# Show the ten biggest spenders
print("Total Sales per Customer:")
top_ten_customers = total_sales_per_customer.head(10)
print(top_ten_customers)


# Visualization

In [None]:
 # Set a refined style for all plots
sns.set(style="whitegrid")

# a. Total Sales by Store
# Aggregate SalesAmount per store. `total_sales_by_store` was previously
# plotted without ever being computed, which raised a NameError.
total_sales_by_store = (
    full_data
    .groupby('StoreKey')['SalesAmount']
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x='StoreKey', y='SalesAmount', data=total_sales_by_store, palette='Blues_d', ax=ax)
ax.set_title('Total Sales by Store', fontsize=16, weight='bold')
ax.set_xlabel('Store Key', fontsize=12)
ax.set_ylabel('Total Sales Amount (USD)', fontsize=12)
# Rotate the store labels so neighbouring keys do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
fig.tight_layout()
plt.show()


In [None]:
# Horizontal Bar chart for Top 10 Best-Selling Products
# Rank products by total SalesAmount. `top_selling_products` was previously
# plotted without ever being computed, which raised a NameError.
top_selling_products = (
    full_data
    .groupby('ProductKey')['SalesAmount']
    .sum()
    .reset_index()
    .sort_values(by='SalesAmount', ascending=False)
)
top10_products = top_selling_products.head(10)

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    x='SalesAmount',
    y='ProductKey',
    data=top10_products,
    # Both columns are numeric, so the orientation must be stated explicitly;
    # otherwise seaborn treats ProductKey as the value axis.
    orient='h',
    # Keep the bars in ranking order instead of sorted by numeric key.
    order=top10_products['ProductKey'].tolist(),
    palette='Greens_d',
    ax=ax,
)
ax.set_title('Top 10 Best-Selling Products', fontsize=16, weight='bold')
ax.set_xlabel('Total Sales Amount (USD)', fontsize=12)
ax.set_ylabel('Product Key', fontsize=12)
fig.tight_layout()
plt.show()


In [None]:
# c. Customer Spending Behavior (Total Sales per Customer)
# total_sales_per_customer is already sorted by spend (descending), so the
# first ten rows are the top ten customers.
top10_customers = total_sales_per_customer.head(10)

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    x='CustomerKey',
    y='SalesAmount',
    data=top10_customers,
    # CustomerKey is numeric, so seaborn would otherwise sort the bars by key;
    # an explicit order keeps them ranked by total spending.
    order=top10_customers['CustomerKey'].tolist(),
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Top 10 Customers by Total Spending', fontsize=16, weight='bold')
ax.set_xlabel('Customer Key', fontsize=12)
ax.set_ylabel('Total Sales Amount (USD)', fontsize=12)
# Rotate the customer labels so neighbouring keys do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
fig.tight_layout()
plt.show()

In [None]:
#Multivariate Analysis with Figure Size
# Columns compared in every plot of this cell
numeric_columns = ['SalesAmount', 'Quantity', 'Unit Price USD', 'Square Meters']

# Correlation heatmap (axes-level plot, so it draws into the figure created here)
fig, ax = plt.subplots(figsize=(10, 8))
correlation_matrix = full_data[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True, ax=ax)
ax.set_title('Correlation Heatmap', fontsize=16, weight='bold')
plt.show()

# Scatterplot Matrix (Pairplot) to visualize relationships between multiple continuous variables.
# pairplot is figure-level: it creates its own figure, so its size is set via
# `height` (inches per facet; 4 facets x 3 in = 12 in per side). A separate
# plt.figure(...) call would only produce an extra, empty figure.
sns.pairplot(full_data[numeric_columns], height=3)
plt.show()

# Linear regression plot between Sales Amount and Quantity.
# lmplot is figure-level as well; its size comes from `height` and `aspect`.
regression_grid = sns.lmplot(x='Quantity', y='SalesAmount', data=full_data, aspect=2, height=6)
# With no faceting variables the grid has a single Axes, exposed as `.ax`
regression_grid.ax.set_title('Sales Amount vs. Quantity Sold', fontsize=16, weight='bold')
plt.show()
