# *TASK 1: Exploratory Data Analysis (EDA) and Business Insights*

**Importing necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**File paths**

In [None]:
# All three input CSVs live in the same Kaggle dataset directory; build the
# paths from the table names so the directory is spelled out only once.
customers_path, products_path, transactions_path = (
    f'/kaggle/input/dataset3/{table_name}.csv'
    for table_name in ('Customers', 'Products', 'Transactions')
)

**Loading datasets**

In [None]:
# Read the three source tables into DataFrames, in the same order as the
# path variables defined in the "File paths" cell.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, products_path, transactions_path)
)

**Preview datasets**

In [None]:
# Show the first five rows of each table under its own heading.
for heading, frame in (
    ("Customers Dataset:", customers),
    ("Products Dataset:", products),
    ("Transactions Dataset:", transactions),
):
    print(heading)
    print(frame.head(), "\n")

**Checking for missing values**

In [None]:
# Per-column count of null entries for each table.
null_reports = {
    "Missing Values in Customers Dataset:": customers,
    "Missing Values in Products Dataset:": products,
    "Missing Values in Transactions Dataset:": transactions,
}
for heading, frame in null_reports.items():
    null_counts = frame.isnull().sum()
    print(heading)
    print(null_counts, "\n")

**Checking data types**

In [None]:
# Report the dtype pandas assigned to every column of each table.
dtype_headings = (
    "Data Types in Customers Dataset:",
    "Data Types in Products Dataset:",
    "Data Types in Transactions Dataset:",
)
for heading, frame in zip(dtype_headings, (customers, products, transactions)):
    print(heading)
    print(frame.dtypes, "\n")

**Converting dates to datetime format**

In [None]:
# Parse the date columns into datetimes so later cells can use the `.dt`
# accessor on them (e.g. to bucket transactions by month).
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

**Summary statistics**

In [None]:
# Descriptive statistics as produced by DataFrame.describe() for the
# transactions table.
print("Summary Statistics for Transactions:")
transactions_summary = transactions.describe()
print(transactions_summary, "\n")

**Merging datasets**

In [None]:
# Enrich every transaction with the attributes of its customer and product.
# Left joins keep every transaction row even when a lookup key has no match.
# validate='many_to_one' makes pandas raise MergeError if CustomerID or
# ProductID is duplicated in the lookup table; without it a duplicated key
# would silently duplicate transaction rows and inflate any sum computed
# from merged_data.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left', validate='many_to_one')
    .merge(products, on='ProductID', how='left', validate='many_to_one')
)



**Exploratory Data Analysis (EDA)**

**1. Customer demographics**


In [None]:
# Number of customers per region, most common region first.
customer_region_counts = customers['Region'].value_counts()

# Bar chart drawn on an explicit Axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=customer_region_counts.index,
    y=customer_region_counts.values,
    ax=ax,
)
ax.set_title('Number of Customers by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Count')
# Tilt the region names on the x-axis by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

**2. Product category distribution**

In [None]:
# Number of catalogue products in each category, largest category first.
product_category_counts = products['Category'].value_counts()

plt.figure(figsize=(8, 5))
category_ax = sns.barplot(
    x=product_category_counts.index,
    y=product_category_counts.values,
)
category_ax.set(title='Product Categories', xlabel='Category', ylabel='Count')
# Tilt the category names on the x-axis by 45 degrees.
plt.setp(category_ax.get_xticklabels(), rotation=45)
plt.show()

**3. Transaction trends over time**

In [None]:
# Bucket each transaction into its calendar month, then total the
# transaction value per month.
merged_data['TransactionMonth'] = merged_data['TransactionDate'].dt.to_period('M')
transaction_trends = merged_data.groupby('TransactionMonth')['TotalValue'].sum()

# Line chart of the monthly totals, one marker per month.
fig, ax = plt.subplots(figsize=(12, 6))
transaction_trends.plot.line(ax=ax, marker='o')
ax.set_title('Transaction Trends Over Time')
ax.set_xlabel('Transaction Month')
ax.set_ylabel('Total Value ($)')
ax.grid()
plt.show()

**4. Top customers by total spending**

In [None]:
# Total spend per customer, largest first; keep the ten biggest spenders.
top_customers = (
    merged_data.groupby('CustomerID')['TotalValue']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates `palette` without `hue` (slated for removal in
# 0.14), so the customer IDs are assigned to `hue` explicitly.
# dodge=False keeps one full-width bar per customer on older seaborn
# versions, which would otherwise split each category by hue level.
ax = sns.barplot(
    x=top_customers.values,
    y=top_customers.index,
    hue=top_customers.index,
    palette='viridis',
    dodge=False,
)
# A hue legend would only repeat the y-axis labels; drop it if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title('Top 10 Customers by Total Spending')
plt.xlabel('Total Spending ($)')
plt.ylabel('Customer ID')
plt.show()

**5. Most sold products**

In [None]:
# Total units sold per product name, largest first; keep the top ten.
top_products = (
    merged_data.groupby('ProductName')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates `palette` without `hue` (slated for removal in
# 0.14), so the product names are assigned to `hue` explicitly.
# dodge=False keeps one full-width bar per product on older seaborn
# versions, which would otherwise split each category by hue level.
ax = sns.barplot(
    x=top_products.values,
    y=top_products.index,
    hue=top_products.index,
    palette='plasma',
    dodge=False,
)
# A hue legend would only repeat the y-axis labels; drop it if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title('Top 10 Most Sold Products')
plt.xlabel('Total Quantity Sold')
plt.ylabel('Product Name')
plt.show()