In [30]:
#1. Load both CSV files into separate Pandas DataFrames.
import pandas as pd
import numpy as np

# Read the two source files in a fixed order: customers first, then sales.
customers_df, sales_df = (
    pd.read_csv(csv_name)
    for csv_name in ("customers_200_rows.csv", "sales_200_rows.csv")
)

In [31]:
#2. Display the first 5 and last 5 rows of each DataFrame.
# Order of the printed previews: customers head, customers tail, sales head, sales tail.
print(
    customers_df.head(5),
    customers_df.tail(5),
    sales_df.head(5),
    sales_df.tail(5),
    sep="\n",
)

   customer_id                   name                        email  \
0         1001           Norma Fisher          ysullivan@yahoo.com   
1         1002           Susan Wagner  katelynmontgomery@yahoo.com   
2         1003  Dr. Stephanie Collins  thomas15@stewart-bowman.com   
3         1004           Joseph Brown    cortezraymond@garrett.com   
4         1005              Amy Stark         lindathomas@west.net   

                    country signup_date  
0                   Lesotho  2023-12-20  
1  United States of America  2024-09-16  
2                    Mexico  2024-06-22  
3                   Ecuador  2023-10-30  
4                 Venezuela  2024-07-11  
     customer_id             name                              email  \
195         1196  Robin Schroeder  robersonjulie@phillips-daniel.biz   
196         1197    Madison Hicks         williamsalexis@beasley.biz   
197         1198      Emily Weiss            vschneider@williams.com   
198         1199     Brandi Simon      

In [3]:
#3. Show the column names, data types, and check for null values in both datasets.
# For each frame: the per-column dtypes (indexed by column name), then the per-column null count.
print(
    customers_df.dtypes,
    customers_df.isna().sum(),
    sales_df.dtypes,
    sales_df.isna().sum(),
    sep="\n",
)

customer_id     int64
name           object
email          object
country        object
signup_date    object
dtype: object
customer_id    0
name           0
email          0
country        0
signup_date    0
dtype: int64
order_id            int64
customer_id         int64
order_date         object
product            object
category           object
quantity            int64
price_per_unit    float64
dtype: object
order_id          0
customer_id       0
order_date        0
product           0
category          0
quantity          0
price_per_unit    0
dtype: int64


In [5]:
#4. Convert the date columns ('signup_date' and 'order_date') to datetime objects.
# Each column is parsed with pandas' default datetime inference and written back in place.
customers_df['signup_date'] = customers_df['signup_date'].pipe(pd.to_datetime)
sales_df['order_date'] = sales_df['order_date'].pipe(pd.to_datetime)

In [9]:
#5. Calculate the total revenue for each order (quantity * price_per_unit) and create a new column
#   'total_amount'.
# Element-wise product, one value per sales row.
sales_df['total_amount'] = sales_df['quantity'].mul(sales_df['price_per_unit'])

In [12]:
#6. Merge the customers and sales datasets on 'customer_id'.
# Inner join with sales on the left: only sales rows whose customer_id also exists in
# customers_df survive, and customers without any sale do not appear in merged_df.
# NOTE(review): this uses sales_df/customers_df as they are at this point in the notebook;
# the de-duplication and negative-value filter of step 10 come later and do not touch merged_df.
merged_df = sales_df.merge(customers_df, how='inner', on='customer_id')

In [13]:
#7. Find the top 5 customers who spent the most overall.
# Sum total_amount per (customer_id, name) pair, keep the five largest sums,
# then turn the group keys back into ordinary columns.
top_customers = (
    merged_df
    .groupby(['customer_id', 'name'])['total_amount']
    .sum()
    .nlargest(n=5)
    .reset_index()
)
top_customers

Unnamed: 0,customer_id,name,total_amount
0,1100,Nicholas Wright PhD,8003.79
1,1071,Gerald Garcia,7976.91
2,1081,Kevin Fuller,7442.95
3,1009,Joanne Keller,7379.88
4,1052,Michael Anderson,5644.95


In [15]:
#8. Count how many customers are from each country.
# value_counts() orders countries from most to fewest customers; the index and the
# counts are then exposed as the 'country' and 'customer_count' columns.
country_counts = (
    customers_df['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='customer_count')
)
country_counts

Unnamed: 0,country,customer_count
0,Burkina Faso,4
1,Hungary,4
2,Zambia,4
3,Slovenia,4
4,Montenegro,3
...,...,...
127,Seychelles,1
128,Estonia,1
129,Australia,1
130,Samoa,1


In [17]:
#9. Calculate the average order value per customer.
# Mean of total_amount over each customer's rows in merged_df; customer_id stays a column.
avg_order_value = merged_df.groupby('customer_id', as_index=False).agg(
    average_order_value=('total_amount', 'mean'),
)
avg_order_value

Unnamed: 0,customer_id,average_order_value
0,1001,3159.960000
1,1002,1788.310000
2,1004,109.490000
3,1005,635.316667
4,1006,636.750000
...,...,...
81,1095,1497.450000
82,1096,441.826667
83,1097,59.485000
84,1099,49.990000


In [19]:
#10. Remove any duplicate records from both datasets.
# drop_duplicates() removes rows that repeat an earlier row in every column.
customers_df = customers_df.drop_duplicates()
sales_df = sales_df.drop_duplicates()
# Keep only sales rows whose quantity and unit price are both non-negative
# (rows where either value is missing also fail the comparison and are dropped).
sales_df = sales_df[(sales_df['quantity'] >= 0) & (sales_df['price_per_unit'] >= 0)]
# merged_df was built in step 6, before any of the cleaning above, so without a rebuild the
# later merged_df-based steps would still aggregate the rows removed here.
# Rebuild it from the cleaned frames using the same join as step 6. Columns that later
# steps add to merged_df are recreated when those steps run.
merged_df = pd.merge(sales_df, customers_df, on='customer_id', how='inner')
sales_df

Unnamed: 0,order_id,customer_id,order_date,product,category,quantity,price_per_unit,total_amount
0,5001,1071,2023-09-19,Tablet,Electronics,4,399.00,1596.00
1,5002,1035,2022-10-01,Headphones,Accessories,1,89.99,89.99
2,5003,1093,2023-04-01,Webcam,Accessories,1,59.00,59.00
3,5004,1057,2023-07-12,Smartphone,Electronics,1,599.00,599.00
4,5005,1100,2023-03-13,Laptop,Electronics,2,789.99,1579.98
...,...,...,...,...,...,...,...,...
195,5196,1011,2022-05-06,Printer,Electronics,3,199.99,599.97
196,5197,1045,2022-12-11,Keyboard,Accessories,1,49.99,49.99
197,5198,1052,2022-12-05,Laptop,Electronics,4,789.99,3159.96
198,5199,1051,2023-08-02,Mouse,Accessories,2,19.99,39.98


In [29]:
#11. Identify and handle any missing or invalid data (e.g., negative quantity or price).
# NOTE: the step-10 cell already keeps only sales rows with non-negative quantity and
# price, so once that cell has run the two "invalid" listings below are empty frames.
print("Missing values in sales_df:")
print(sales_df.isna().sum())

# Rows with a negative quantity.
print("Invalid (negative) quantities:")
print(sales_df.loc[sales_df['quantity'].lt(0)])

# Rows with a negative unit price.
print("\nInvalid (negative) prices:")
print(sales_df.loc[sales_df['price_per_unit'].lt(0)])

Missing values in sales_df:
order_id          0
customer_id       0
order_date        0
product           0
category          0
quantity          0
price_per_unit    0
total_amount      0
dtype: int64
Invalid (negative) quantities:
Empty DataFrame
Columns: [order_id, customer_id, order_date, product, category, quantity, price_per_unit, total_amount]
Index: []

Invalid (negative) prices:
Empty DataFrame
Columns: [order_id, customer_id, order_date, product, category, quantity, price_per_unit, total_amount]
Index: []


In [22]:
#12. Group the merged data by category and find: - Total quantity sold per category - Total revenue per category
# One output row per category; 'category' is kept as a regular column.
category_summary = merged_df.groupby('category', as_index=False).agg(
    total_quantity=('quantity', 'sum'),
    total_revenue=('total_amount', 'sum'),
)
category_summary


Unnamed: 0,category,total_quantity,total_revenue
0,Accessories,325,14257.55
1,Electronics,281,128645.31


In [23]:
#13. Create a new column that extracts the year and month from the 'order_date' and analyze monthly sales.
# 'year_month' holds a monthly Period (e.g. 2022-01) for every order row.
merged_df['year_month'] = merged_df['order_date'].dt.to_period(freq='M')
# Revenue per calendar month, in chronological order.
monthly_sales = merged_df.groupby('year_month', as_index=False)['total_amount'].sum()
monthly_sales

Unnamed: 0,year_month,total_amount
0,2022-01,4180.77
1,2022-02,8713.2
2,2022-03,5867.79
3,2022-04,4738.39
4,2022-05,7510.29
5,2022-06,5971.85
6,2022-07,7814.84
7,2022-08,11263.29
8,2022-09,5995.33
9,2022-10,5614.86


In [24]:
#14. Find customers who signed up in the last 6 months but haven't made any purchases.
# The cutoff is anchored at midnight of the run date. signup_date values are date-only
# (midnight), so comparing them with a time-of-day timestamp would wrongly exclude
# customers who signed up exactly six months ago today.
# NOTE: the window is relative to the day the notebook is run, so the result changes over time.
signup_cutoff = pd.Timestamp.now().normalize() - pd.DateOffset(months=6)
recent_customers = customers_df[customers_df['signup_date'] >= signup_cutoff]
# A customer "has purchased" when their id appears at least once in sales_df.
recent_no_purchases = recent_customers[~recent_customers['customer_id'].isin(sales_df['customer_id'])]
recent_no_purchases[['customer_id', 'name', 'signup_date']]


Unnamed: 0,customer_id,name,signup_date
31,1032,Bradley Robinson,2025-06-06
91,1092,Michael Bryant,2025-05-03
93,1094,Melanie Gomez,2025-02-08
105,1106,Carla Orozco,2025-05-30
129,1130,Frances Wilson,2025-05-26
131,1132,Traci Forbes,2025-03-08
151,1152,Michelle Nelson,2025-05-23
155,1156,Kelly Miller,2025-02-01
156,1157,Pamela Mcdonald,2025-05-06
158,1159,Amanda Freeman,2025-04-24


In [26]:
#15. Identify products that were sold less than 10 times in total (low performers).
# "Sold" is measured as summed quantity per product across all rows of sales_df.
product_sales = sales_df.groupby('product', as_index=False)['quantity'].sum()
low_performers = product_sales.loc[product_sales['quantity'].lt(10)]
low_performers

Unnamed: 0,product,quantity


In [27]:
#16. Create a summary report DataFrame with the following per customer:
#    - total_orders: number of distinct order_id values
#    - total_items: summed quantity
#    - total_spent: summed total_amount
#    - average_order_value: total_spent divided by total_orders
customer_summary = (
    merged_df
    .groupby(['customer_id', 'name'], as_index=False)
    .agg(
        total_orders=('order_id', 'nunique'),
        total_items=('quantity', 'sum'),
        total_spent=('total_amount', 'sum'),
    )
    .assign(average_order_value=lambda summary: summary['total_spent'] / summary['total_orders'])
)
customer_summary.head()

Unnamed: 0,customer_id,name,total_orders,total_items,total_spent,average_order_value
0,1001,Norma Fisher,1,4,3159.96,3159.96
1,1002,Susan Wagner,3,10,5364.93,1788.31
2,1004,Joseph Brown,3,10,328.47,109.49
3,1005,Amy Stark,3,9,1905.95,635.316667
4,1006,Juan Mann,2,6,1273.5,636.75


In [28]:
#17. Use NumPy to perform any custom operation (e.g., apply discount rule using vectorized operations).
# Discount rule: orders strictly above 1000 are charged 90% of total_amount;
# every other order keeps its amount (multiplier 1.0).
merged_df['discounted_amount'] = merged_df['total_amount'] * np.where(
    merged_df['total_amount'] > 1000, 0.9, 1.0
)
merged_df[['order_id', 'total_amount', 'discounted_amount']].head()

Unnamed: 0,order_id,total_amount,discounted_amount
0,5001,1596.0,1436.4
1,5002,89.99,89.99
2,5003,59.0,59.0
3,5004,599.0,599.0
4,5005,1579.98,1421.982
