## 📊 Introduction to Pandas and DataFrames

In [2]:
# ✅ What is Pandas?
# Pandas is a Python library for manipulating and analysing data.
# Its two core structures are the Series (1D) and the DataFrame (2D, like a table).
# 💡 Reach for a DataFrame whenever the data is tabular: CSV files, Excel sheets, API results, etc.

import pandas as pd

# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["New York", "Paris", "London"],
)

# Build the DataFrame; rows receive the default integer index 0..2.
df = pd.DataFrame(data=data)
print(df)

      Name  Age      City
0    Alice   25  New York
1      Bob   30     Paris
2  Charlie   35    London


In [3]:
# 📌 Accessing columns and rows
name_column = df["Name"]   # a single column, selected by its label (a Series)
first_row = df.loc[0]      # a row selected by index label
second_row = df.iloc[1]    # a row selected by integer position

print(name_column)
print(first_row)
print(second_row)


0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
Name      Bob
Age        30
City    Paris
Name: 1, dtype: object


In [4]:
# 🔄 Common DataFrame operations
n_rows_cols = df.shape              # (rows, columns) tuple
column_index = df.columns           # Index holding the column names
numeric_summary = df.describe()     # summary statistics
older_than_28 = df["Age"] > 28      # boolean mask, True where Age exceeds 28

print(n_rows_cols)
print(column_index)
print(numeric_summary)
print(df.loc[older_than_28])        # keep only the rows where the mask is True

(3, 3)
Index(['Name', 'Age', 'City'], dtype='object')
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0
      Name  Age    City
1      Bob   30   Paris
2  Charlie   35  London


In [1]:
import pandas as pd

# Load CSV (the relative path is resolved against the current working directory).
df = pd.read_csv("resources/amazon_sales_data_2025.csv")

# Basic Info
print("📌 Basic Info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() appended a stray "None" line
# after the report.
df.info()
print()


📌 Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Order ID           250 non-null    object
 1   Date               250 non-null    object
 2   Product            250 non-null    object
 3   Category           250 non-null    object
 4   Price              250 non-null    int64 
 5   Quantity           250 non-null    int64 
 6   Total Sales        250 non-null    int64 
 7   Customer Name      250 non-null    object
 8   Customer Location  250 non-null    object
 9   Payment Method     250 non-null    object
 10  Status             250 non-null    object
dtypes: int64(3), object(8)
memory usage: 21.6+ KB
None



In [7]:

# Preview the top of the table (DataFrame.head() with its default row count).
first_rows = df.head()

print("📊 First few rows:")
print(first_rows)
print()



📊 First few rows:
  Order ID      Date        Product     Category  Price  Quantity  \
0  ORD0001  14-03-25  Running Shoes     Footwear     60         3   
1  ORD0002  20-03-25     Headphones  Electronics    100         4   
2  ORD0003  15-02-25  Running Shoes     Footwear     60         2   
3  ORD0004  19-02-25  Running Shoes     Footwear     60         3   
4  ORD0005  10-03-25     Smartwatch  Electronics    150         3   

   Total Sales  Customer Name Customer Location Payment Method     Status  
0          180     Emma Clark          New York     Debit Card  Cancelled  
1          400  Emily Johnson     San Francisco     Debit Card    Pending  
2          120       John Doe            Denver     Amazon Pay  Cancelled  
3          180  Olivia Wilson            Dallas    Credit Card    Pending  
4          450     Emma Clark          New York     Debit Card    Pending  



In [8]:

# Descriptive statistics (count, mean, std, min, quartiles, max) from describe().
summary_stats = df.describe()

print("📈 Summary Statistics:")
print(summary_stats)
print()



📈 Summary Statistics:
             Price    Quantity  Total Sales
count   250.000000  250.000000   250.000000
mean    343.580000    2.856000   975.380000
std     380.635808    1.429489  1252.112254
min      15.000000    1.000000    15.000000
25%      40.000000    2.000000   100.000000
50%     150.000000    3.000000   400.000000
75%     600.000000    4.000000  1500.000000
max    1200.000000    5.000000  6000.000000



In [9]:
# Column labels converted from a pandas Index to a plain Python list.
column_names = df.columns.tolist()

print("🧾 Columns:")
print(column_names)
print()


🧾 Columns:
['Order ID', 'Date', 'Product', 'Category', 'Price', 'Quantity', 'Total Sales', 'Customer Name', 'Customer Location', 'Payment Method', 'Status']



In [None]:

# Number of distinct values in the "Product" column.
unique_product_count = df["Product"].nunique()
print(f"📦 Unique Products: {unique_product_count}")

📦 Unique Products: 10


In [None]:
# Total sales per category: group the rows by "Category", then sum "Total Sales".
rows_by_category = df.groupby("Category")
sales_by_category = rows_by_category["Total Sales"].sum()

print("🧪 Sales by Category:")
print(sales_by_category)
print()

🧪 Sales by Category:
Category
Books                1035
Clothing             3540
Electronics        129950
Footwear             4320
Home Appliances    105000
Name: Total Sales, dtype: int64



In [12]:
# How many orders fall under each distinct "Status" value.
status_counts = df["Status"].value_counts()

print("📋 Order Status Breakdown:")
print(status_counts)
print()

📋 Order Status Breakdown:
Status
Completed    88
Pending      85
Cancelled    77
Name: count, dtype: int64



In [13]:
# Top 5 customers by total purchase: sum "Total Sales" per customer,
# sort from largest to smallest, and keep the first five.
top_customers = (
    df.groupby("Customer Name")["Total Sales"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

print("🏆 Top Customers:")
print(top_customers)

🏆 Top Customers:
Customer Name
Olivia Wilson    36170
Jane Smith       31185
Emma Clark       29700
John Doe         26870
Emily Johnson    23475
Name: Total Sales, dtype: int64
