In [5]:
import pandas as pd

# Sample inventory: every keyword becomes a column, every list holds that column's values.
data = dict(
    Product=['Apples', 'Bananas', 'Cherries', 'Dates', 'Elderberries'],
    Price=[1.20, 0.50, 2.50, 3.00, 4.00],
    Quantity=[10, 20, 15, 5, 8],
    In_Stock=[True, True, False, True, False],
)

# The constructor is spelled pd.DataFrame (capital D and capital F).
df = pd.DataFrame(data)

# Sanity check: show the first five rows of the new frame.
print(df.head(5))

        Product  Price  Quantity  In_Stock
0        Apples    1.2        10      True
1       Bananas    0.5        20      True
2      Cherries    2.5        15     False
3         Dates    3.0         5      True
4  Elderberries    4.0         8     False


In [6]:
# head() takes the number of rows to show; here only the first three.
print(df.head(n=3))

    Product  Price  Quantity  In_Stock
0    Apples    1.2        10      True
1   Bananas    0.5        20      True
2  Cherries    2.5        15     False


In [7]:
# Inspect the dtype pandas assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)

Product      object
Price       float64
Quantity      int64
In_Stock       bool
dtype: object


In [8]:
# Selecting a single column returns a Series.
# Bracket notation is the recommended form.
price_column = df['Price']
print(price_column)

# Dot notation (df.Price) is the alternative way to reach the same column.

0    1.2
1    0.5
2    2.5
3    3.0
4    4.0
Name: Price, dtype: float64


In [9]:
# Build a True/False mask per row, then pass it inside the brackets to keep
# only the rows where the mask is True.
quantity_over_10 = df['Quantity'].gt(10)
filtered_df = df[quantity_over_10]
print(filtered_df)

    Product  Price  Quantity  In_Stock
1   Bananas    0.5        20      True
2  Cherries    2.5        15     False


In [10]:
# Vectorization: the two columns are multiplied element by element in one call,
# with no explicit loop. The result is stored as a new column on df.
df['Total_Value'] = df['Price'].mul(df['Quantity'])
print(df)

        Product  Price  Quantity  In_Stock  Total_Value
0        Apples    1.2        10      True         12.0
1       Bananas    0.5        20      True         10.0
2      Cherries    2.5        15     False         37.5
3         Dates    3.0         5      True         15.0
4  Elderberries    4.0         8     False         32.0


In [11]:
# Arithmetic mean of the Price column.
average_price = df.loc[:, 'Price'].mean()
print(average_price)

2.2399999999999998


In [12]:
# Order rows by Price; ascending=False puts the most expensive item first.
# sort_values returns a new frame, so df itself keeps its original order.
sorted_df = df.sort_values(
    'Price',
    ascending=False,
)
print(sorted_df)

        Product  Price  Quantity  In_Stock  Total_Value
4  Elderberries    4.0         8     False         32.0
3         Dates    3.0         5      True         15.0
2      Cherries    2.5        15     False         37.5
0        Apples    1.2        10      True         12.0
1       Bananas    0.5        20      True         10.0


In [13]:
# rename() returns a new DataFrame; re-binding `df` to that result keeps the
# new column name for later cells without inplace=True, which prevents
# method chaining.
df = df.rename(columns={'Product': 'Item_Name'})
print(df)

      Item_Name  Price  Quantity  In_Stock  Total_Value
0        Apples    1.2        10      True         12.0
1       Bananas    0.5        20      True         10.0
2      Cherries    2.5        15     False         37.5
3         Dates    3.0         5      True         15.0
4  Elderberries    4.0         8     False         32.0


In [33]:
import pandas as pd
import numpy as np

# Sales records with one incomplete row: the final entry has neither a Region nor a Sales value.
data = dict(
    Customer=['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Charlie'],
    Region=['North', 'North', 'South', 'North', 'South', np.nan],  # last value is missing
    Sales=[200, 150, 300, 400, 250, np.nan],  # last value is missing
)

df = pd.DataFrame(data)
print(df)

  Customer Region  Sales
0    Alice  North  200.0
1      Bob  North  150.0
2  Charlie  South  300.0
3    Alice  North  400.0
4      Bob  South  250.0
5  Charlie    NaN    NaN


In [15]:
# 1. Locate missing values: isnull() gives a same-shaped frame of True/False.
null_mask = df.isnull()
print(null_mask)

# 2. Replace missing Sales with 0. This overwrites the Sales column on df itself.
df['Sales'] = df['Sales'].fillna(value=0)
print(df)

   Customer  Region  Sales
0     False   False  False
1     False   False  False
2     False   False  False
3     False   False  False
4     False   False  False
5     False    True   True
  Customer Region  Sales
0    Alice  North  200.0
1      Bob  North  150.0
2  Charlie  South  300.0
3    Alice  North  400.0
4      Bob  South  250.0
5  Charlie    NaN    0.0


In [34]:
# Total sales per customer: bucket the rows by Customer, then add up Sales
# inside each bucket.
per_customer = df.groupby(by='Customer')
grouped_df = per_customer['Sales'].sum()
print(grouped_df)

Customer
Alice      600.0
Bob        400.0
Charlie    300.0
Name: Sales, dtype: float64


In [35]:
# String methods such as upper() are reached through the .str accessor of a Series.
region_upper = df['Region'].str.upper()
df['Region'] = region_upper
print(df)

  Customer Region  Sales
0    Alice  NORTH  200.0
1      Bob  NORTH  150.0
2  Charlie  SOUTH  300.0
3    Alice  NORTH  400.0
4      Bob  SOUTH  250.0
5  Charlie    NaN    NaN


In [18]:
# Frequency of each Region value, a quick way to summarise a categorical column.
# sort=True and dropna=True are the defaults, written out for clarity.
counts = df['Region'].value_counts(sort=True, dropna=True)
print(counts)

Region
NORTH    3
SOUTH    2
Name: count, dtype: int64


In [19]:
# Write df to disk. index=False leaves out the 0,1,2,3... row labels so the
# file contains only the data columns.
df.to_csv(
    'sales_data.csv',
    index=False,
)
print("File saved!")

File saved!


In [20]:
import pandas as pd

# Transactions table: one row per order, linked to a customer by Customer_ID.
orders_data = dict(
    Order_ID=[101, 102, 103, 104],
    Date=['2023-01-15', '2023-02-10', '2023-01-20', '2023-03-05'],
    Customer_ID=[1, 2, 1, 3],
    Amount=[500, 300, 450, 200],
)

# Customer lookup table: one row per Customer_ID.
customers_data = dict(
    Customer_ID=[1, 2, 3],
    Name=['Alice', 'Bob', 'Charlie'],
    Country=['USA', 'UK', 'Canada'],
)

orders = pd.DataFrame(orders_data)
customers = pd.DataFrame(customers_data)

# Show each table under its own heading.
print("Orders Table:", orders, sep="\n")
print("\nCustomers Table:", customers, sep="\n")

Orders Table:
   Order_ID        Date  Customer_ID  Amount
0       101  2023-01-15            1     500
1       102  2023-02-10            2     300
2       103  2023-01-20            1     450
3       104  2023-03-05            3     200

Customers Table:
   Customer_ID     Name Country
0            1    Alice     USA
1            2      Bob      UK
2            3  Charlie  Canada


In [21]:
# pd.to_datetime parses the date strings into real datetime values.
parsed_dates = pd.to_datetime(orders['Date'])
orders['Date'] = parsed_dates

# Confirm the conversion: Date should now report 'datetime64[ns]' rather than 'object'.
print(orders.dtypes)

Order_ID                int64
Date           datetime64[ns]
Customer_ID             int64
Amount                  int64
dtype: object


In [22]:
# Datetime columns expose their parts through the .dt accessor, just as text
# columns use .str. Here the month number is pulled out into its own column.
order_dates = orders['Date']
orders['Month'] = order_dates.dt.month
print(orders)

   Order_ID       Date  Customer_ID  Amount  Month
0       101 2023-01-15            1     500      1
1       102 2023-02-10            2     300      2
2       103 2023-01-20            1     450      1
3       104 2023-03-05            3     200      3


In [23]:
# 'on' names the key column shared by both tables.
# how='inner' is the pandas default, spelled out here: only orders whose
# Customer_ID also exists in `customers` are kept.
# validate='many_to_one' makes pandas raise a MergeError if a Customer_ID occurs
# more than once in `customers`, which would otherwise silently duplicate
# order rows and inflate any later totals.
merged_df = pd.merge(
    orders,
    customers,
    on='Customer_ID',
    how='inner',
    validate='many_to_one',
)
print(merged_df)

   Order_ID       Date  Customer_ID  Amount  Month     Name Country
0       101 2023-01-15            1     500      1    Alice     USA
1       102 2023-02-10            2     300      2      Bob      UK
2       103 2023-01-20            1     450      1    Alice     USA
3       104 2023-03-05            3     200      3  Charlie  Canada


In [24]:
# Step 1: keep only the rows whose Country is USA.
is_usa = merged_df['Country'].eq('USA')
usa_orders = merged_df[is_usa]

# Step 2: add up the Amount of those rows.
total_usa = usa_orders['Amount'].sum()

print("Total USA Sales:", total_usa)

Total USA Sales: 950


In [25]:
import pandas as pd
import numpy as np

# Revenue log; the second record has no Revenue value.
data = dict(
    Date=['2023-01-05', '2023-01-20', '2023-02-14', '2023-01-25', '2023-02-28'],
    Product=['Gold', 'Silver', 'Gold', 'Silver', 'Gold'],
    Revenue=[1000, np.nan, 2000, 1500, 500],  # note the missing value
)

df = pd.DataFrame(data)
print(df)

         Date Product  Revenue
0  2023-01-05    Gold   1000.0
1  2023-01-20  Silver      NaN
2  2023-02-14    Gold   2000.0
3  2023-01-25  Silver   1500.0
4  2023-02-28    Gold    500.0


In [27]:
# Step 1: treat missing Revenue as 0.
df['Revenue'] = df['Revenue'].fillna(value=0)

# Step 2: parse the Date strings so the .dt accessor can be used.
df['Date'] = pd.to_datetime(df['Date'])

# Steps 3 & 4: keep rows that are both Gold AND dated in January (month 1),
# then total their Revenue.
is_gold = df['Product'].eq('Gold')
is_january = df['Date'].dt.month.eq(1)
jan_gold_sales = df[is_gold & is_january]

total = jan_gold_sales['Revenue'].sum()

print("Total Gold Revenue in Jan:", total)

Total Gold Revenue in Jan: 1000.0


In [28]:
import pandas as pd

# Raw sales records: one row per line item sold.
data = dict(
    Date=['2023-03-01', '2023-03-01', '2023-03-02', '2023-03-02', '2023-03-03', '2023-04-01', '2023-04-02', '2023-04-02'],
    Item=['Latte', 'Muffin', 'Espresso', 'Latte', 'Tea', 'Latte', 'Muffin', 'Espresso'],
    Category=['Coffee', 'Bakery', 'Coffee', 'Coffee', 'Tea', 'Coffee', 'Bakery', 'Coffee'],
    Quantity=[2, 5, 1, 3, 2, 10, 2, 1],
    Unit_Price=[4.50, 3.00, 3.00, 4.50, 2.50, 4.50, 3.00, 3.00],
)

# Persist the records as a CSV so later cells can load them the way they
# would load a real data source. index=False omits the row labels.
df_raw = pd.DataFrame(data)
df_raw.to_csv(
    'pybucks_sales.csv',
    index=False,
)

print("File 'pybucks_sales.csv' has been created successfully!")

File 'pybucks_sales.csv' has been created successfully!


In [29]:
# Read the sales records back from the CSV file.
df = pd.read_csv('pybucks_sales.csv')

# Q1: revenue per line item = units sold x price per unit, stored as Total_Sale.
df['Total_Sale'] = df['Quantity'].mul(df['Unit_Price'])

print(df.head(5))

         Date      Item Category  Quantity  Unit_Price  Total_Sale
0  2023-03-01     Latte   Coffee         2         4.5         9.0
1  2023-03-01    Muffin   Bakery         5         3.0        15.0
2  2023-03-02  Espresso   Coffee         1         3.0         3.0
3  2023-03-02     Latte   Coffee         3         4.5        13.5
4  2023-03-03       Tea      Tea         2         2.5         5.0


In [30]:
# Revenue per category: bucket rows by Category, then add up Total_Sale in each bucket.
per_category = df.groupby(by='Category')
category_stats = per_category['Total_Sale'].sum()

print(category_stats)
# Expected answer: Coffee has the highest total.

Category
Bakery    21.0
Coffee    73.5
Tea        5.0
Name: Total_Sale, dtype: float64


In [31]:
# Parse Date into datetime values so the month can be read via .dt.
df['Date'] = pd.to_datetime(df['Date'])

# Keep only the rows dated in March (month number 3).
in_march = df['Date'].dt.month.eq(3)
march_data = df[in_march]

# Add up Total_Sale across the March rows.
march_total = march_data['Total_Sale'].sum()

print(f"Total March Sales: ${march_total}")

Total March Sales: $45.5
