In [None]:
import numpy as np
import pandas as pd

# Build the three small tables that the rest of the notebook joins, groups and filters
print("=== SAMPLE DATA CREATION ===")

# Employees: one row per person; manager_id refers to a row in `managers`
employee_rows = [
    ("E001", "Alice", "Engineering", 75000, "M001"),
    ("E002", "Bob", "Marketing", 65000, "M002"),
    ("E003", "Carol", "Engineering", 80000, "M001"),
    ("E004", "David", "Sales", 55000, "M003"),
    ("E005", "Eve", "Marketing", 70000, "M002"),
]
employees = pd.DataFrame(employee_rows, columns=["emp_id", "name", "department", "salary", "manager_id"])

# Managers: one row per manager, with the department they run
manager_rows = [
    ("M001", "John", "Engineering"),
    ("M002", "Sarah", "Marketing"),
    ("M003", "Mike", "Sales"),
]
managers = pd.DataFrame(manager_rows, columns=["manager_id", "manager_name", "department"])

# Departments: includes HR, which has no employees in the table above
department_rows = [
    ("Engineering", 500000, "Building A"),
    ("Marketing", 300000, "Building B"),
    ("Sales", 250000, "Building C"),
    ("HR", 150000, "Building D"),
]
departments = pd.DataFrame(department_rows, columns=["dept_name", "budget", "location"])

# Show each table under its own heading
for heading, table in (
    ("Employees DataFrame:", employees),
    ("\nManagers DataFrame:", managers),
    ("\nDepartments DataFrame:", departments),
):
    print(heading)
    print(table)

## 1. MERGING DataFrames

Merging combines DataFrames based on common columns (like SQL JOINs).


In [None]:
print("=== MERGING OPERATIONS ===")

# 1. INNER JOIN - keep only employees whose manager_id exists in managers
print("1. INNER JOIN (employees + managers):")
inner_merge = employees.merge(managers, how="inner", on="manager_id")
print(inner_merge)

# 2. LEFT JOIN - every employee, manager columns filled where a match exists
print("\n2. LEFT JOIN (employees + managers):")
left_merge = employees.merge(managers, how="left", on="manager_id")
print(left_merge)

# 3. RIGHT JOIN - every manager, employee columns filled where a match exists
print("\n3. RIGHT JOIN (employees + managers):")
right_merge = employees.merge(managers, how="right", on="manager_id")
print(right_merge)

# 4. OUTER JOIN - union of both tables' keys
print("\n4. OUTER JOIN (employees + managers):")
outer_merge = employees.merge(managers, how="outer", on="manager_id")
print(outer_merge)

# 5. Key columns with different names on each side
print("\n5. MERGE on different column names (employees.department = departments.dept_name):")
dept_merge = employees.merge(
    departments,
    how="left",
    left_on="department",
    right_on="dept_name",
)
print(dept_merge)

# 6. Composite key: both manager_id and department must match
print("\n6. MERGE on multiple columns:")
composite_key = ["manager_id", "department"]
multi_merge = employees.merge(managers, how="inner", on=composite_key)
print(multi_merge)

## 2. COMBINING DataFrames

Combining DataFrames by concatenating rows or columns.


In [None]:
print("=== COMBINING OPERATIONS ===")

# Extra rows with the same columns as `employees`, used for the row-wise examples
new_employees = pd.DataFrame(
    {
        "emp_id": ["E006", "E007"],
        "name": ["Frank", "Grace"],
        "department": ["HR", "Engineering"],
        "salary": [60000, 85000],
        "manager_id": ["M004", "M001"],
    }
)

# 1. CONCATENATE ROWS (pd.concat with axis=0)
print("1. CONCATENATE ROWS (add new employees):")
combined_employees = pd.concat([employees, new_employees], axis=0, ignore_index=True)
print(combined_employees)

# 2. CONCATENATE COLUMNS (pd.concat with axis=1)
print("\n2. CONCATENATE COLUMNS:")
# Per-employee extras, listed in the same emp_id order as `employees`
additional_info = pd.DataFrame(
    {
        "emp_id": ["E001", "E002", "E003", "E004", "E005"],
        "hire_date": ["2020-01-15", "2019-03-22", "2021-06-10", "2018-11-05", "2020-09-18"],
        "bonus": [5000, 3000, 6000, 2000, 4000],
    }
)

# axis=1 lines rows up by index label, not by emp_id. The repeated key column is
# dropped so the result has a single, unambiguous `emp_id` column.
combined_cols = pd.concat([employees, additional_info.drop(columns="emp_id")], axis=1)
print(combined_cols)

# 3. APPEND (deprecated - use pd.concat instead)
print("\n3. APPEND (deprecated - use pd.concat instead):")
appended = pd.concat([employees, new_employees], axis=0, ignore_index=True)
print(appended)

# 4. JOIN DataFrames (different from merge)
print("\n4. JOIN DataFrames (index-based):")
employees_indexed = employees.set_index("emp_id")
managers_indexed = managers.set_index("manager_id")
# Both frames carry a `department` column; join() raises ValueError on overlapping
# column names unless a suffix is given, so the manager's copy becomes `department_mgr`.
joined = employees_indexed.join(managers_indexed, on="manager_id", how="left", rsuffix="_mgr")
print(joined)

## 3. GROUPING DataFrames

Grouping allows you to perform operations on subsets of data.


In [None]:
print("=== GROUPING OPERATIONS ===")

# Salary column grouped by department, shared by the first two examples
salary_by_department = employees.groupby("department")["salary"]

# 1. One aggregate per group
print("1. GROUP BY department - average salary:")
dept_avg_salary = salary_by_department.mean()
print(dept_avg_salary)

# 2. Several aggregates of the same column
print("\n2. GROUP BY department - multiple statistics:")
salary_statistics = ["count", "mean", "min", "max", "sum"]
dept_stats = salary_by_department.agg(salary_statistics)
print(dept_stats)

# 3. Different aggregates per column (result has two-level column labels)
print("\n3. GROUP BY department - custom aggregations:")
per_column_aggregations = {
    "salary": ["mean", "std", "count"],
    "name": "count",  # number of employees in the group
}
dept_custom = employees.groupby("department").agg(per_column_aggregations)
print(dept_custom)

# 4. Grouping on two keys, with the head count given a descriptive name
print("\n4. GROUP BY department and manager_id:")
dept_manager = employees.groupby(["department", "manager_id"]).agg(
    salary=("salary", "sum"),
    employee_count=("name", "count"),
)
print(dept_manager)

# 5. transform returns one value per original row, so it can be stored as a column
print("\n5. GROUP BY with transform (add department average to each row):")
department_means = employees.groupby("department")["salary"].transform("mean")
employees["dept_avg_salary"] = department_means
print(employees)

# 6. filter keeps or drops whole groups based on a predicate
print("\n6. GROUP BY with filter (departments with more than 1 employee):")
filtered_depts = employees.groupby("department").filter(lambda members: len(members) > 1)
print(filtered_depts)

# 7. apply runs an arbitrary function on each group's sub-frame
print("\n7. GROUP BY with apply (custom function):")


def top_earner(group):
    """Return the row of ``group`` with the largest ``salary`` (at most one row)."""
    best_paid = group.nlargest(n=1, columns="salary")
    return best_paid


# Call top_earner once per department and combine the per-group results
employees_by_department = employees.groupby("department")
top_earners = employees_by_department.apply(top_earner)
print(top_earners)

## 4. FILTERING DataFrames

Advanced filtering techniques beyond basic boolean indexing.


In [None]:
print("=== FILTERING OPERATIONS ===")

# 1. BASIC FILTERING (boolean indexing)
print("1. Basic filtering - high earners:")
high_earners = employees[employees["salary"] > 70000]
print(high_earners)

# 2. MULTIPLE CONDITIONS - each comparison needs its own parentheses because
# `&` binds more tightly than `==` and `>`
print("\n2. Multiple conditions - Engineering high earners:")
eng_high = employees[(employees["department"] == "Engineering") & (employees["salary"] > 70000)]
print(eng_high)

# 3. ISIN() for multiple values
print("\n3. ISIN() - employees in specific departments:")
target_depts = employees[employees["department"].isin(["Engineering", "Marketing"])]
print(target_depts)

# 4. QUERY() method
print("\n4. QUERY() method - readable filtering:")
query_result = employees.query("salary > 70000 and department in ['Engineering', 'Marketing']")
print(query_result)

# 5. STRING FILTERING
print("\n5. String filtering - names starting with 'A':")
a_names = employees[employees["name"].str.startswith("A")]
print(a_names)

# 6. REGEX FILTERING (case-sensitive: matches lowercase 'a' or 'e' only)
print("\n6. Regex filtering - names containing 'a' or 'e':")
regex_names = employees[employees["name"].str.contains("[ae]", regex=True)]
print(regex_names)

# 7. NULL/NAN FILTERING
print("\n7. Null filtering (if we had nulls):")
# Work on a separate frame so `employees` itself stays free of nulls. The salary
# column is cast to float up front: an integer column cannot hold NaN, and relying
# on pandas to upcast silently during assignment is deprecated.
employees_with_nulls = employees.astype({"salary": "float64"})
employees_with_nulls.loc[1, "salary"] = np.nan
print("Original with nulls:")
print(employees_with_nulls)
print("\nNon-null salaries:")
non_null = employees_with_nulls[employees_with_nulls["salary"].notna()]
print(non_null)

# 8. BETWEEN FILTERING (both bounds are inclusive by default)
print("\n8. BETWEEN filtering - salary range:")
salary_range = employees[employees["salary"].between(60000, 80000)]
print(salary_range)

# 9. NTH LARGEST/SMALLEST
print("\n9. NTH largest - top 2 earners:")
top_2 = employees.nlargest(2, "salary")
print(top_2)

# 10. SAMPLE FILTERING (random_state fixes which rows are drawn)
print("\n10. Random sampling - 3 random employees:")
sample = employees.sample(n=3, random_state=42)
print(sample)

## 5. PIVOTING and RESHAPING DataFrames

Transforming data structure for different analysis needs.


In [None]:
print("=== PIVOTING and RESHAPING OPERATIONS ===")

# Long-format sales records: one row per (date, product)
sales_data = pd.DataFrame(
    {
        "date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02", "2023-01-03", "2023-01-03"],
        "product": ["A", "B", "A", "B", "A", "B"],
        "sales": [100, 150, 120, 180, 110, 160],
        "region": ["North", "South", "North", "South", "North", "South"],
    }
)

print("Original sales data:")
print(sales_data)

# 1. PIVOT TABLE
print("\n1. PIVOT TABLE - products as columns:")
pivot_sales = sales_data.pivot_table(index="date", columns="product", values="sales", aggfunc="sum")
print(pivot_sales)

# 2. PIVOT TABLE with multiple aggregations
print("\n2. PIVOT TABLE with multiple aggregations:")
pivot_multi = sales_data.pivot_table(index="region", columns="product", values="sales", aggfunc=["sum", "mean", "count"])
print(pivot_multi)

# 3. MELT (unpivot)
print("\n3. MELT (unpivot) - convert columns to rows:")
# melt() discards the index by default, and `date` lives in the index of
# pivot_sales. Move it into a column first and keep it as the identifier,
# otherwise the unpivoted rows no longer say which day they belong to.
melted = pivot_sales.reset_index().melt(
    id_vars="date",
    value_vars=["A", "B"],
    var_name="product",
    value_name="sales",
)
print(melted)

# 4. STACK and UNSTACK
print("\n4. STACK and UNSTACK:")
# First create a multi-index DataFrame
multi_index_df = sales_data.set_index(["date", "product"])
print("Multi-index DataFrame:")
print(multi_index_df)

# Stack (move the column labels into the innermost index level)
stacked = multi_index_df.stack()
print("\nStacked:")
print(stacked)

# Unstack (move the innermost index level back out to columns)
unstacked = stacked.unstack()
print("\nUnstacked:")
print(unstacked)

# 5. WIDE to LONG format
print("\n5. WIDE to LONG format:")
wide_data = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Carol"],
        "math_score": [85, 90, 78],
        "english_score": [92, 88, 85],
        "science_score": [78, 95, 82],
    }
)
print("Wide format:")
print(wide_data)

# One output row per (student, subject); id and name are repeated on each row
long_data = pd.melt(
    wide_data,
    id_vars=["id", "name"],
    value_vars=["math_score", "english_score", "science_score"],
    var_name="subject",
    value_name="score",
)
print("\nLong format:")
print(long_data)

## 6. ADVANCED OPERATIONS

More sophisticated DataFrame manipulations.


In [None]:
print("=== ADVANCED OPERATIONS ===")

# 1. WINDOW FUNCTIONS (using rolling, expanding)
print("1. WINDOW FUNCTIONS - rolling averages:")
# Seeded generator so the random series (and the averages below) are the same on every run
rng = np.random.default_rng(42)
dates = pd.date_range("2023-01-01", periods=10, freq="D")
# integers() excludes the upper bound, so values fall in 50..99
ts_data = pd.DataFrame({"date": dates, "value": rng.integers(50, 100, size=10)})
print("Time series data:")
print(ts_data)

# Rolling window: the first (window - 1) rows are NaN because the window is not yet full
ts_data["rolling_3"] = ts_data["value"].rolling(window=3).mean()
ts_data["rolling_5"] = ts_data["value"].rolling(window=5).mean()
print("\nWith rolling averages:")
print(ts_data)

# 2. RANKING - position by salary (1 = highest) and percentile rank
print("\n2. RANKING operations:")
salaries = employees["salary"]
employees["salary_rank"] = salaries.rank(ascending=False)
employees["salary_percentile"] = salaries.rank(pct=True)
print(employees[["name", "salary", "salary_rank", "salary_percentile"]])

# 3. BINNING - cut uses equal-width bins, qcut uses equal-frequency bins
print("\n3. BINNING with cut and qcut:")
bin_labels = ["Low", "Medium", "High"]
quartile_labels = ["Q1", "Q2", "Q3", "Q4"]
employees["salary_bins"] = pd.cut(employees["salary"], bins=3, labels=bin_labels)
employees["salary_quartiles"] = pd.qcut(employees["salary"], q=4, labels=quartile_labels)
print(employees[["name", "salary", "salary_bins", "salary_quartiles"]])

# 4. CROSS TABULATION - counts of department vs. performance, with totals
print("\n4. CROSS TABULATION:")
# One hand-assigned performance label per employee row
employees["performance"] = ["High", "Medium", "High", "Low", "Medium"]
crosstab = pd.crosstab(index=employees["department"], columns=employees["performance"], margins=True)
print(crosstab)

# 5. DUPLICATE HANDLING
print("\n5. DUPLICATE handling:")
# Re-append the first two rows so the frame contains exact duplicates
repeated_rows = employees.iloc[:2]
duplicate_df = pd.concat([employees, repeated_rows], ignore_index=True)
print("DataFrame with duplicates:")
print(duplicate_df)

# Default behaviour keeps the first copy of each duplicated row
no_duplicates = duplicate_df.drop_duplicates()
print("\nAfter removing duplicates:")
print(no_duplicates)

# Choose explicitly which copy survives
first_occurrence = duplicate_df.drop_duplicates(keep="first")
last_occurrence = duplicate_df.drop_duplicates(keep="last")
print(f"\nFirst occurrence: {len(first_occurrence)} rows")
print(f"Last occurrence: {len(last_occurrence)} rows")

# 6. CHAINING OPERATIONS - filter, aggregate, name and sort in one expression
print("\n6. METHOD CHAINING:")
result = (
    employees.query("salary > 60000")
    .groupby("department")
    .agg(salary=("salary", "mean"), employee_count=("name", "count"))
    .sort_values("salary", ascending=False)
)
print("Chained operations result:")
print(result)

# 7. APPLY with custom functions
print("\n7. APPLY with custom functions:")


def categorize_salary(salary):
    """Map a salary to a band label.

    Returns "High" for 80000 and above, "Medium" for 60000 up to (but not
    including) 80000, and "Low" for anything else.
    """
    if salary >= 80000:
        return "High"
    if salary >= 60000:
        return "Medium"
    return "Low"


# Label every salary with its band, one element at a time
employees["salary_category"] = employees["salary"].map(categorize_salary)
print(employees[["name", "salary", "salary_category"]])

# 8. VECTORIZED OPERATIONS - whole-column arithmetic, no Python-level loop
print("\n8. VECTORIZED operations:")
employees["bonus"] = employees["salary"].mul(0.1)  # 10% bonus
employees["total_comp"] = employees["salary"].add(employees["bonus"])
print(employees[["name", "salary", "bonus", "total_comp"]])

## 7. PERFORMANCE TIPS and BEST PRACTICES

Optimization techniques for large DataFrames.


In [None]:
print("=== PERFORMANCE TIPS and BEST PRACTICES ===")

# 1. DATATYPE OPTIMIZATION
print("1. DATATYPE OPTIMIZATION:")
print("Original dtypes:")
print(employees.dtypes)

# Cast the identifier-like string columns to category in one astype call;
# astype returns a new frame, so `employees` keeps its original dtypes
category_columns = ["emp_id", "department", "manager_id"]
employees_optimized = employees.astype({column: "category" for column in category_columns})

print("\nOptimized dtypes:")
print(employees_optimized.dtypes)
original_bytes = employees.memory_usage(deep=True).sum()
optimized_bytes = employees_optimized.memory_usage(deep=True).sum()
print(f"Memory usage - Original: {original_bytes} bytes")
print(f"Memory usage - Optimized: {optimized_bytes} bytes")

# 2. EFFICIENT FILTERING - three spellings of row selection
print("\n2. EFFICIENT FILTERING:")
# Boolean mask built from a vectorized comparison
good_filter = employees.loc[employees["salary"] > 70000]

# Both conditions in a single query expression
better_filter = employees.query("salary > 70000 and department == 'Engineering'")

# Two query steps followed by a sort, written as one chain
best_filter = (
    employees.query("salary > 70000")
    .query("department == 'Engineering'")
    .sort_values("salary", ascending=False)
)

print("Filtered results:")
print(best_filter)

# 3. AVOID ITERROWS() - compute the whole column at once instead
print("\n3. VECTORIZED vs ITERROWS:")
# Row-by-row version, shown for contrast only:
# for index, row in employees.iterrows():
#     employees.loc[index, 'bonus'] = row['salary'] * 0.1

# Column-at-once version
employees["bonus_vectorized"] = employees["salary"].mul(0.1)
print("Vectorized bonus calculation:")
print(employees[["name", "salary", "bonus_vectorized"]])

# 4. USE LOC/ILOC EFFICIENTLY - pick rows and columns in a single .loc call
print("\n4. EFFICIENT INDEXING:")
wanted_columns = ["name", "salary", "department"]
subset = employees.loc[employees["salary"] > 70000, wanted_columns]
print("Using loc:")
print(subset)

# 5. CHAINING vs INTERMEDIATE VARIABLES - same computation, two layouts
print("\n5. METHOD CHAINING vs INTERMEDIATE VARIABLES:")
# Single expression
chained_result = (
    employees.query("salary > 60000")
    .groupby("department")
    .agg({"salary": "mean"})
    .round(2)
)

# Same steps, each bound to its own name
filtered = employees.query("salary > 60000")
grouped = filtered.groupby("department")
intermediate_result = grouped.agg({"salary": "mean"}).round(2)

print("Chained result:")
print(chained_result)
print("\nIntermediate result:")
print(intermediate_result)

# 6. COPY vs VIEW
print("\n6. COPY vs VIEW:")
name_salary_columns = ["name", "salary"]
# Plain column-list selection. NOTE(review): pandas documents list selection as
# returning a new object, so `view` may not be a true view - confirm before relying on it.
view = employees[name_salary_columns]
print(f"View memory usage: {view.memory_usage(deep=True).sum()} bytes")

# Explicit, independent copy of the same two columns
copy = employees[name_salary_columns].copy()
print(f"Copy memory usage: {copy.memory_usage(deep=True).sum()} bytes")

# 7. USEFUL METHODS FOR LARGE DATASETS - cheap ways to inspect a frame
print("\n7. USEFUL METHODS FOR LARGE DATASETS:")
print("DataFrame info:")
print(f"Shape: {employees.shape}")
print(f"Memory usage: {employees.memory_usage(deep=True).sum()} bytes")
print(f"Columns: {list(employees.columns)}")
print(f"Index type: {type(employees.index)}")

# 8. QUERY OPTIMIZATION - checklist printed one tip per line
print("\n8. QUERY OPTIMIZATION TIPS:")
for tip in (
    "Use .query() for complex conditions",
    "Use .isin() instead of multiple OR conditions",
    "Use .between() for range queries",
    "Use categorical dtypes for repeated string values",
    "Use .loc[] and .iloc[] for specific row/column selection",
    "Avoid .apply() when vectorized operations are available",
    "Use method chaining to reduce intermediate variables",
    "Consider using .eval() for complex expressions on large DataFrames",
):
    print(f"✓ {tip}")

In [None]:
# Column-oriented input: each key is a column name, each list holds that column's values
person_data: dict[str, list] = {
    "name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "George", "Helen", "Ivy", "Jack"],
    "age": [25, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    "city": [
        "New York",
        "Los Angeles",
        "Chicago",
        "Houston",
        "Phoenix",
        "Philadelphia",
        "San Antonio",
        "San Diego",
        "Dallas",
        "San Jose",
    ],
}

df_person = pd.DataFrame(person_data)

# A bare expression is only rendered when it is the last statement of a cell, so
# these inspections are printed explicitly; otherwise their results are discarded.
print(df_person.head())

print(df_person.tail())

# info() writes its report to stdout itself and returns None, so it is not wrapped in print()
df_person.info()

print(df_person.describe())

print(df_person)

print(df_person.dtypes)

print(df_person.columns)

print(df_person.shape)

In [None]:
# 1.1.1 Creating DataFrame from LIST of dictionaries
print("🔹 Creating DataFrame from LIST of dictionaries:")
print("-" * 50)

# Row-oriented input: one dictionary per record, keys become the column names.
# The values are plain strings and integers (not lists), hence dict[str, object].
person_data_v2: list[dict[str, object]] = [
    {"name": "Alice", "age": 25, "city": "New York"},
    {"name": "Bob", "age": 30, "city": "Los Angeles"},
    {"name": "Charlie", "age": 35, "city": "Chicago"},
    {"name": "David", "age": 40, "city": "Houston"},
    {"name": "Eve", "age": 45, "city": "Phoenix"},
    {"name": "Frank", "age": 50, "city": "Philadelphia"},
    {"name": "George", "age": 55, "city": "San Antonio"},
    {"name": "Helen", "age": 60, "city": "San Diego"},
    {"name": "Ivy", "age": 65, "city": "Dallas"},
    {"name": "Jack", "age": 70, "city": "San Jose"},
]

df_person_v2 = pd.DataFrame(person_data_v2)

print(df_person_v2)

print(df_person_v2.shape)

print(df_person_v2.dtypes)

print(df_person_v2.columns)

print(df_person_v2.head())

print(df_person_v2.tail())

# info() prints its own report and returns None; wrapping it in print() would
# add a stray "None" line after the report
df_person_v2.info()

print(df_person_v2.describe())

In [None]:
# Customer table in column-oriented form; Frank Miller's email is deliberately missing
customer_data = {
    "customer_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
    "name": [
        "Alice Smith", "Bob Johnson", "Charlie Brown", "Diana Prince",
        "Eve Wilson", "Frank Miller", "Grace Lee", "Henry Davis",
    ],
    "age": [28, 35, 42, 29, 38, 45, 31, 52],
    "email": [
        "alice@email.com", "bob@email.com", "charlie@email.com", "diana@email.com",
        "eve@email.com", None, "grace@email.com", "henry@email.com",
    ],
    "total_purchases": [15, 8, 23, 5, 31, 12, 19, 7],
    "total_spent": [1250.50, 890.25, 3420.80, 567.00, 4890.15, 1678.90, 2345.60, 823.40],
    "member_since": [
        "2020-03-15", "2019-08-22", "2018-11-30", "2021-05-10",
        "2017-02-18", "2020-09-05", "2019-12-20", "2021-01-08",
    ],
}

df_customers = pd.DataFrame(customer_data)

# Element-wise comparison: a boolean Series with one True/False per customer
result = df_customers["age"].gt(30)

print(result)

In [None]:
from IPython.display import HTML, display

# Hand-written boolean mask: one flag per row, True keeps the row
manual_mask = [False, True, True, False, True, True, True, True]
selected_customers = df_customers.loc[manual_mask]

# Computed mask, reused for the row-only and row-plus-column selections
older_than_30 = df_customers["age"] > 30
customers_over_30 = df_customers.loc[older_than_30]
customers_name_age_over_30 = df_customers.loc[older_than_30, ["name", "age"]]

# Render the narrow selection as an HTML table in the notebook output
name_age_html = customers_name_age_over_30.to_html()
display(HTML(name_age_html))

print(selected_customers)

print(customers_over_30)

print(customers_name_age_over_30)

# Conditional assignment through .loc: overwrites email for every row matching the mask
df_customers.loc[older_than_30, "email"] = "Unknown"

print(df_customers)

In [None]:
# Dictionary of lists: each key is a column, each list is that column's values
data_v1 = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Henry"],
    "age": [28, 35, 42, 29, 38, 45, 31, 52],
    "city": [
        "New York", "Los Angeles", "Chicago", "Houston",
        "Phoenix", "Philadelphia", "San Antonio", "San Diego",
    ],
    "email": [
        "alice@email.com", "bob@email.com", "charlie@email.com", "diana@email.com",
        "eve@email.com", "frank@email.com", "grace@email.com", "henry@email.com",
    ],
}

df_customers_v1 = pd.DataFrame(data_v1)

# Keep only the rows whose age is strictly greater than 30
is_over_30_v1 = df_customers_v1["age"] > 30
df_customers_v1_over_30 = df_customers_v1.loc[is_over_30_v1]

print(df_customers_v1)

print()

print(df_customers_v1_over_30)

In [None]:
# The banner previously announced a "DICTIONARY of lists", but data_v2 below is a
# list of dictionaries (one dict per row); the label now matches the data.
print("🔹 Creating DataFrame from LIST of dictionaries:")
print("-" * 50)

# List of dictionaries - one record per dict, keys become the column names
data_v2 = [
    {"name": "Alice", "age": 25, "city": "New York"},
    {"name": "Bob", "age": 30, "city": "Los Angeles"},
    {"name": "Charlie", "age": 35, "city": "Chicago"},
    {"name": "Diana", "age": 40, "city": "Houston"},
    {"name": "Eve", "age": 45, "city": "Phoenix"},
    {"name": "Frank", "age": 50, "city": "Philadelphia"},
]

df_customers_v2 = pd.DataFrame(data_v2)

print(df_customers_v2)

# Strictly greater than 30, so Bob (exactly 30) is excluded
df_customers_v2_over_30 = df_customers_v2[df_customers_v2.age > 30]

print()

print(df_customers_v2_over_30)

In [None]:
# List of lists: each inner list is one row, values in the order given by columns_v3
data_v3 = [
    ["Alice", 25, "New York", "alice@email.com", 15, 1250.50],
    ["Bob", 30, "Los Angeles", "bob@email.com", 8, 890.25],
    ["Charlie", 35, "Chicago", "charlie@email.com", 23, 3420.80],
    ["Diana", 40, "Houston", "diana@email.com", 5, 567.00],
]

# Rows carry no labels of their own, so the column names are supplied separately
columns_v3 = ["name", "age", "city", "email", "total_purchase", "total_spent"]
df_customers_v3 = pd.DataFrame(data_v3, columns=columns_v3)

print(df_customers_v3)

# Rows with age strictly greater than 30
is_over_30_v3 = df_customers_v3["age"] > 30
df_customers_v3_over_30 = df_customers_v3.loc[is_over_30_v3]

print()

print(df_customers_v3_over_30)

In [None]:
print("🔹 Creating DataFrame from TUPLE of tuples:")
print("-" * 50)

# Same records as rows of an immutable tuple-of-tuples
data_v4 = (
    ("Alice", 25, "New York", "alice@email.com", 15, 1250.50),
    ("Bob", 30, "Los Angeles", "bob@email.com", 8, 890.25),
    ("Charlie", 35, "Chicago", "charlie@email.com", 23, 3420.80),
    ("Diana", 40, "Houston", "diana@email.com", 5, 567.00),
)

# Column names are given separately because the tuples are positional
columns_v4 = ["name", "age", "city", "email", "total_purchase", "total_spent"]
df_customers_v4 = pd.DataFrame(data_v4, columns=columns_v4)

print(df_customers_v4)

# Rows with age strictly greater than 30
is_over_30_v4 = df_customers_v4["age"] > 30
df_customers_v4_over_30 = df_customers_v4.loc[is_over_30_v4]

print()

print(df_customers_v4_over_30)

In [None]:
import random

print("🔹 Creating DataFrame using SET operations:")
print("-" * 50)

# The department names start as a set. Sets of strings have no stable iteration
# order between interpreter sessions, so the names are sorted once and that
# ordering drives everything below.
departments = {"Engineering", "Marketing", "Sales", "HR", "Finance"}
dept_names = sorted(departments)

# Fixed seed so the generated head counts and budgets are the same on every run
random.seed(42)

# Dict comprehensions keyed by department name
employees_per_dept = {dept: random.randint(5, 20) for dept in dept_names}
budget_per_dept = {dept: random.randint(10000, 50000) for dept in dept_names}

# One record per department; randint(5, 20) is never 0, so the division is safe
dept_data = [
    {
        "department": dept,
        "employee_count": employees_per_dept[dept],
        "budget": budget_per_dept[dept],
        "budget_per_employee": budget_per_dept[dept] / employees_per_dept[dept],
    }
    for dept in dept_names
]

df_from_set = pd.DataFrame(dept_data)

print(departments)

print("-" * 50)

print(df_from_set)

In [None]:
print("🔹 Adding and Modifying Columns:")
print("-" * 50)


# Tuple of tuples - one (first_name, last_name, age, job_title, salary) record per person
data_v5 = (
    ("John", "Doe", 32, "Engineer", 85000),
    ("Jane", "Smith", 28, "Designer", 72000),
    ("Mike", "Johnson", 35, "Manager", 95000),
    ("Sarah", "Wilson", 29, "Analyst", 68000),
    ("Tom", "Brown", 31, "Developer", 78000),
)

df_person_v3 = pd.DataFrame(data_v5, columns=["first_name", "last_name", "age", "job_title", "salary"])

# Add new columns
df_person_v3["full_name"] = df_person_v3["first_name"] + " " + df_person_v3["last_name"]
# Salaries here are in the tens of thousands, so the band limits are 80,000 and
# 70,000. The previous limits (8000 / 7000) were a factor of ten too small and
# labelled every row "High".
df_person_v3["salary_category"] = df_person_v3["salary"].apply(
    lambda x: "High" if x > 80_000 else "Medium" if x > 70_000 else "Low"
)
df_person_v3["years_until_retirement"] = 65 - df_person_v3["age"]

print(df_person_v3)

# The raise is applied after the categories were computed, so salary_category
# reflects the pre-raise salary
df_person_v3["salary"] = df_person_v3["salary"] * 1.1  # 10% raise

print("-" * 50)

print(df_person_v3[["full_name", "salary", "salary_category"]])

In [None]:
# 2.1.1 Advanced filtering with multiple conditions
print("🔹 Advanced Filtering Operations:")
print("-" * 50)


# Every value below is drawn with the standard-library `random` module, so that
# is the generator that must be seeded for the dataset to be reproducible.
# Seeding only NumPy (as before) left the generated employees different on each run.
random.seed(42)
# NumPy's global generator is still seeded as before, for any NumPy-based sampling
np.random.seed(42)
n_employees = 20

# One first name per generated employee (n_employees must not exceed this list's length)
first_names = [
    "Alice", "Bob", "Charlie", "Diana", "Eve",
    "Frank", "Grace", "Henry", "Ivy", "Jack",
    "Kate", "Liam", "Maya", "Noah", "Olivia",
    "Paul", "Quinn", "Ruby", "Sam", "Tina",
]

departments = ["Engineering", "Marketing", "Sales", "HR", "Finance"]
cities = ["New York", "London", "Tokyo", "Paris", "Berlin", "Sydney"]

# Build all records first and create the DataFrame once. The dictionary values are
# evaluated top to bottom, so the order of random draws per employee is fixed.
employee_data = [
    {
        "employee_id": f"EMP{i + 1:03d}",
        "name": first_names[i],
        "age": random.randint(22, 65),
        "department": random.choice(departments),
        "city": random.choice(cities),
        "salary": random.randint(40000, 120000),
        "years_experience": random.randint(0, 20),
        "performance_score": round(random.uniform(1.0, 5.0), 1),
        "is_manager": random.choice([True, False]),
    }
    for i in range(n_employees)
]

df_employees = pd.DataFrame(employee_data)

print("Employee Dataset:")
print(df_employees.head(10))
print(f"Total employees: {len(df_employees)}")

In [None]:
# 2.1.2 Complex filtering with multiple conditions
# The same two selections (Engineering only / Engineering or Marketing, each with
# performance_score >= 4.0) written with several equivalent pandas idioms.
print("🔹 Complex Filtering Examples:")
print("-" * 50)

# Filter 1: High performers in Engineering
# v1: boolean indexing; each comparison is parenthesized because `&` binds
# more tightly than `==` and `>=`
high_performers_eng_v1 = df_employees[(df_employees["department"] == "Engineering") & (df_employees["performance_score"] >= 4.0)]
print("High performers in Engineering:")
print(high_performers_eng_v1[["name", "department", "performance_score", "salary"]])
print()

# v2: isin() for membership in several departments
high_performers_eng_v2 = df_employees[
    df_employees["department"].isin(["Engineering", "Marketing"]) & (df_employees["performance_score"] >= 4.0)
]
print("High performers in Engineering and Marketing:")
print(high_performers_eng_v2[["name", "department", "performance_score", "salary"]])
print()

# v3: the same condition as a query() expression
high_performers_eng_v3 = df_employees.query("department in ['Engineering', 'Marketing'] and performance_score >= 4.0")
print("High performers in Engineering and Marketing:")
print(high_performers_eng_v3[["name", "department", "performance_score", "salary"]])
print()

# v4: boolean mask passed to .loc
high_performers_eng_v4 = df_employees.loc[
    (df_employees["department"] == "Engineering") & (df_employees["performance_score"] >= 4.0)
]
print("High performers in Engineering:")
print(high_performers_eng_v4[["name", "department", "performance_score", "salary"]])
print()

# v4.1: .loc with isin(). The score comparison must be parenthesized: without the
# parentheses the expression is evaluated as `(isin_mask & performance_score) >= 4.0`,
# which applies `&` to a float column instead of combining two boolean masks.
hight_performers_eng_v4_1 = df_employees.loc[
    df_employees["department"].isin(["Engineering", "Marketing"]) & (df_employees["performance_score"] >= 4.0)
]
print("High performers in Engineering and Marketing:")
print(hight_performers_eng_v4_1[["name", "department", "performance_score", "salary"]])
print()

# v5: row-wise apply with a Python predicate (runs one Python call per row)
high_performers_eng_v5 = df_employees[
    df_employees.apply(lambda row: row["department"] in ["Engineering", "Marketing"] and row["performance_score"] >= 4.0, axis=1)
]
print("High performers in Engineering and Marketing:")
print(high_performers_eng_v5[["name", "department", "performance_score", "salary"]])
print()

# v6 / v7: one condition per query() call, chained
high_performers_eng_v6 = df_employees.query("department == 'Engineering'").query("performance_score >= 4.0")
high_performers_eng_v7 = df_employees.query("department in ['Engineering', 'Marketing']").query("performance_score >= 4.0")

print("-" * 50)
print(high_performers_eng_v6)
print("-" * 50)
print(high_performers_eng_v7)


# Filter 2: