In [1]:
# Pandas Coding Exercises: From Basic to Advanced
# Using Various Python Data Types for Complete Understanding

import random
from time import time

import numpy as np
import pandas as pd

print("🚀 Welcome to Comprehensive Pandas Exercises!")
print("=" * 60)

🚀 Welcome to Comprehensive Pandas Exercises!


# 📚 LEVEL 1: BASIC PANDAS OPERATIONS

## Exercise 1.1: Creating DataFrames from Different Data Types

Let's start with the fundamentals - creating DataFrames from various Python data structures.


In [2]:
# 1.1.1 Creating DataFrame from LIST of dictionaries
print("🔹 Creating DataFrame from LIST of dictionaries:")
print("-" * 50)

# List of dictionaries - most common way
data_list = [
    {"name": "Alice", "age": 25, "city": "New York", "salary": 75000},
    {"name": "Bob", "age": 30, "city": "London", "salary": 82000},
    {"name": "Charlie", "age": 35, "city": "Tokyo", "salary": 68000},
    {"name": "Diana", "age": 28, "city": "Paris", "salary": 71000},
]

df_from_list = pd.DataFrame(data_list)
print("DataFrame from list of dicts:")
print(df_from_list)
print(f"Shape: {df_from_list.shape}")
print(f"Data types:\n{df_from_list.dtypes}")
print()

🔹 Creating DataFrame from LIST of dictionaries:
--------------------------------------------------
DataFrame from list of dicts:
      name  age      city  salary
0    Alice   25  New York   75000
1      Bob   30    London   82000
2  Charlie   35     Tokyo   68000
3    Diana   28     Paris   71000
Shape: (4, 4)
Data types:
name      object
age        int64
city      object
salary     int64
dtype: object



In [3]:
# 1.1.2 Creating DataFrame from DICTIONARY of lists
print("🔹 Creating DataFrame from DICTIONARY of lists:")
print("-" * 50)

# Dictionary of lists - efficient for large datasets
data_dict = {
    "product": ["Laptop", "Mouse", "Keyboard", "Monitor", "Headphones"],
    "price": [999.99, 29.99, 79.99, 299.99, 149.99],
    "category": ["Electronics", "Accessories", "Accessories", "Electronics", "Audio"],
    "in_stock": [True, True, False, True, True],
    "rating": [4.5, 4.2, 4.8, 4.3, 4.6],
}

df_from_dict = pd.DataFrame(data_dict)
print("DataFrame from dict of lists:")
print(df_from_dict)
print(f"Shape: {df_from_dict.shape}")
print(f"Data types:\n{df_from_dict.dtypes}")
print()

🔹 Creating DataFrame from DICTIONARY of lists:
--------------------------------------------------
DataFrame from dict of lists:
      product   price     category  in_stock  rating
0      Laptop  999.99  Electronics      True     4.5
1       Mouse   29.99  Accessories      True     4.2
2    Keyboard   79.99  Accessories     False     4.8
3     Monitor  299.99  Electronics      True     4.3
4  Headphones  149.99        Audio      True     4.6
Shape: (5, 5)
Data types:
product      object
price       float64
category     object
in_stock       bool
rating      float64
dtype: object



In [4]:
# 1.1.3 Creating DataFrame from TUPLE of tuples (2D data)
print("🔹 Creating DataFrame from TUPLE of tuples:")
print("-" * 50)

# Tuple of tuples - immutable data structure
data_tuple = (
    ("John", "Doe", 32, "Engineer", 85000),
    ("Jane", "Smith", 28, "Designer", 72000),
    ("Mike", "Johnson", 35, "Manager", 95000),
    ("Sarah", "Wilson", 29, "Analyst", 68000),
    ("Tom", "Brown", 31, "Developer", 78000),
)

columns = ["first_name", "last_name", "age", "job_title", "salary"]
df_from_tuple = pd.DataFrame(data_tuple, columns=columns)
print("DataFrame from tuple of tuples:")
print(df_from_tuple)
print(f"Shape: {df_from_tuple.shape}")
print()

🔹 Creating DataFrame from TUPLE of tuples:
--------------------------------------------------
DataFrame from tuple of tuples:
  first_name last_name  age  job_title  salary
0       John       Doe   32   Engineer   85000
1       Jane     Smith   28   Designer   72000
2       Mike   Johnson   35    Manager   95000
3      Sarah    Wilson   29    Analyst   68000
4        Tom     Brown   31  Developer   78000
Shape: (5, 5)



In [5]:
# 1.1.4 Creating DataFrame from SET operations and comprehensions
print("🔹 Creating DataFrame using SET operations:")
print("-" * 50)

# Using set comprehensions and operations
departments = {"Engineering", "Marketing", "Sales", "HR", "Finance"}
employees_per_dept = {dept: random.randint(5, 20) for dept in departments}
budget_per_dept = {dept: random.randint(100000, 500000) for dept in departments}

# Convert sets to DataFrames
dept_data = []
for dept in departments:
    dept_data.append(
        {
            "department": dept,
            "employee_count": employees_per_dept[dept],
            "budget": budget_per_dept[dept],
            "budget_per_employee": budget_per_dept[dept] / employees_per_dept[dept],
        }
    )

df_from_set = pd.DataFrame(dept_data)
print("DataFrame from set operations:")
print(df_from_set)
print(f"Shape: {df_from_set.shape}")
print()

🔹 Creating DataFrame using SET operations:
--------------------------------------------------
DataFrame from set operations:
    department  employee_count  budget  budget_per_employee
0           HR              15  272475         18165.000000
1  Engineering               8  281207         35150.875000
2    Marketing              11  294439         26767.181818
3        Sales               8  490094         61261.750000
4      Finance               7  494990         70712.857143
Shape: (5, 4)



## Exercise 1.2: Basic DataFrame Operations

Now let's practice fundamental DataFrame operations using our created data.


In [6]:
# 1.2.1 Basic DataFrame inspection and selection
print("🔹 Basic DataFrame Operations:")
print("-" * 50)

# Using our employee data
df = df_from_tuple.copy()
print("Original DataFrame:")
print(df)
print()

# Basic inspection methods
print("📊 DataFrame Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Index: {list(df.index)}")
print(f"Data types:\n{df.dtypes}")
print()

# Selection operations
print("📋 Selection Operations:")
print("First 3 rows:")
print(df.head(3))
print()

print("Last 2 rows:")
print(df.tail(2))
print()

print("Specific columns:")
print(df[["first_name", "salary"]])
print()

print("Specific rows and columns:")
print(df.loc[1:3, ["first_name", "job_title", "salary"]])
print()

🔹 Basic DataFrame Operations:
--------------------------------------------------
Original DataFrame:
  first_name last_name  age  job_title  salary
0       John       Doe   32   Engineer   85000
1       Jane     Smith   28   Designer   72000
2       Mike   Johnson   35    Manager   95000
3      Sarah    Wilson   29    Analyst   68000
4        Tom     Brown   31  Developer   78000

📊 DataFrame Info:
Shape: (5, 5)
Columns: ['first_name', 'last_name', 'age', 'job_title', 'salary']
Index: [0, 1, 2, 3, 4]
Data types:
first_name    object
last_name     object
age            int64
job_title     object
salary         int64
dtype: object

📋 Selection Operations:
First 3 rows:
  first_name last_name  age job_title  salary
0       John       Doe   32  Engineer   85000
1       Jane     Smith   28  Designer   72000
2       Mike   Johnson   35   Manager   95000

Last 2 rows:
  first_name last_name  age  job_title  salary
3      Sarah    Wilson   29    Analyst   68000
4        Tom     Brown   31  Dev

In [7]:
# 1.2.2 Adding and modifying columns
print("🔹 Adding and Modifying Columns:")
print("-" * 50)

# Add new columns
df["full_name"] = df["first_name"] + " " + df["last_name"]
df["salary_category"] = df["salary"].apply(lambda x: "High" if x > 80000 else "Medium" if x > 70000 else "Low")
df["years_until_retirement"] = 65 - df["age"]

print("DataFrame with new columns:")
print(df)
print()

# Modify existing columns
df["salary"] = df["salary"] * 1.1  # 10% raise
print("After 10% salary raise:")
print(df[["full_name", "salary", "salary_category"]])
print()

🔹 Adding and Modifying Columns:
--------------------------------------------------
DataFrame with new columns:
  first_name last_name  age  job_title  salary     full_name salary_category  \
0       John       Doe   32   Engineer   85000      John Doe            High   
1       Jane     Smith   28   Designer   72000    Jane Smith          Medium   
2       Mike   Johnson   35    Manager   95000  Mike Johnson            High   
3      Sarah    Wilson   29    Analyst   68000  Sarah Wilson             Low   
4        Tom     Brown   31  Developer   78000     Tom Brown          Medium   

   years_until_retirement  
0                      33  
1                      37  
2                      30  
3                      36  
4                      34  

After 10% salary raise:
      full_name    salary salary_category
0      John Doe   93500.0            High
1    Jane Smith   79200.0          Medium
2  Mike Johnson  104500.0            High
3  Sarah Wilson   74800.0             Low
4    

# 🎯 LEVEL 2: INTERMEDIATE PANDAS OPERATIONS

## Exercise 2.1: Data Filtering and Conditional Operations

Let's dive deeper into data manipulation using various filtering techniques.


In [8]:
# 2.1.1 Advanced filtering with multiple conditions
print("🔹 Advanced Filtering Operations:")
print("-" * 50)

# Create a larger dataset for better filtering examples
np.random.seed(42)
n_employees = 20

# Generate employee data using different data structures
first_names = [
    "Alice",
    "Bob",
    "Charlie",
    "Diana",
    "Eve",
    "Frank",
    "Grace",
    "Henry",
    "Ivy",
    "Jack",
    "Kate",
    "Liam",
    "Maya",
    "Noah",
    "Olivia",
    "Paul",
    "Quinn",
    "Ruby",
    "Sam",
    "Tina",
]

departments = ["Engineering", "Marketing", "Sales", "HR", "Finance"]
cities = ["New York", "London", "Tokyo", "Paris", "Berlin", "Sydney"]

# Create comprehensive employee data
employee_data = []
for i in range(n_employees):
    employee_data.append(
        {
            "employee_id": f"EMP{i + 1:03d}",
            "name": first_names[i],
            "age": random.randint(22, 65),
            "department": random.choice(departments),
            "city": random.choice(cities),
            "salary": random.randint(40000, 120000),
            "years_experience": random.randint(0, 20),
            "performance_score": round(random.uniform(1.0, 5.0), 1),
            "is_manager": random.choice([True, False]),
        }
    )

df_employees = pd.DataFrame(employee_data)
print("Employee Dataset:")
print(df_employees.head(10))
print(f"Total employees: {len(df_employees)}")
print()

🔹 Advanced Filtering Operations:
--------------------------------------------------
Employee Dataset:
  employee_id     name  age   department      city  salary  years_experience  \
0      EMP001    Alice   44        Sales     Tokyo   96901                 1   
1      EMP002      Bob   24           HR    Sydney  111275                 8   
2      EMP003  Charlie   61      Finance     Paris  102492                16   
3      EMP004    Diana   60        Sales    Berlin   86002                 3   
4      EMP005      Eve   46  Engineering     Tokyo   56586                11   
5      EMP006    Frank   42    Marketing    Berlin  113046                 5   
6      EMP007    Grace   30      Finance     Paris   79537                19   
7      EMP008    Henry   57           HR  New York  119664                16   
8      EMP009      Ivy   47    Marketing     Tokyo   67366                16   
9      EMP010     Jack   37  Engineering    London   50843                 1   

   performance_sc

In [9]:
# 2.1.2 Complex filtering with multiple conditions
print("🔹 Complex Filtering Examples:")
print("-" * 50)

# Filter 1: High performers in Engineering
high_performers_eng = df_employees[(df_employees["department"] == "Engineering") & (df_employees["performance_score"] >= 4.0)]
print("High performers in Engineering:")
print(high_performers_eng[["name", "department", "performance_score", "salary"]])
print()

# Filter 2: Senior employees (age > 40 OR experience > 10 years)
senior_employees = df_employees[(df_employees["age"] > 40) | (df_employees["years_experience"] > 10)]
print("Senior employees (age > 40 OR experience > 10 years):")
print(senior_employees[["name", "age", "years_experience", "department"]])
print()

# Filter 3: Using isin() with lists
target_cities = ["New York", "London", "Tokyo"]
employees_in_major_cities = df_employees[df_employees["city"].isin(target_cities)]
print("Employees in major cities:")
print(employees_in_major_cities[["name", "city", "department"]].head())
print()

# Filter 4: Using query() method for complex conditions
high_earners = df_employees.query("salary > 80000 and performance_score >= 3.5")
print("High earners with good performance:")
print(high_earners[["name", "salary", "performance_score", "department"]])
print()

🔹 Complex Filtering Examples:
--------------------------------------------------
High performers in Engineering:
Empty DataFrame
Columns: [name, department, performance_score, salary]
Index: []

Senior employees (age > 40 OR experience > 10 years):
       name  age  years_experience   department
0     Alice   44                 1        Sales
2   Charlie   61                16      Finance
3     Diana   60                 3        Sales
4       Eve   46                11  Engineering
5     Frank   42                 5    Marketing
6     Grace   30                19      Finance
7     Henry   57                16           HR
8       Ivy   47                16    Marketing
10     Kate   62                18      Finance
11     Liam   37                20        Sales
12     Maya   56                 5        Sales
13     Noah   22                12    Marketing
14   Olivia   51                18        Sales
15     Paul   61                16           HR
16    Quinn   23               

## Exercise 2.2: Grouping and Aggregation Operations

Master the art of grouping data and performing aggregations.


In [10]:
# 2.2.1 Basic grouping operations
print("🔹 Grouping and Aggregation Operations:")
print("-" * 50)

# Group by department and calculate statistics
dept_stats = (
    df_employees.groupby("department")
    .agg(
        {
            "salary": ["mean", "median", "std", "min", "max"],
            "age": ["mean", "min", "max"],
            "performance_score": ["mean", "count"],
            "is_manager": "sum",
        }
    )
    .round(2)
)

print("Department Statistics:")
print(dept_stats)
print()

# Flatten column names for better readability
dept_stats.columns = ["_".join(col).strip() for col in dept_stats.columns]
print("Flattened column names:")
print(dept_stats)
print()

🔹 Grouping and Aggregation Operations:
--------------------------------------------------
Department Statistics:
                salary                                       age          \
                  mean    median       std    min     max   mean min max   
department                                                                 
Engineering   53714.50   53714.5   4060.91  50843   56586  41.50  37  46   
Finance       81798.00   79537.0  15430.69  61865  102492  45.80  23  62   
HR           103821.67  111275.0  20606.06  80526  119664  47.33  24  61   
Marketing     75990.25   75044.5  30162.67  40826  113046  37.25  22  47   
Sales         75662.83   78660.0  26883.50  41482  109504  48.83  37  60   

            performance_score       is_manager  
                         mean count        sum  
department                                      
Engineering              2.45     2          0  
Finance                  3.60     5          4  
HR                       2.70    

In [11]:
# 2.2.2 Multi-level grouping and custom aggregations
print("🔹 Multi-level Grouping:")
print("-" * 50)

# Group by multiple columns
city_dept_stats = (
    df_employees.groupby(["city", "department"])
    .agg({"salary": "mean", "performance_score": "mean", "employee_id": "count"})
    .round(2)
)

city_dept_stats.columns = ["avg_salary", "avg_performance", "employee_count"]
print("Statistics by City and Department:")
print(city_dept_stats)
print()

# Custom aggregation functions


def salary_range(series):
    return series.max() - series.min()


def performance_grade(series):
    avg_score = series.mean()
    if avg_score >= 4.5:
        return "Excellent"
    elif avg_score >= 3.5:
        return "Good"
    elif avg_score >= 2.5:
        return "Average"
    else:
        return "Needs Improvement"


custom_stats = (
    df_employees.groupby("department")
    .agg(
        {
            "salary": [salary_range, "mean"],
            "performance_score": [performance_grade, "mean"],
        }
    )
    .round(2)
)

print("Custom Aggregations:")
print(custom_stats)
print()

🔹 Multi-level Grouping:
--------------------------------------------------
Statistics by City and Department:
                      avg_salary  avg_performance  employee_count
city     department                                              
Berlin   Marketing       76936.0             2.75               2
         Sales           78660.0             2.20               2
London   Engineering     50843.0             1.40               1
         Marketing       82723.0             2.20               1
         Sales          109504.0             4.50               1
New York Finance         68323.5             3.10               2
         HR             100095.0             2.20               2
         Sales           41482.0             2.90               1
Paris    Finance         91014.5             3.90               2
Sydney   HR             111275.0             3.70               1
Tokyo    Engineering     56586.0             3.50               1
         Finance         90314.0

## Exercise 2.3: Data Transformation with Different Data Types

Let's explore how to work with various Python data types within pandas operations.


In [12]:
# 2.3.1 Working with Lists in DataFrame columns
print("🔹 Working with Lists in DataFrame:")
print("-" * 50)

# Create a DataFrame with list columns
skills_data = {
    "employee_id": ["EMP001", "EMP002", "EMP003", "EMP004", "EMP005"],
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "programming_languages": [
        ["Python", "Java", "SQL"],
        ["JavaScript", "Python", "React"],
        ["Python", "R", "SQL", "Machine Learning"],
        ["Java", "Spring", "MySQL"],
        ["Python", "Django", "PostgreSQL", "Docker"],
    ],
    "certifications": [
        ["AWS", "Google Cloud"],
        ["React", "Node.js"],
        ["Data Science", "Machine Learning"],
        ["Java SE", "Spring Boot"],
        ["Python", "Docker", "Kubernetes"],
    ],
}

df_skills = pd.DataFrame(skills_data)
print("Skills DataFrame:")
print(df_skills)
print()

# Count skills per employee
df_skills["num_languages"] = df_skills["programming_languages"].apply(len)
df_skills["num_certifications"] = df_skills["certifications"].apply(len)
print("With skill counts:")
print(df_skills[["name", "num_languages", "num_certifications"]])
print()

# Check if employee has specific skills
df_skills["knows_python"] = df_skills["programming_languages"].apply(lambda x: "Python" in x)
df_skills["knows_java"] = df_skills["programming_languages"].apply(lambda x: "Java" in x)
print("Skill presence check:")
print(df_skills[["name", "knows_python", "knows_java"]])
print()

🔹 Working with Lists in DataFrame:
--------------------------------------------------
Skills DataFrame:
  employee_id     name                 programming_languages  \
0      EMP001    Alice                   [Python, Java, SQL]   
1      EMP002      Bob           [JavaScript, Python, React]   
2      EMP003  Charlie    [Python, R, SQL, Machine Learning]   
3      EMP004    Diana                 [Java, Spring, MySQL]   
4      EMP005      Eve  [Python, Django, PostgreSQL, Docker]   

                     certifications  
0               [AWS, Google Cloud]  
1                  [React, Node.js]  
2  [Data Science, Machine Learning]  
3            [Java SE, Spring Boot]  
4      [Python, Docker, Kubernetes]  

With skill counts:
      name  num_languages  num_certifications
0    Alice              3                   2
1      Bob              3                   2
2  Charlie              4                   2
3    Diana              3                   2
4      Eve              4        

In [13]:
# 2.3.2 Working with Dictionaries in DataFrame columns
print("🔹 Working with Dictionaries in DataFrame:")
print("-" * 50)

# Create DataFrame with dictionary columns
employee_details = {
    "employee_id": ["EMP001", "EMP002", "EMP003", "EMP004", "EMP005"],
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "contact_info": [
        {
            "email": "alice@company.com",
            "phone": "+1-555-0101",
            "address": "123 Main St",
        },
        {"email": "bob@company.com", "phone": "+1-555-0102", "address": "456 Oak Ave"},
        {
            "email": "charlie@company.com",
            "phone": "+1-555-0103",
            "address": "789 Pine Rd",
        },
        {"email": "diana@company.com", "phone": "+1-555-0104", "address": "321 Elm St"},
        {"email": "eve@company.com", "phone": "+1-555-0105", "address": "654 Maple Dr"},
    ],
    "performance_metrics": [
        {
            "projects_completed": 12,
            "client_satisfaction": 4.8,
            "team_collaboration": 4.5,
        },
        {
            "projects_completed": 8,
            "client_satisfaction": 4.2,
            "team_collaboration": 4.7,
        },
        {
            "projects_completed": 15,
            "client_satisfaction": 4.9,
            "team_collaboration": 4.3,
        },
        {
            "projects_completed": 10,
            "client_satisfaction": 4.6,
            "team_collaboration": 4.8,
        },
        {
            "projects_completed": 13,
            "client_satisfaction": 4.7,
            "team_collaboration": 4.4,
        },
    ],
}

df_details = pd.DataFrame(employee_details)
print("Employee Details DataFrame:")
print(df_details)
print()

# Extract specific values from dictionaries
df_details["email"] = df_details["contact_info"].apply(lambda x: x["email"])
df_details["phone"] = df_details["contact_info"].apply(lambda x: x["phone"])
df_details["projects_completed"] = df_details["performance_metrics"].apply(lambda x: x["projects_completed"])
df_details["client_satisfaction"] = df_details["performance_metrics"].apply(lambda x: x["client_satisfaction"])

print("Extracted information:")
print(df_details[["name", "email", "phone", "projects_completed", "client_satisfaction"]])
print()

🔹 Working with Dictionaries in DataFrame:
--------------------------------------------------
Employee Details DataFrame:
  employee_id     name                                       contact_info  \
0      EMP001    Alice  {'email': 'alice@company.com', 'phone': '+1-55...   
1      EMP002      Bob  {'email': 'bob@company.com', 'phone': '+1-555-...   
2      EMP003  Charlie  {'email': 'charlie@company.com', 'phone': '+1-...   
3      EMP004    Diana  {'email': 'diana@company.com', 'phone': '+1-55...   
4      EMP005      Eve  {'email': 'eve@company.com', 'phone': '+1-555-...   

                                 performance_metrics  
0  {'projects_completed': 12, 'client_satisfactio...  
1  {'projects_completed': 8, 'client_satisfaction...  
2  {'projects_completed': 15, 'client_satisfactio...  
3  {'projects_completed': 10, 'client_satisfactio...  
4  {'projects_completed': 13, 'client_satisfactio...  

Extracted information:
      name                email        phone  projects_complet

In [14]:
# 2.3.3 Working with Sets and Tuples
print("🔹 Working with Sets and Tuples:")
print("-" * 50)

# Create DataFrame with set and tuple columns
project_data = {
    "project_id": ["PROJ001", "PROJ002", "PROJ003", "PROJ004", "PROJ005"],
    "project_name": [
        "E-commerce Platform",
        "Mobile App",
        "Data Analytics",
        "AI Chatbot",
        "Cloud Migration",
    ],
    "technologies_used": [
        {"Python", "Django", "PostgreSQL", "Redis"},
        {"React Native", "JavaScript", "Firebase"},
        {"Python", "Pandas", "NumPy", "Matplotlib", "Jupyter"},
        {"Python", "TensorFlow", "NLTK", "Flask"},
        {"AWS", "Docker", "Kubernetes", "Terraform"},
    ],
    "team_members": [
        ("Alice", "Bob", "Charlie"),
        ("Diana", "Eve"),
        ("Frank", "Grace", "Henry", "Ivy"),
        ("Jack", "Kate"),
        ("Liam", "Maya", "Noah"),
    ],
    "project_duration": [
        (6, "months"),
        (4, "months"),
        (8, "months"),
        (5, "months"),
        (10, "months"),
    ],
}

df_projects = pd.DataFrame(project_data)
print("Projects DataFrame:")
print(df_projects)
print()

# Analyze technologies
df_projects["num_technologies"] = df_projects["technologies_used"].apply(len)
df_projects["num_team_members"] = df_projects["team_members"].apply(len)
df_projects["duration_months"] = df_projects["project_duration"].apply(lambda x: x[0])

print("Project Analysis:")
print(df_projects[["project_name", "num_technologies", "num_team_members", "duration_months"]])
print()

# Find common technologies across projects
all_technologies = set()
for tech_set in df_projects["technologies_used"]:
    all_technologies.update(tech_set)

print(f"All unique technologies used: {sorted(all_technologies)}")
print()

# Check which projects use Python
python_projects = df_projects[df_projects["technologies_used"].apply(lambda x: "Python" in x)]
print("Projects using Python:")
print(python_projects[["project_name", "technologies_used"]])
print()

🔹 Working with Sets and Tuples:
--------------------------------------------------
Projects DataFrame:
  project_id         project_name  \
0    PROJ001  E-commerce Platform   
1    PROJ002           Mobile App   
2    PROJ003       Data Analytics   
3    PROJ004           AI Chatbot   
4    PROJ005      Cloud Migration   

                              technologies_used                team_members  \
0           {Django, Python, PostgreSQL, Redis}       (Alice, Bob, Charlie)   
1          {Firebase, React Native, JavaScript}                (Diana, Eve)   
2  {NumPy, Matplotlib, Python, Pandas, Jupyter}  (Frank, Grace, Henry, Ivy)   
3             {NLTK, TensorFlow, Python, Flask}                (Jack, Kate)   
4          {Kubernetes, AWS, Docker, Terraform}          (Liam, Maya, Noah)   

  project_duration  
0      (6, months)  
1      (4, months)  
2      (8, months)  
3      (5, months)  
4     (10, months)  

Project Analysis:
          project_name  num_technologies  num_team_mem

# 🚀 LEVEL 3: ADVANCED PANDAS OPERATIONS

## Exercise 3.1: Advanced Data Manipulation and Performance Optimization

Let's explore advanced pandas features and optimization techniques.


In [15]:
# 3.1.1 Advanced Indexing and MultiIndex
print("🔹 Advanced Indexing and MultiIndex:")
print("-" * 50)

# Create a larger dataset for advanced operations
np.random.seed(42)
dates = pd.date_range("2023-01-01", periods=100, freq="D")
products = ["Laptop", "Mouse", "Keyboard", "Monitor", "Headphones"]
regions = ["North", "South", "East", "West"]

# Create sales data with MultiIndex
sales_data = []
for date in dates:
    for product in products:
        for region in regions:
            sales_data.append(
                {
                    "date": date,
                    "product": product,
                    "region": region,
                    "sales": random.randint(10, 100),
                    "price": random.uniform(50, 1000),
                    "quantity": random.randint(1, 20),
                }
            )

df_sales = pd.DataFrame(sales_data)
df_sales["revenue"] = df_sales["sales"] * df_sales["price"]

# Set MultiIndex
df_sales_multi = df_sales.set_index(["date", "product", "region"]).sort_index()
print("Sales DataFrame with MultiIndex:")
print(df_sales_multi.head(10))
print()

# Advanced indexing with MultiIndex
print("Sales for Laptop in North region:")
laptop_north = df_sales_multi.loc[("2023-01-01", "Laptop", "North") : ("2023-01-05", "Laptop", "North")]
print(laptop_north)
print()

# Cross-section operations
print("All Laptop sales across all regions on 2023-01-01:")
laptop_sales = df_sales_multi.xs(("2023-01-01", "Laptop"), level=["date", "product"])
print(laptop_sales)
print()

🔹 Advanced Indexing and MultiIndex:
--------------------------------------------------
Sales DataFrame with MultiIndex:
                              sales       price  quantity       revenue
date       product    region                                           
2023-01-01 Headphones East       24  280.804934        19   6739.318414
                      North      75  240.912321        18  18068.424045
                      South      64  987.705276         2  63213.137666
                      West       54  148.568788        19   8022.714539
           Keyboard   East       37  638.064548         5  23608.388261
                      North      84  739.087640         3  62083.361746
                      South      57  386.616074         5  22037.116228
                      West       87  529.419202         1  46059.470542
           Laptop     East       43  814.401850        16  35019.279556
                      North      72  694.820345        12  50027.064809

Sales for Lapto

In [16]:
# 3.1.2 Pivot Tables and Advanced Reshaping
print("🔹 Pivot Tables and Advanced Reshaping:")
print("-" * 50)

# Reset index for pivot operations
df_sales_pivot = df_sales.copy()

# Create pivot table
pivot_sales = df_sales_pivot.pivot_table(values="revenue", index="date", columns="product", aggfunc="sum", fill_value=0)
print("Revenue Pivot Table (Date vs Product):")
print(pivot_sales.head(10))
print()

# Multi-level pivot table
pivot_multi = df_sales_pivot.pivot_table(
    values=["revenue", "quantity"],
    index=["date", "region"],
    columns="product",
    aggfunc={"revenue": "sum", "quantity": "mean"},
    fill_value=0,
)
print("Multi-level Pivot Table:")
print(pivot_multi.head())
print()

# Melt operation (unpivot)
melted_sales = pivot_sales.reset_index().melt(id_vars="date", value_vars=products, var_name="product", value_name="revenue")
print("Melted DataFrame (first 10 rows):")
print(melted_sales.head(10))
print()

🔹 Pivot Tables and Advanced Reshaping:
--------------------------------------------------
Revenue Pivot Table (Date vs Product):
product        Headphones       Keyboard         Laptop        Monitor  \
date                                                                     
2023-01-01   96043.594664  153788.336777  161517.076492  171817.005216   
2023-01-02  114614.916598  169931.358316  220309.619458  157786.949811   
2023-01-03   82686.838747   31750.536241   70085.813250   72223.589261   
2023-01-04   75905.779011   86718.741533  117350.031910   99126.809574   
2023-01-05  143504.383075  110227.555443  124496.733037  140823.053892   
2023-01-06  177527.472571  207104.312580  116632.609279  118226.087375   
2023-01-07   94802.695172  154768.333324  103971.043304  100307.644881   
2023-01-08  155068.909009  140019.574994  129901.355209   90760.274920   
2023-01-09   97249.527610   87315.276409  115159.285305  139279.709158   
2023-01-10  108835.509184  131746.487846  100767.151469  

In [17]:
# 3.1.3 Advanced Window Functions and Rolling Operations
print("🔹 Window Functions and Rolling Operations:")
print("-" * 50)

# Create time series data
# Use the original df_sales for time series operations
df_ts = df_sales.copy()
df_ts = df_ts.sort_values("date")
df_ts = df_ts.sort_values("date")

# Rolling operations
# Create a simple time series from the sales data
df_ts["daily_revenue"] = df_ts.groupby("date")["revenue"].transform("sum")
df_ts["revenue_7day_avg"] = df_ts["daily_revenue"].rolling(window=7, min_periods=1).mean()
df_ts["revenue_30day_std"] = df_ts["daily_revenue"].rolling(window=30, min_periods=1).std()

print("Time Series with Rolling Statistics:")
print("Time Series with Rolling Statistics:")
print(df_ts[["date", "daily_revenue", "revenue_7day_avg", "revenue_30day_std"]].head(15))
print()

# Window functions with groupby
# Use the original sales data for grouped operations
df_sales_grouped = df_sales.copy()
df_sales_grouped["date"] = pd.to_datetime(df_sales_grouped["date"])
df_sales_grouped["month"] = df_sales_grouped["date"].dt.to_period("M")

# Calculate monthly rankings
monthly_rankings = df_sales_grouped.groupby("month").rank(ascending=False)
print("Monthly Product Rankings (by revenue):")
print(monthly_rankings.head(10))
print()

# Cumulative operations
df_sales_grouped["revenue_cumsum"] = df_sales_grouped["revenue"].cumsum()
df_sales_grouped["revenue_cummax"] = df_sales_grouped["revenue"].cummax()
df_sales_grouped["revenue_cumsum"] = df_sales_grouped["revenue"].cumsum()
df_sales_grouped["revenue_cummax"] = df_sales_grouped["revenue"].cummax()
print("Cumulative Operations:")
print("Cumulative Operations:")
print(df_sales_grouped[["date", "revenue", "revenue_cumsum", "revenue_cummax"]].head(10))
print()

🔹 Window Functions and Rolling Operations:
--------------------------------------------------
Time Series with Rolling Statistics:
Time Series with Rolling Statistics:
         date  daily_revenue  revenue_7day_avg  revenue_30day_std
0  2023-01-01  717147.748406     717147.748406                NaN
5  2023-01-01  717147.748406     717147.748406                0.0
7  2023-01-01  717147.748406     717147.748406                0.0
6  2023-01-01  717147.748406     717147.748406                0.0
1  2023-01-01  717147.748406     717147.748406                0.0
4  2023-01-01  717147.748406     717147.748406                0.0
2  2023-01-01  717147.748406     717147.748406                0.0
9  2023-01-01  717147.748406     717147.748406                0.0
8  2023-01-01  717147.748406     717147.748406                0.0
14 2023-01-01  717147.748406     717147.748406                0.0
3  2023-01-01  717147.748406     717147.748406                0.0
11 2023-01-01  717147.748406     717147.

## Exercise 3.2: Performance Optimization and Memory Management

Learn how to optimize pandas operations for better performance.


In [18]:
# 3.2.1 Memory optimization with categorical data types
print("🔹 Memory Optimization Techniques:")
print("-" * 50)

# Create a large dataset to demonstrate memory optimization
np.random.seed(42)
n_records = 10000

large_data = {
    "id": range(n_records),
    "category": np.random.choice(["A", "B", "C", "D", "E"], n_records),
    "status": np.random.choice(["Active", "Inactive", "Pending"], n_records),
    "region": np.random.choice(["North", "South", "East", "West"], n_records),
    "value": np.random.uniform(0, 1000, n_records),
}

df_large = pd.DataFrame(large_data)
print("Original DataFrame Memory Usage:")
print(f"Memory usage: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Data types:\n{df_large.dtypes}")
print()

# Convert to categorical for memory optimization
df_optimized = df_large.copy()
df_optimized["category"] = df_optimized["category"].astype("category")
df_optimized["status"] = df_optimized["status"].astype("category")
df_optimized["region"] = df_optimized["region"].astype("category")

print("Optimized DataFrame Memory Usage:")
print(f"Memory usage: {df_optimized.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Data types:\n{df_optimized.dtypes}")
print()

# Memory savings
original_memory = df_large.memory_usage(deep=True).sum()
optimized_memory = df_optimized.memory_usage(deep=True).sum()
savings = (original_memory - optimized_memory) / original_memory * 100
print(f"Memory savings: {savings:.1f}%")
print()

🔹 Memory Optimization Techniques:
--------------------------------------------------
Original DataFrame Memory Usage:
Memory usage: 1.67 MB
Data types:
id            int64
category     object
status       object
region       object
value       float64
dtype: object

Optimized DataFrame Memory Usage:
Memory usage: 0.18 MB
Data types:
id             int64
category    category
status      category
region      category
value        float64
dtype: object

Memory savings: 89.1%



In [19]:
# 3.2.2 Vectorized operations vs loops
print("🔹 Performance Comparison: Vectorized vs Loops:")
print("-" * 50)


# Create test data
n = 100000
df_perf = pd.DataFrame({"a": np.random.randn(n), "b": np.random.randn(n), "c": np.random.randn(n)})

# Method 1: Using loops (slow)
start_time = time()
result_loop = []
for i in range(len(df_perf)):
    result_loop.append(df_perf.iloc[i]["a"] * df_perf.iloc[i]["b"] + df_perf.iloc[i]["c"])
df_perf["result_loop"] = result_loop
loop_time = time() - start_time

# Method 2: Using vectorized operations (fast)
start_time = time()
df_perf["result_vectorized"] = df_perf["a"] * df_perf["b"] + df_perf["c"]
vectorized_time = time() - start_time

print(f"Loop method time: {loop_time:.4f} seconds")
print(f"Vectorized method time: {vectorized_time:.4f} seconds")
print(f"Speed improvement: {loop_time / vectorized_time:.1f}x faster")
print()

# Method 3: Using apply (moderate speed)
start_time = time()
df_perf["result_apply"] = df_perf.apply(lambda row: row["a"] * row["b"] + row["c"], axis=1)
apply_time = time() - start_time

print(f"Apply method time: {apply_time:.4f} seconds")
print(f"Vectorized vs Apply speed improvement: {apply_time / vectorized_time:.1f}x faster")
print()

# Verify results are the same
print("Results verification (first 5 rows):")
print(df_perf[["result_loop", "result_vectorized", "result_apply"]].head())
print(f"All results equal: {np.allclose(df_perf['result_loop'], df_perf['result_vectorized'])}")
print()

🔹 Performance Comparison: Vectorized vs Loops:
--------------------------------------------------
Loop method time: 1.6658 seconds
Vectorized method time: 0.0004 seconds
Speed improvement: 4176.4x faster

Apply method time: 0.2749 seconds
Vectorized vs Apply speed improvement: 689.3x faster

Results verification (first 5 rows):
   result_loop  result_vectorized  result_apply
0     0.864434           0.864434      0.864434
1     2.420560           2.420560      2.420560
2    -1.872403          -1.872403     -1.872403
3    -0.038637          -0.038637     -0.038637
4     0.024619           0.024619      0.024619
All results equal: True



# 🌟 LEVEL 4: REAL-WORLD SCENARIOS AND PRACTICAL EXAMPLES

## Exercise 4.1: E-commerce Analytics Dashboard

Let's build a comprehensive analytics solution using all the techniques we've learned.


In [20]:
# 4.1.1 Create comprehensive e-commerce dataset
print("🔹 E-commerce Analytics Dashboard:")
print("-" * 50)

# Set random seed for reproducibility
np.random.seed(42)

# Create comprehensive e-commerce data
n_orders = 5000
n_customers = 1000
n_products = 50

# Customer data
customers = {
    "customer_id": [f"CUST_{i:04d}" for i in range(1, n_customers + 1)],
    "name": [f"Customer_{i}" for i in range(1, n_customers + 1)],
    "age": np.random.randint(18, 80, n_customers),
    "city": np.random.choice(
        [
            "New York",
            "Los Angeles",
            "Chicago",
            "Houston",
            "Phoenix",
            "Philadelphia",
            "San Antonio",
            "San Diego",
            "Dallas",
            "San Jose",
        ],
        n_customers,
    ),
    "membership_tier": np.random.choice(["Bronze", "Silver", "Gold", "Platinum"], n_customers, p=[0.4, 0.3, 0.2, 0.1]),
    "registration_date": pd.date_range("2020-01-01", "2023-12-31", periods=n_customers),
}

# Product data
products = {
    "product_id": [f"PROD_{i:03d}" for i in range(1, n_products + 1)],
    "name": [f"Product_{i}" for i in range(1, n_products + 1)],
    "category": np.random.choice(
        [
            "Electronics",
            "Clothing",
            "Books",
            "Home & Garden",
            "Sports",
            "Beauty",
            "Toys",
            "Automotive",
        ],
        n_products,
    ),
    "price": np.random.uniform(10, 500, n_products),
    "cost": np.random.uniform(5, 250, n_products),
    "supplier": np.random.choice(["Supplier_A", "Supplier_B", "Supplier_C", "Supplier_D"], n_products),
}

# Order data
orders = {
    "order_id": [f"ORDER_{i:05d}" for i in range(1, n_orders + 1)],
    "customer_id": np.random.choice(customers["customer_id"], n_orders),
    "order_date": pd.date_range("2023-01-01", "2023-12-31", periods=n_orders),
    "product_id": np.random.choice(products["product_id"], n_orders),
    "quantity": np.random.randint(1, 10, n_orders),
    "discount_percent": np.random.uniform(0, 0.3, n_orders),
    "shipping_cost": np.random.uniform(5, 25, n_orders),
}

# Create DataFrames
df_customers = pd.DataFrame(customers)
df_products = pd.DataFrame(products)
df_orders = pd.DataFrame(orders)

print("Dataset Overview:")
print(f"Customers: {len(df_customers)}")
print(f"Products: {len(df_products)}")
print(f"Orders: {len(df_orders)}")
print()

# Display sample data
print("Sample Customer Data:")
print(df_customers.head())
print()

print("Sample Product Data:")
print(df_products.head())
print()

print("Sample Order Data:")
print(df_orders.head())
print()

🔹 E-commerce Analytics Dashboard:
--------------------------------------------------
Dataset Overview:
Customers: 1000
Products: 50
Orders: 5000

Sample Customer Data:
  customer_id        name  age          city membership_tier  \
0   CUST_0001  Customer_1   56       Phoenix          Bronze   
1   CUST_0002  Customer_2   69      San Jose            Gold   
2   CUST_0003  Customer_3   46        Dallas          Bronze   
3   CUST_0004  Customer_4   32  Philadelphia            Gold   
4   CUST_0005  Customer_5   60      New York          Silver   

              registration_date  
0 2020-01-01 00:00:00.000000000  
1 2020-01-02 11:04:30.270270270  
2 2020-01-03 22:09:00.540540540  
3 2020-01-05 09:13:30.810810810  
4 2020-01-06 20:18:01.081081081  

Sample Product Data:
  product_id       name    category       price        cost    supplier
0   PROD_001  Product_1  Automotive  346.429058  110.591379  Supplier_A
1   PROD_002  Product_2      Sports   66.098943   83.122370  Supplier_C
2   P

In [21]:
# 4.1.2 Advanced data joins and comprehensive analytics
print("🔹 Advanced Data Joins and Analytics:")
print("-" * 50)

# Join all datasets
df_complete = df_orders.merge(df_customers, on="customer_id", how="left")
df_complete = df_complete.merge(df_products, on="product_id", how="left")

# Calculate derived metrics
df_complete["order_value"] = df_complete["quantity"] * df_complete["price"]
df_complete["discount_amount"] = df_complete["order_value"] * df_complete["discount_percent"]
df_complete["final_order_value"] = df_complete["order_value"] - df_complete["discount_amount"]
df_complete["profit"] = df_complete["quantity"] * (df_complete["price"] - df_complete["cost"])
df_complete["total_cost"] = df_complete["final_order_value"] + df_complete["shipping_cost"]

# Add time-based features
df_complete["order_month"] = df_complete["order_date"].dt.to_period("M")
df_complete["order_quarter"] = df_complete["order_date"].dt.to_period("Q")
df_complete["order_weekday"] = df_complete["order_date"].dt.day_name()
df_complete["order_hour"] = np.random.randint(0, 24, len(df_complete))  # Simulated hour

print("Complete Dataset with Derived Metrics:")
print(
    df_complete[
        [
            "order_id",
            "customer_id",
            "product_id",
            "order_value",
            "final_order_value",
            "profit",
            "order_month",
        ]
    ].head()
)
print()

# Key Performance Indicators (KPIs)
print("📊 KEY PERFORMANCE INDICATORS:")
print("-" * 40)

# Revenue metrics
total_revenue = df_complete["final_order_value"].sum()
total_orders = len(df_complete)
avg_order_value = df_complete["final_order_value"].mean()
total_profit = df_complete["profit"].sum()

print(f"Total Revenue: ${total_revenue:,.2f}")
print(f"Total Orders: {total_orders:,}")
print(f"Average Order Value: ${avg_order_value:.2f}")
print(f"Total Profit: ${total_profit:,.2f}")
print(f"Profit Margin: {(total_profit / total_revenue) * 100:.1f}%")
print()

# Customer metrics
unique_customers = df_complete["customer_id"].nunique()
avg_orders_per_customer = total_orders / unique_customers

print(f"Unique Customers: {unique_customers:,}")
print(f"Average Orders per Customer: {avg_orders_per_customer:.1f}")
print()

🔹 Advanced Data Joins and Analytics:
--------------------------------------------------
Complete Dataset with Derived Metrics:
      order_id customer_id product_id  order_value  final_order_value  \
0  ORDER_00001   CUST_0139   PROD_011   367.675866         303.374547   
1  ORDER_00002   CUST_0996   PROD_015   936.703978         917.167647   
2  ORDER_00003   CUST_0435   PROD_003   990.405263         976.237572   
3  ORDER_00004   CUST_0509   PROD_005   190.835981         187.336648   
4  ORDER_00005   CUST_0273   PROD_023    69.073624          49.556007   

       profit order_month  
0  287.804735     2023-01  
1  880.530472     2023-01  
2  252.211799     2023-01  
3   38.572810     2023-01  
4  -84.342292     2023-01  

📊 KEY PERFORMANCE INDICATORS:
----------------------------------------
Total Revenue: $5,333,840.50
Total Orders: 5,000
Average Order Value: $1066.77
Total Profit: $3,356,987.28
Profit Margin: 62.9%

Unique Customers: 989
Average Orders per Customer: 5.1



In [22]:
# 4.1.3 Advanced analytics and insights
print("🔹 Advanced Analytics and Insights:")
print("-" * 50)

# Top performing products
top_products = (
    df_complete.groupby("product_id")
    .agg(
        {
            "final_order_value": "sum",  # Use final_order_value instead
            "quantity": "sum",
            "order_id": "count",
        }
    )
    .round(2)
)

top_products.columns = ["total_sales", "total_quantity", "order_count"]
top_products = top_products.sort_values("total_sales", ascending=False).head(10)

print("🏆 TOP 10 PRODUCTS BY SALES:")
print(top_products)
print()

# Customer segmentation analysis - FIXED: using 'city' instead of 'region'
customer_analysis = (
    df_complete.groupby("customer_id")
    .agg(
        {
            "final_order_value": ["sum", "mean", "count"],  # Changed to final_order_value
            "quantity": "sum",
            "city": "first",  # Changed from 'region' to 'city'
        }
    )
    .round(2)
)

customer_analysis.columns = [
    "total_spent",
    "avg_order_value",
    "order_count",
    "total_quantity",
    "city",  # Changed column name
]

customer_analysis = customer_analysis.sort_values("total_spent", ascending=False)


# Customer segments
def categorize_customer(row):
    if row["total_spent"] > customer_analysis["total_spent"].quantile(0.8):
        return "VIP"
    elif row["total_spent"] > customer_analysis["total_spent"].quantile(0.6):
        return "High Value"
    elif row["total_spent"] > customer_analysis["total_spent"].quantile(0.4):
        return "Medium Value"
    else:
        return "Low Value"


customer_analysis["segment"] = customer_analysis.apply(categorize_customer, axis=1)

print("👥 CUSTOMER SEGMENTATION:")
segment_summary = (
    customer_analysis.groupby("segment")
    .agg(
        {
            "total_spent": ["count", "mean"],
            "order_count": "mean",
            "total_quantity": "mean",
        }
    )
    .round(2)
)

segment_summary.columns = ["customer_count", "avg_spent", "avg_orders", "avg_quantity"]
print(segment_summary)
print()

# Monthly trends
monthly_trends = (
    df_complete.groupby("order_month")
    .agg(
        {
            "final_order_value": "sum",  # Use final_order_value
            "order_id": "count",
            "quantity": "sum",
        }
    )
    .round(2)
)

monthly_trends.columns = ["monthly_sales", "order_count", "total_quantity"]
print("📈 MONTHLY TRENDS:")
print(monthly_trends.tail(6))  # Last 6 months
print()

🔹 Advanced Analytics and Insights:
--------------------------------------------------
🏆 TOP 10 PRODUCTS BY SALES:
            total_sales  total_quantity  order_count
product_id                                          
PROD_012      224825.48             562          107
PROD_039      214674.49             592          113
PROD_024      201326.01             533          104
PROD_050      190731.01             485           99
PROD_019      186236.72             479          105
PROD_048      185739.00             520          101
PROD_049      185424.62             506          103
PROD_021      183411.20             503           95
PROD_037      171821.55             543          100
PROD_042      168174.20             583          115

👥 CUSTOMER SEGMENTATION:
              customer_count  avg_spent  avg_orders  avg_quantity
segment                                                          
High Value               198    6788.79        5.96         30.56
Low Value                3

## Exercise 4.2: Data Quality and Validation

Learn how to handle data quality issues and validate your datasets.


In [23]:
# 4.2.1 Data quality assessment and cleaning
print("🔹 Data Quality Assessment and Cleaning:")
print("-" * 50)

# Create a dataset with intentional quality issues
np.random.seed(42)
n_records = 1000

# Create data with various quality issues
quality_data = {
    "id": range(1, n_records + 1),
    "name": [f"Customer_{i}" if i % 10 != 0 else None for i in range(1, n_records + 1)],  # Missing names
    "email": [f"user{i}@email.com" if i % 15 != 0 else "invalid_email" for i in range(1, n_records + 1)],  # Invalid emails
    "age": [np.random.randint(18, 80) if i % 20 != 0 else -5 for i in range(1, n_records + 1)],  # Invalid ages
    "salary": [np.random.uniform(30000, 150000) if i % 25 != 0 else None for i in range(1, n_records + 1)],  # Missing salaries
    "phone": [f"+1-555-{i:04d}" if i % 30 != 0 else "invalid_phone" for i in range(1, n_records + 1)],  # Invalid phones
    "city": [
        (np.random.choice(["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"]) if i % 12 != 0 else "Unknown_City")
        for i in range(1, n_records + 1)
    ],  # Unknown cities
    "registration_date": [
        (pd.Timestamp("2023-01-01") + pd.Timedelta(days=np.random.randint(0, 365)) if i % 8 != 0 else pd.NaT)
        for i in range(1, n_records + 1)
    ],  # Missing dates
}

df_quality = pd.DataFrame(quality_data)
print("Dataset with Quality Issues:")
print(df_quality.head(10))
print()

# Data quality assessment
print("📊 DATA QUALITY ASSESSMENT:")
print("-" * 40)

# Missing values analysis
missing_data = df_quality.isnull().sum()
missing_percent = (missing_data / len(df_quality)) * 100

quality_report = pd.DataFrame({"Missing_Count": missing_data, "Missing_Percentage": missing_percent.round(2)})

print("Missing Values Analysis:")
print(quality_report[quality_report["Missing_Count"] > 0])
print()

# Data type analysis
print("Data Types:")
print(df_quality.dtypes)
print()

# Duplicate analysis
duplicates = df_quality.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
print()

# Outlier detection for numeric columns
numeric_cols = df_quality.select_dtypes(include=[np.number]).columns
print("Outlier Analysis (using IQR method):")
for col in numeric_cols:
    if col in df_quality.columns:
        Q1 = df_quality[col].quantile(0.25)
        Q3 = df_quality[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df_quality[(df_quality[col] < lower_bound) | (df_quality[col] > upper_bound)]
        print(f"{col}: {len(outliers)} outliers")
print()

🔹 Data Quality Assessment and Cleaning:
--------------------------------------------------
Dataset with Quality Issues:
   id        name             email  age         salary        phone  \
0   1  Customer_1   user1@email.com   56   52574.539167  +1-555-0001   
1   2  Customer_2   user2@email.com   69   85643.808593  +1-555-0002   
2   3  Customer_3   user3@email.com   46   72402.267363  +1-555-0003   
3   4  Customer_4   user4@email.com   32  100038.733422  +1-555-0004   
4   5  Customer_5   user5@email.com   60   39328.156436  +1-555-0005   
5   6  Customer_6   user6@email.com   25  146927.376920  +1-555-0006   
6   7  Customer_7   user7@email.com   78  148345.289338  +1-555-0007   
7   8  Customer_8   user8@email.com   38  113779.405682  +1-555-0008   
8   9  Customer_9   user9@email.com   56   94331.563961  +1-555-0009   
9  10        None  user10@email.com   75   67143.313954  +1-555-0010   

       city registration_date  
0  New York        2023-10-30  
1   Houston        2023

In [24]:
# 4.2.2 Data cleaning and validation
print("🔹 Data Cleaning and Validation:")
print("-" * 50)

# First, let's see what columns we actually have
print("Available columns in df_complete:")
print(df_complete.columns.tolist())
print()

# Create a cleaned version of the dataset
df_cleaned = df_complete.copy()

# 1. Handle missing values
print("1. Handling Missing Values:")
print(f"Before cleaning - Missing values:\n{df_cleaned.isnull().sum()[df_cleaned.isnull().sum() > 0]}")
print(f"Total missing values: {df_cleaned.isnull().sum().sum()}")
print()

# Fill missing values based on actual columns
if "name_x" in df_cleaned.columns:  # Customer name from merge
    df_cleaned["name_x"] = df_cleaned["name_x"].fillna("Unknown Customer")

if "name_y" in df_cleaned.columns:  # Product name from merge
    df_cleaned["name_y"] = df_cleaned["name_y"].fillna("Unknown Product")

# Fill missing numeric values with median
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    if df_cleaned[col].isnull().sum() > 0:
        median_value = df_cleaned[col].median()
        df_cleaned[col] = df_cleaned[col].fillna(median_value)

# Fill missing dates with a default date
date_columns = df_cleaned.select_dtypes(include=["datetime64"]).columns
for col in date_columns:
    if df_cleaned[col].isnull().sum() > 0:
        df_cleaned[col] = df_cleaned[col].fillna(pd.Timestamp("2023-01-01"))

print(f"After cleaning - Missing values: {df_cleaned.isnull().sum().sum()}")
print()

# 2. Validate numeric ranges
print("2. Numeric Data Validation:")

# Validate age (if it exists)
if "age" in df_cleaned.columns:
    invalid_ages = df_cleaned[(df_cleaned["age"] < 0) | (df_cleaned["age"] > 120)]
    print(f"Invalid ages found: {len(invalid_ages)}")

    if len(invalid_ages) > 0:
        median_age = df_cleaned[(df_cleaned["age"] >= 0) & (df_cleaned["age"] <= 120)]["age"].median()
        df_cleaned.loc[(df_cleaned["age"] < 0) | (df_cleaned["age"] > 120), "age"] = median_age
        print(f"After cleaning - Invalid ages: {len(df_cleaned[(df_cleaned['age'] < 0) | (df_cleaned['age'] > 120)])}")

# Validate prices (should be positive)
if "price" in df_cleaned.columns:
    invalid_prices = df_cleaned[df_cleaned["price"] < 0]
    print(f"Invalid prices found: {len(invalid_prices)}")

    if len(invalid_prices) > 0:
        df_cleaned.loc[df_cleaned["price"] < 0, "price"] = df_cleaned[df_cleaned["price"] >= 0]["price"].median()

# Validate quantities (should be positive)
if "quantity" in df_cleaned.columns:
    invalid_quantities = df_cleaned[df_cleaned["quantity"] <= 0]
    print(f"Invalid quantities found: {len(invalid_quantities)}")

    if len(invalid_quantities) > 0:
        df_cleaned.loc[df_cleaned["quantity"] <= 0, "quantity"] = 1

print()

# 3. Validate categorical data
print("3. Categorical Data Validation:")

# Validate membership_tier
if "membership_tier" in df_cleaned.columns:
    valid_tiers = ["Bronze", "Silver", "Gold", "Platinum"]
    invalid_tiers = df_cleaned[~df_cleaned["membership_tier"].isin(valid_tiers)]
    print(f"Invalid membership tiers found: {len(invalid_tiers)}")

    if len(invalid_tiers) > 0:
        df_cleaned.loc[~df_cleaned["membership_tier"].isin(valid_tiers), "membership_tier"] = "Bronze"

# Validate city data
if "city" in df_cleaned.columns:
    valid_cities = [
        "New York",
        "Los Angeles",
        "Chicago",
        "Houston",
        "Phoenix",
        "Philadelphia",
        "San Antonio",
        "San Diego",
        "Dallas",
        "San Jose",
    ]
    invalid_cities = df_cleaned[~df_cleaned["city"].isin(valid_cities)]
    print(f"Invalid cities found: {len(invalid_cities)}")

    if len(invalid_cities) > 0:
        df_cleaned.loc[~df_cleaned["city"].isin(valid_cities), "city"] = "Other"

# Validate category data
if "category" in df_cleaned.columns:
    valid_categories = [
        "Electronics",
        "Clothing",
        "Books",
        "Home & Garden",
        "Sports",
        "Beauty",
        "Toys",
        "Automotive",
    ]
    invalid_categories = df_cleaned[~df_cleaned["category"].isin(valid_categories)]
    print(f"Invalid categories found: {len(invalid_categories)}")

    if len(invalid_categories) > 0:
        df_cleaned.loc[~df_cleaned["category"].isin(valid_categories), "category"] = "Other"

print()

# 4. Remove duplicates
print("4. Duplicate Records:")
duplicates_before = df_cleaned.duplicated().sum()
print(f"Duplicate records found: {duplicates_before}")

if duplicates_before > 0:
    df_cleaned = df_cleaned.drop_duplicates()
    print(f"After cleaning - Duplicate records: {df_cleaned.duplicated().sum()}")

print()

# Final quality check
print("📊 FINAL QUALITY CHECK:")
print("-" * 50)
print(f"Total records: {len(df_cleaned)}")
print(f"Missing values: {df_cleaned.isnull().sum().sum()}")
print(f"Duplicate records: {df_cleaned.duplicated().sum()}")
print()

print("Data types summary:")
print(df_cleaned.dtypes.value_counts())
print()

print("Numeric columns summary:")
print(df_cleaned.select_dtypes(include=[np.number]).describe().round(2))
print()

print("Sample of cleaned data:")
print(df_cleaned.head(10))
print()

# Show which columns had issues
print("Columns processed:")
print(f"- Total columns: {len(df_cleaned.columns)}")
print(f"- Numeric columns: {len(df_cleaned.select_dtypes(include=[np.number]).columns)}")
print(f"- Categorical columns: {len(df_cleaned.select_dtypes(include=['object']).columns)}")
print(f"- DateTime columns: {len(df_cleaned.select_dtypes(include=['datetime64']).columns)}")

🔹 Data Cleaning and Validation:
--------------------------------------------------
Available columns in df_complete:
['order_id', 'customer_id', 'order_date', 'product_id', 'quantity', 'discount_percent', 'shipping_cost', 'name_x', 'age', 'city', 'membership_tier', 'registration_date', 'name_y', 'category', 'price', 'cost', 'supplier', 'order_value', 'discount_amount', 'final_order_value', 'profit', 'total_cost', 'order_month', 'order_quarter', 'order_weekday', 'order_hour']

1. Handling Missing Values:
Before cleaning - Missing values:
Series([], dtype: int64)
Total missing values: 0

After cleaning - Missing values: 0

2. Numeric Data Validation:
Invalid ages found: 0
Invalid prices found: 0
Invalid quantities found: 0

3. Categorical Data Validation:
Invalid membership tiers found: 0
Invalid cities found: 0
Invalid categories found: 0

4. Duplicate Records:
Duplicate records found: 0

📊 FINAL QUALITY CHECK:
--------------------------------------------------
Total records: 5000
Missi

## Exercise 4.3: Challenge Problems

Test your pandas skills with these challenging exercises!


In [25]:
# 4.3.1 Challenge 1: Complex Data Transformation
print("🔹 CHALLENGE 1: Complex Data Transformation")
print("-" * 50)

# Create a complex dataset with nested structures
challenge_data = {
    "transaction_id": [f"TXN_{i:05d}" for i in range(1, 1001)],
    "customer_id": [f"CUST_{np.random.randint(1, 101):03d}" for _ in range(1000)],
    "transaction_details": [
        {
            "products": [
                {
                    "product_id": f"PROD_{np.random.randint(1, 21):03d}",
                    "quantity": np.random.randint(1, 6),
                    "price": np.random.uniform(10, 200),
                }
                for _ in range(np.random.randint(1, 5))
            ],
            "payment_method": np.random.choice(["Credit Card", "Debit Card", "PayPal", "Cash"]),
            "discount_codes": [f"DISCOUNT_{i}" for i in np.random.choice(range(1, 11), np.random.randint(0, 3))],
        }
        for _ in range(1000)
    ],
    "timestamp": pd.date_range("2023-01-01", periods=1000, freq="H"),
}

df_challenge = pd.DataFrame(challenge_data)
print("Complex Dataset Structure:")
print(df_challenge.head(2))
print()

# Challenge: Flatten the nested transaction_details
print("SOLUTION: Flattening nested data...")

# Extract and flatten the nested data
flattened_data = []
for idx, row in df_challenge.iterrows():
    transaction_id = row["transaction_id"]
    customer_id = row["customer_id"]
    timestamp = row["timestamp"]

    for product in row["transaction_details"]["products"]:
        flattened_data.append(
            {
                "transaction_id": transaction_id,
                "customer_id": customer_id,
                "timestamp": timestamp,
                "product_id": product["product_id"],
                "quantity": product["quantity"],
                "price": product["price"],
                "payment_method": row["transaction_details"]["payment_method"],
                "discount_codes": row["transaction_details"]["discount_codes"],
            }
        )

df_flattened = pd.DataFrame(flattened_data)
print("Flattened Dataset:")
print(df_flattened.head(10))
print(f"Original records: {len(df_challenge)}")
print(f"Flattened records: {len(df_flattened)}")
print()

# Calculate total value per transaction
transaction_totals = (
    df_flattened.groupby("transaction_id")
    .agg(
        {
            "quantity": "sum",
            "price": lambda x: (x * df_flattened.loc[x.index, "quantity"]).sum(),
            "customer_id": "first",
            "timestamp": "first",
            "payment_method": "first",
        }
    )
    .round(2)
)

transaction_totals.columns = [
    "total_quantity",
    "total_value",
    "customer_id",
    "timestamp",
    "payment_method",
]
print("Transaction Totals:")
print(transaction_totals.head())
print()

🔹 CHALLENGE 1: Complex Data Transformation
--------------------------------------------------
Complex Dataset Structure:
  transaction_id customer_id  \
0      TXN_00001    CUST_064   
1      TXN_00002    CUST_059   

                                 transaction_details           timestamp  
0  {'products': [{'product_id': 'PROD_011', 'quan... 2023-01-01 00:00:00  
1  {'products': [{'product_id': 'PROD_014', 'quan... 2023-01-01 01:00:00  

SOLUTION: Flattening nested data...
Flattened Dataset:
  transaction_id customer_id           timestamp product_id  quantity  \
0      TXN_00001    CUST_064 2023-01-01 00:00:00   PROD_011         2   
1      TXN_00001    CUST_064 2023-01-01 00:00:00   PROD_005         3   
2      TXN_00001    CUST_064 2023-01-01 00:00:00   PROD_012         2   
3      TXN_00002    CUST_059 2023-01-01 01:00:00   PROD_014         3   
4      TXN_00002    CUST_059 2023-01-01 01:00:00   PROD_018         3   
5      TXN_00003    CUST_090 2023-01-01 02:00:00   PROD_009    

  "timestamp": pd.date_range("2023-01-01", periods=1000, freq="H"),


In [26]:
# 4.3.2 Challenge 2: Advanced Time Series Analysis
print("🔹 CHALLENGE 2: Advanced Time Series Analysis")
print("-" * 50)

# Create time series data with multiple patterns
np.random.seed(42)
dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
n_days = len(dates)

# Create multiple time series with different patterns
time_series_data = {
    "date": dates,
    "sales": 100 + 50 * np.sin(np.arange(n_days) * 2 * np.pi / 365) + np.random.normal(0, 10, n_days),
    "website_traffic": 1000 + 200 * np.sin(np.arange(n_days) * 2 * np.pi / 7) + np.random.normal(0, 50, n_days),
    "customer_satisfaction": 4.0 + 0.5 * np.sin(np.arange(n_days) * 2 * np.pi / 30) + np.random.normal(0, 0.2, n_days),
    "inventory_level": 500 - np.arange(n_days) * 0.1 + np.random.normal(0, 20, n_days),
    "marketing_spend": 1000 + 500 * np.sin(np.arange(n_days) * 2 * np.pi / 90) + np.random.normal(0, 100, n_days),
}

df_timeseries = pd.DataFrame(time_series_data)
df_timeseries.set_index("date", inplace=True)

print("Time Series Dataset:")
print(df_timeseries.head(10))
print()

# Challenge: Advanced time series analysis
print("SOLUTION: Advanced Time Series Analysis...")

# 1. Rolling statistics
df_timeseries["sales_7day_avg"] = df_timeseries["sales"].rolling(window=7).mean()
df_timeseries["sales_30day_std"] = df_timeseries["sales"].rolling(window=30).std()
df_timeseries["traffic_14day_trend"] = (
    df_timeseries["website_traffic"].rolling(window=14).apply(lambda x: np.polyfit(range(len(x)), x, 1)[0])
)

# 2. Seasonal decomposition (simplified)


def seasonal_decomposition(series, period=365):
    # Simple moving average for trend
    trend = series.rolling(window=period, center=True).mean()

    # Detrended series
    detrended = series - trend

    # Seasonal component (average of detrended values for each day of year)
    seasonal = detrended.groupby(detrended.index.dayofyear).mean()

    # Map seasonal component back to original index
    seasonal_component = detrended.index.map(lambda x: seasonal[x.dayofyear])

    # Residual
    residual = detrended - seasonal_component

    return trend, seasonal_component, residual


trend, seasonal, residual = seasonal_decomposition(df_timeseries["sales"])
df_timeseries["sales_trend"] = trend
df_timeseries["sales_seasonal"] = seasonal
df_timeseries["sales_residual"] = residual

# 3. Correlation analysis with lag


def cross_correlation(series1, series2, max_lag=30):
    correlations = []
    lags = range(-max_lag, max_lag + 1)

    for lag in lags:
        if lag == 0:
            corr = series1.corr(series2)
        elif lag > 0:
            corr = series1.shift(lag).corr(series2)
        else:
            corr = series1.corr(series2.shift(-lag))
        correlations.append(corr)

    return pd.Series(correlations, index=lags)


# Find optimal lag between marketing spend and sales
marketing_sales_corr = cross_correlation(df_timeseries["marketing_spend"], df_timeseries["sales"])
optimal_lag = marketing_sales_corr.idxmax()

print("Time Series Analysis Results:")
print(f"Optimal lag between marketing spend and sales: {optimal_lag} days")
print(f"Maximum correlation: {marketing_sales_corr.max():.3f}")
print()

# 4. Anomaly detection using Z-score


def detect_anomalies(series, threshold=3):
    z_scores = np.abs((series - series.mean()) / series.std())
    return z_scores > threshold


anomalies = detect_anomalies(df_timeseries["sales"])
print(f"Anomalies detected in sales: {anomalies.sum()}")
print()

# 5. Performance metrics
print("Performance Metrics:")
print(
    f"Sales growth rate (yearly): {((df_timeseries['sales'].iloc[-1] / df_timeseries['sales'].iloc[0]) ** (365 / len(df_timeseries)) - 1) * 100:.2f}%"
)
print(f"Average daily sales: {df_timeseries['sales'].mean():.2f}")
print(f"Sales volatility (CV): {(df_timeseries['sales'].std() / df_timeseries['sales'].mean()) * 100:.2f}%")
print()

print("Sample of enhanced time series data:")
print(df_timeseries[["sales", "sales_7day_avg", "sales_trend", "sales_seasonal"]].head(10))
print()

🔹 CHALLENGE 2: Advanced Time Series Analysis
--------------------------------------------------
Time Series Dataset:
                 sales  website_traffic  customer_satisfaction  \
date                                                             
2020-01-01  104.967142       938.785088               3.961061   
2020-01-02   99.478025      1145.915134               4.091118   
2020-01-03  108.197966      1152.459560               4.221719   
2020-01-04  117.811282      1057.750575               4.344341   
2020-01-05  101.098588       942.652172               4.348327   
2020-01-06  101.956870       888.509643               4.475955   
2020-01-07  120.947213       863.367280               4.791152   
2020-01-08  113.684750       940.205847               4.694351   
2020-01-09  102.169195      1178.596430               4.671092   
2020-01-10  113.141041      1254.817157               4.384420   

            inventory_level  marketing_spend  
date                                       

# 🎉 CONGRATULATIONS!

You've completed a comprehensive journey through pandas from basic to advanced level! 

## What You've Learned:

### 📚 **Level 1: Basic Operations**
- Creating DataFrames from different data types (lists, dictionaries, tuples, sets)
- Basic DataFrame operations and inspection
- Column manipulation and data selection

### 🎯 **Level 2: Intermediate Operations**
- Advanced filtering and conditional operations
- Grouping and aggregation techniques
- Working with complex data types (lists, dictionaries, sets, tuples in columns)
- Multi-level grouping and custom aggregations

### 🚀 **Level 3: Advanced Operations**
- MultiIndex and advanced indexing
- Pivot tables and data reshaping
- Window functions and rolling operations
- Performance optimization and memory management
- Vectorized operations vs loops

### 🌟 **Level 4: Real-World Applications**
- E-commerce analytics dashboard
- Data quality assessment and cleaning
- Complex data transformations
- Advanced time series analysis
- Challenge problems with nested data structures

## Key Takeaways:

1. **Data Types Mastery**: You now understand how to work with all Python data types within pandas
2. **Performance Optimization**: You know how to write efficient pandas code
3. **Real-World Skills**: You can handle complex, messy datasets like those in production
4. **Advanced Analytics**: You can perform sophisticated data analysis and transformations

## Next Steps:

- Practice with your own datasets
- Explore pandas documentation for additional features
- Learn about pandas integration with other libraries (matplotlib, seaborn, scikit-learn)
- Try working with larger datasets to practice performance optimization

**Happy coding with pandas! 🐼✨**
