In [None]:
# Python Basics - Getting Started

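Welcome to your Python learning journey! This notebook covers fundamental Python concepts: variables and data types, lists and dictionaries, control flow, functions, and a first look at pandas.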

## 1. Variables and Data Types

In [1]:
# One example value for each of the four basic built-in types.
name: str = "Data Engineer"
year: int = 2026
# Demo of a float only: real version numbers are better kept as strings,
# because a float cannot tell 3.10 apart from 3.1.
version: float = 3.11
is_learning: bool = True

# A single print call with sep="\n" writes one line per argument.
print(
    f"Hello, {name}!",
    f"Year: {year}, Python version: {version}",
    f"Currently learning: {is_learning}",
    sep="\n",
)

Hello, Data Engineer!
Year: 2026, Python version: 3.11
Currently learning: True


## 2. Lists and Dictionaries

In [2]:
# A list is an ordered, mutable sequence; indexing starts at 0.
tools = [
    "Python",
    "SQL",
    "Spark",
    "Airflow",
]
print(f"Tools: {tools}", f"First tool: {tools[0]}", sep="\n")

# A dictionary maps keys to values; dict(...) with keyword arguments
# builds the same string-keyed mapping as a {...} literal.
data_engineer = dict(
    name="Santo",
    role="Data Engineer",
    skills=["Python", "SQL", "ETL"],
)
print(f"\nProfile: {data_engineer}")

Tools: ['Python', 'SQL', 'Spark', 'Airflow']
First tool: Python

Profile: {'name': 'Santo', 'role': 'Data Engineer', 'skills': ['Python', 'SQL', 'ETL']}


## 3. Control Flow

In [3]:
# Loop: print a header, then one bullet per entry of `tools`
# (the list created in the "Lists and Dictionaries" cell).
print("Data Engineering Tools:")
for tool in tools:
    print(f"  - {tool}")

# Conditional: a chained conditional expression is evaluated left to
# right, so the first threshold that matches decides the level.
experience_years = 2
level = (
    "Junior" if experience_years < 1
    else "Mid" if experience_years < 3
    else "Senior"
)

print(f"\nWith {experience_years} years experience: {level} level")

Data Engineering Tools:
  - Python
  - SQL
  - Spark
  - Airflow

With 2 years experience: Mid level


## 4. Functions

In [None]:
def calculate_data_size(rows: int, columns: int, bytes_per_cell: int = 8) -> str:
    """Return a human-readable estimate of a dataset's size.

    The estimate is ``rows * columns * bytes_per_cell``, reported in the
    largest fitting binary unit (1 KB = 1024 bytes), capped at GB.

    Args:
        rows: Number of rows; must not be negative.
        columns: Number of columns; must not be negative.
        bytes_per_cell: Bytes assumed for every cell (default 8).

    Returns:
        ``"<n> bytes"`` for totals below 1024 bytes, otherwise the total
        in KB, MB or GB formatted with two decimal places.

    Raises:
        ValueError: If any argument is negative. A negative total would
            otherwise be reported as a meaningless negative byte count.
    """
    arguments = (("rows", rows), ("columns", columns), ("bytes_per_cell", bytes_per_cell))
    for label, value in arguments:
        if value < 0:
            raise ValueError(f"{label} must be non-negative, got {value}")

    # Binary unit sizes, named once so the thresholds below read clearly.
    kb = 1024
    mb = kb ** 2
    gb = kb ** 3

    total_bytes = rows * columns * bytes_per_cell

    if total_bytes < kb:
        return f"{total_bytes} bytes"
    elif total_bytes < mb:
        return f"{total_bytes / kb:.2f} KB"
    elif total_bytes < gb:
        return f"{total_bytes / mb:.2f} MB"
    else:
        # GB is the largest unit, so anything bigger is still shown in GB.
        return f"{total_bytes / gb:.2f} GB"

# Try it out: one million rows by fifty columns, default bytes per cell
print(calculate_data_size(rows=1_000_000, columns=50))

## 5. Working with Pandas (Data Engineering Essential)

In [1]:
import pandas as pd
import numpy as np

# Column-oriented sample data: each keyword below becomes one column,
# and every column holds five values (one per row).
data = dict(
    date=pd.date_range(start='2026-01-01', periods=5),
    product=['A', 'B', 'A', 'C', 'B'],
    sales=[100, 150, 200, 80, 120],
    quantity=[10, 15, 20, 8, 12],
)

print("Sample Sales Data:")
df = pd.DataFrame(data)
# A bare expression on the last line of a cell is displayed by Jupyter.
df

Sample Sales Data:


Unnamed: 0,date,product,sales,quantity
0,2026-01-01,A,100,10
1,2026-01-02,B,150,15
2,2026-01-03,A,200,20
3,2026-01-04,C,80,8
4,2026-01-05,B,120,12


In [2]:
# Overview of the frame: its dimensions and its column names
print(
    "DataFrame Info:",
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

# Headline figures, computed from the 'sales' column selected once
sales_column = df['sales']
print(f"\nTotal Sales: ${sales_column.sum()}")
print(f"Average Sales: ${sales_column.mean():.2f}")

# Total sales per product; as the cell's last expression the resulting
# Series is displayed by Jupyter without an explicit print.
print("\nSales by Product:")
df.groupby('product')['sales'].sum()

DataFrame Info:
Shape: (5, 4)
Columns: ['date', 'product', 'sales', 'quantity']

Total Sales: $650
Average Sales: $130.00

Sales by Product:


product
A    300
B    270
C     80
Name: sales, dtype: int64

## Next Steps

Continue your learning with:
- **02_data_structures.ipynb** - Advanced Python data structures
- **03_file_handling.ipynb** - Reading/writing CSV, JSON, Parquet
- **04_pandas_deep_dive.ipynb** - Advanced Pandas operations
