# Lab 1: Python for Data Science

**Day 1 - Foundations**

| Duration | Difficulty | Prerequisites |
|----------|------------|---------------|
| 60 min | Beginner | Basic Python |

## Learning Objectives

- Master NumPy array operations
- Learn Pandas DataFrame manipulation
- Create visualizations with Matplotlib
- Understand data preprocessing basics

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so every run draws the same numbers
np.random.seed(42)

# Report which library versions this notebook is running against
for label, lib in (("NumPy version:", np), ("Pandas version:", pd)):
    print(label, lib.__version__)

---

## Exercise 1: NumPy Fundamentals

NumPy is the foundation of scientific computing in Python. Most ML libraries build on, or interoperate with, NumPy arrays.

**Your Task:** Complete the functions below to practice array operations.

In [None]:
def create_arrays():
    """Build five example NumPy arrays (student exercise).

    Replace each ``None`` placeholder with the array described by the
    TODO comment directly above it.

    Returns:
        A 5-tuple ``(arr_1d, arr_zeros, arr_ones, arr_range,
        arr_identity)``. An element stays ``None`` until its TODO has
        been completed.
    """
    # TODO: 1D array holding the values [1, 2, 3, 4, 5]
    arr_1d = None

    # TODO: 2D array of shape 3x3 in which every entry is zero
    arr_zeros = None

    # TODO: 2D array of shape 3x3 in which every entry is one
    arr_ones = None

    # TODO: array holding the values 0 through 9
    arr_range = None

    # TODO: 3x3 identity matrix
    arr_identity = None

    return (
        arr_1d,
        arr_zeros,
        arr_ones,
        arr_range,
        arr_identity,
    )

In [None]:
def array_operations(arr):
    """Compute summary statistics of an array (student exercise).

    Replace each ``None`` placeholder with the value described by the
    TODO comment directly above it.

    Args:
        arr: The array to summarise.

    Returns:
        A dict with the keys ``'mean'``, ``'std'``, ``'max'``,
        ``'min_index'`` and ``'sum'``. A value stays ``None`` until its
        TODO has been completed.
    """
    # TODO: mean of the array
    mean_val = None

    # TODO: standard deviation of the array
    std_val = None

    # TODO: largest value in the array
    max_val = None

    # TODO: index (position) of the smallest value
    min_idx = None

    # TODO: sum of all elements
    total = None

    summary = {
        'mean': mean_val,
        'std': std_val,
        'max': max_val,
        'min_index': min_idx,
        'sum': total,
    }
    return summary

In [None]:
# Test Exercise 1
# Show every array returned by create_arrays() so each TODO can be checked
arr_1d, arr_zeros, arr_ones, arr_range, arr_identity = create_arrays()
print("1D Array:", arr_1d)
print("Zeros:")
print(arr_zeros)
print("\nOnes:")
print(arr_ones)
print("\nRange:", arr_range)
print("\nIdentity:")
print(arr_identity)

# Summarise the integers 1..10 with array_operations()
test_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
stats = array_operations(test_arr)
print("\nArray stats:", stats)

---

## Exercise 2: Array Manipulation

**Your Task:** Practice reshaping and slicing arrays.

In [None]:
def reshape_and_slice():
    """Reshape a 12-element array into three layouts (student exercise).

    Replace each ``None`` placeholder with a reshaped version of
    ``flat`` as described by the TODO comment directly above it.

    Returns:
        A 3-tuple ``(reshaped_3x4, reshaped_4x3, reshaped_3d)``. An
        element stays ``None`` until its TODO has been completed.
    """
    # Starting point: the 12 values produced by np.arange(12), in 1D
    flat = np.arange(12)

    # TODO: reshape `flat` into a 3x4 matrix
    reshaped_3x4 = None

    # TODO: reshape `flat` into a 4x3 matrix
    reshaped_4x3 = None

    # TODO: reshape `flat` into a 2x2x3 (3D) array
    reshaped_3d = None

    return (
        reshaped_3x4,
        reshaped_4x3,
        reshaped_3d,
    )

In [None]:
def array_slicing(arr):
    """Extract pieces of a 2D array by slicing (student exercise).

    Replace each ``None`` placeholder with the slice described by the
    TODO comment directly above it.

    Args:
        arr: A 4x4 matrix.

    Returns:
        A 4-tuple ``(first_row, last_col, top_left, every_other)``. An
        element stays ``None`` until its TODO has been completed.
    """
    # TODO: the first row of `arr`
    first_row = None

    # TODO: the last column of `arr`
    last_col = None

    # TODO: the 2x2 submatrix in the top-left corner
    top_left = None

    # TODO: every other element of the first row
    every_other = None

    return (
        first_row,
        last_col,
        top_left,
        every_other,
    )

In [None]:
# Test Exercise 2
# Show all three reshapes so each TODO in reshape_and_slice() can be checked
r1, r2, r3 = reshape_and_slice()
print("3x4 reshape:")
print(r1)
print("\n4x3 reshape:")
print(r2)
print("\n2x2x3 reshape:")
print(r3)

# 4x4 matrix holding 0..15, used as input for the slicing exercise
test_matrix = np.arange(16).reshape(4, 4)
print("\nTest matrix:")
print(test_matrix)

first_row, last_col, top_left, every_other = array_slicing(test_matrix)
print("\nFirst row:", first_row)
print("Last column:", last_col)
print("Top-left 2x2:")
print(top_left)
print("Every other element of first row:", every_other)

---

## Exercise 3: Pandas DataFrames

Pandas is essential for data manipulation. Let's practice with a sample dataset.

**Your Task:** Complete the data analysis functions.

In [None]:
# Create a sample dataset
# Re-seed so the dataset is the same on every run of this cell
np.random.seed(42)
n_samples = 100

# Build the columns one at a time. Each draw consumes the global random
# stream, so the order of these assignments determines the generated values.
data = {}
# Integer ages from 18 up to (but not including) 65
data['age'] = np.random.randint(18, 65, n_samples)
# Normally distributed income (mean 50000, std 15000), converted to int
data['income'] = np.random.normal(50000, 15000, n_samples).astype(int)
# Integer years of education from 8 up to (but not including) 22
data['education_years'] = np.random.randint(8, 22, n_samples)
# One of four city labels, chosen uniformly
data['city'] = np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], n_samples)
# Purchase flag: 0 with probability 0.6, 1 with probability 0.4
data['purchased'] = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])

df = pd.DataFrame(data)
print("Dataset shape:", df.shape)
df.head()

In [None]:
def explore_dataframe(df):
    """Collect a first overview of a DataFrame (student exercise).

    Replace each ``None`` placeholder with the result described by the
    TODO comment directly above it.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A 3-tuple ``(stats, dtypes, missing)``. An element stays
        ``None`` until its TODO has been completed.
    """
    # TODO: basic statistics of the columns (use describe())
    summary_stats = None

    # TODO: data type of each column (use dtypes)
    column_types = None

    # TODO: number of missing values per column (use isnull().sum())
    missing_counts = None

    return (
        summary_stats,
        column_types,
        missing_counts,
    )

In [None]:
def filter_and_group(df):
    """Filter rows and aggregate by group (student exercise).

    Replace each ``None`` placeholder with the result described by the
    TODO comment directly above it.

    Args:
        df: DataFrame to filter and group. The TODOs refer to the
            columns ``age``, ``income``, ``purchased`` and ``city``.

    Returns:
        A 4-tuple ``(older_than_30, high_income_buyers, income_by_city,
        purchases_by_city)``. An element stays ``None`` until its TODO
        has been completed.
    """
    # TODO: rows where age > 30
    older_than_30 = None

    # TODO: rows where income > 60000 AND purchased == 1
    high_income_buyers = None

    # TODO: mean income for each city (group by city)
    income_by_city = None

    # TODO: count of purchases for each city (group by city)
    purchases_by_city = None

    return (
        older_than_30,
        high_income_buyers,
        income_by_city,
        purchases_by_city,
    )

In [None]:
# Test Exercise 3
# Show every result of explore_dataframe() so each TODO can be checked
stats, dtypes, missing = explore_dataframe(df)
print("Statistics:")
print(stats)
print("\nData types:")
print(dtypes)
print("\nMissing values:")
print(missing)

# Show every result of filter_and_group(); len() is guarded because the
# unfinished stub returns None for the filtered frames
older, high_buyers, income_city, purch_city = filter_and_group(df)
print(f"\nPeople older than 30: {len(older) if older is not None else 'Not implemented'}")
print(f"High-income buyers: {len(high_buyers) if high_buyers is not None else 'Not implemented'}")
print("\nIncome by city:")
print(income_city)
print("\nPurchases by city:")
print(purch_city)

---

## Exercise 4: Data Visualization

**Your Task:** Create visualizations to understand your data.

In [None]:
def create_histogram(df, column):
    """Draw a histogram of one numeric column (student exercise).

    Args:
        df: DataFrame holding the data.
        column: Name of the numeric column to plot.

    Returns:
        None. Nothing is drawn until the TODO has been completed.
    """
    # TODO: draw a histogram of df[column] with 20 bins
    # Helpers to use: plt.hist(), plt.xlabel(), plt.ylabel(), plt.title()
    return None  # placeholder until the plot is implemented

In [None]:
def create_scatter_plot(df, x_col, y_col, color_col=None):
    """Draw a scatter plot of two columns (student exercise).

    Args:
        df: DataFrame holding the data.
        x_col: Name of the column for the x-axis.
        y_col: Name of the column for the y-axis.
        color_col: Optional name of a column used to colour the
            points. Defaults to ``None`` (no colouring by column).

    Returns:
        None. Nothing is drawn until the TODO has been completed.
    """
    # TODO: draw a scatter plot of df[y_col] against df[x_col]
    # When color_col is given, colour the points by that column
    return None  # placeholder until the plot is implemented

In [None]:
def create_bar_chart(df, category_col, value_col):
    """Draw a bar chart of mean values per category (student exercise).

    Args:
        df: DataFrame holding the data.
        category_col: Name of the column to group by.
        value_col: Name of the column whose mean is plotted per group.

    Returns:
        None. Nothing is drawn until the TODO has been completed.
    """
    # TODO: group df by category_col and plot the mean of value_col
    return None  # placeholder until the plot is implemented

In [None]:
# Test Exercise 4
# One wide figure holding the three exercise plots side by side
plt.figure(figsize=(15, 4))

# Each entry pairs a plotting function with the column names it receives
panels = (
    (create_histogram, ('income',)),
    (create_scatter_plot, ('age', 'income', 'purchased')),
    (create_bar_chart, ('city', 'income')),
)

# Select panel 1, 2, 3 of the 1x3 grid in turn, then draw into it
for position, (draw, columns) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(df, *columns)

plt.tight_layout()
plt.show()

---

## Exercise 5: Data Preprocessing

Preparing data for ML is crucial. Let's practice common preprocessing steps.

**Your Task:** Implement preprocessing functions.

In [None]:
def normalize_column(arr):
    """Rescale an array to the range [0, 1] (student exercise).

    Formula: (x - min) / (max - min)

    Args:
        arr: Array of numeric values to rescale.

    Returns:
        The rescaled array once implemented; ``None`` until then.
    """
    # TODO: implement min-max normalization using the formula above
    # Note: the denominator (max - min) is zero when all values are equal
    return None  # placeholder until implemented

In [None]:
def standardize_column(arr):
    """Rescale an array to mean 0 and standard deviation 1 (student exercise).

    Formula: (x - mean) / std

    Args:
        arr: Array of numeric values to standardize.

    Returns:
        The standardized array once implemented; ``None`` until then.
    """
    # TODO: implement z-score standardization using the formula above
    return None  # placeholder until implemented

In [None]:
def one_hot_encode(df, column):
    """One-hot encode a categorical column (student exercise).

    Hint: Use pd.get_dummies()

    Args:
        df: DataFrame holding the data.
        column: Name of the categorical column to encode.

    Returns:
        The one-hot encoded result once implemented; ``None`` until then.
    """
    # TODO: build the one-hot encoded columns for df[column]
    return None  # placeholder until implemented

In [None]:
# Test Exercise 5
# Small evenly spaced input used to check both scaling functions
test_data = np.array([10, 20, 30, 40, 50])

normalized = normalize_column(test_data)
print("Original:", test_data)
print("Normalized:", normalized)
if normalized is not None:
    # Per the [0, 1] target range, Min should be 0 and Max should be 1
    print(f"  Min: {np.min(normalized):.4f}, Max: {np.max(normalized):.4f}")

standardized = standardize_column(test_data)
print("Standardized:", standardized)
if standardized is not None:
    # Per the mean=0, std=1 target, these should print as 0 and 1
    print(f"  Mean: {standardized.mean():.4f}, Std: {standardized.std():.4f}")

encoded = one_hot_encode(df, 'city')
print("\nOne-hot encoded columns:")
if encoded is not None:
    print(encoded.head())
else:
    # Avoid a bare header with nothing under it while the stub is unfinished
    print("Not implemented")

---

## Checkpoint

Congratulations! You've completed Lab 1.

### Key Takeaways:
- NumPy arrays are the foundation of ML in Python
- Pandas DataFrames make data manipulation easy
- Visualization helps understand your data
- Preprocessing (normalization, encoding) is essential for ML

**Next:** Lab 2 - Machine Learning Basics