# Lab 1: Python for Data Science - SOLUTIONS

**Day 1 - Foundations**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so random draws are repeatable across runs.
np.random.seed(42)

## Exercise 1: NumPy Fundamentals - SOLUTION

In [None]:
def create_arrays():
    """Build the five sample arrays used in Exercise 1.

    Returns:
        A 5-tuple, in this order: the 1-D array ``[1, 2, 3, 4, 5]``, a 3x3
        array of zeros, a 3x3 array of ones, the integers 0 through 9, and
        the 3x3 identity matrix.
    """
    side = 3
    square = (side, side)
    return (
        np.array([1, 2, 3, 4, 5]),
        np.zeros(square),
        np.ones(square),
        np.arange(10),
        np.eye(side),
    )

def array_operations(arr):
    """Summarize an array with a handful of NumPy reductions.

    Args:
        arr: Array-like of numbers accepted by the NumPy reduction functions.

    Returns:
        A dict with the keys ``'mean'``, ``'std'``, ``'max'``, ``'min_index'``
        (result of ``np.argmin``) and ``'sum'``, in that order. Every
        reduction is called with NumPy's default arguments.
    """
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('max', np.max),
        ('min_index', np.argmin),
        ('sum', np.sum),
    )
    return {label: reduce_fn(arr) for label, reduce_fn in reducers}

# Smoke-test Exercise 1: build the sample arrays and show two of them.
arr_1d, arr_zeros, arr_ones, arr_range, arr_identity = create_arrays()
print("1D Array:", arr_1d)
# One call with a newline separator prints the label and the matrix on separate lines.
print("Identity:", arr_identity, sep="\n")

# The integers 1 through 10, summarized by array_operations.
test_arr = np.arange(1, 11)
arr_stats = array_operations(test_arr)
print("\nArray stats:", arr_stats)

## Exercise 2: Array Manipulation - SOLUTION

In [None]:
def reshape_and_slice():
    """Reshape the integers 0 through 11 into three different layouts.

    Returns:
        A 3-tuple of arrays with shapes ``(3, 4)``, ``(4, 3)`` and
        ``(2, 2, 3)``, all produced by reshaping the same 12-element range.
    """
    base = np.arange(12)
    target_shapes = ((3, 4), (4, 3), (2, 2, 3))
    as_3x4, as_4x3, as_3d = (base.reshape(shape) for shape in target_shapes)
    return as_3x4, as_4x3, as_3d

def array_slicing(arr):
    """Extract four standard slices from a 2D array.

    Args:
        arr: A two-dimensional array that supports tuple indexing.

    Returns:
        A 4-tuple: the first row, the last column, the top-left 2x2 corner,
        and every second element of the first row.
    """
    everything = slice(None)
    first_two = slice(None, 2)
    every_second = slice(None, None, 2)
    return (
        arr[0, everything],
        arr[everything, -1],
        arr[first_two, first_two],
        arr[0, every_second],
    )

# Smoke-test Exercise 2: show one reshape result, then slice a 4x4 matrix.
r1, r2, r3 = reshape_and_slice()
# A newline separator puts the label and the array on separate lines.
print("3x4 reshape:", r1, sep="\n")

test_matrix = np.arange(16).reshape((4, 4))
print("\nTest matrix:", test_matrix, sep="\n")
first_row, last_col, top_left, every_other = array_slicing(test_matrix)
print("\nFirst row:", first_row)
print("Last column:", last_col)

## Exercise 3: Pandas DataFrames - SOLUTION

In [None]:
# Build a synthetic 100-row dataset for the pandas exercises.
# Re-seeding here makes this cell generate the same data each time it runs.
np.random.seed(42)
n_samples = 100

# NOTE: the columns are drawn from the shared global RNG in the order listed,
# so reordering the entries changes the generated values even with the same seed.
data = {
    'age': np.random.randint(18, 65, n_samples),  # integers in [18, 65)
    'income': np.random.normal(50000, 15000, n_samples).astype(int),  # normal(mean=50000, sd=15000), truncated to int
    'education_years': np.random.randint(8, 22, n_samples),  # integers in [8, 22)
    'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], n_samples),  # uniform over the four labels
    'purchased': np.random.choice([0, 1], n_samples, p=[0.6, 0.4])  # 1 with probability 0.4
}

df = pd.DataFrame(data)
df.head()  # first rows of the frame; shown as the cell output when run in a notebook

In [None]:
def explore_dataframe(df):
    """Collect a quick overview of a DataFrame.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A 3-tuple: the ``describe()`` summary table, the per-column dtypes,
        and the per-column count of null values.
    """
    null_mask = df.isnull()
    return df.describe(), df.dtypes, null_mask.sum()

def filter_and_group(df):
    """Apply two row filters and two per-city aggregations.

    Args:
        df: DataFrame with ``'age'``, ``'income'``, ``'purchased'`` and
            ``'city'`` columns.

    Returns:
        A 4-tuple: rows with age above 30, rows with income above 60000 and
        ``purchased == 1``, mean income per city, and the sum of
        ``purchased`` per city.
    """
    is_over_30 = df['age'] > 30
    is_high_income = df['income'] > 60000
    did_purchase = df['purchased'] == 1
    by_city = df.groupby('city')
    return (
        df[is_over_30],
        df[is_high_income & did_purchase],
        by_city['income'].mean(),
        by_city['purchased'].sum(),
    )

# Smoke-test Exercise 3 against the sample DataFrame built above.
stats, dtypes, missing = explore_dataframe(df)
# A newline separator puts the label and the summary table on separate lines.
print("Statistics:", stats, sep="\n")

older, high_buyers, income_city, purch_city = filter_and_group(df)
print(f"\nPeople older than 30: {len(older)}")
print(f"\nIncome by city:\n{income_city}")

## Exercise 4: Data Visualization - SOLUTION

In [None]:
def create_histogram(df, column):
    """Draw a 20-bin histogram of one column via the pyplot interface.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to plot; also used as the x-axis label
            and in the title.
    """
    values = df[column]
    bar_style = {'bins': 20, 'edgecolor': 'black', 'alpha': 0.7}
    plt.hist(values, **bar_style)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')

def create_scatter_plot(df, x_col, y_col, color_col=None):
    """Draw a scatter plot of two columns, optionally colored by a third.

    Args:
        df: DataFrame holding the data.
        x_col: Column plotted on the x-axis.
        y_col: Column plotted on the y-axis.
        color_col: Optional column whose values color the points using the
            'viridis' colormap. When given (and truthy), a labelled colorbar
            is added as well.
    """
    scatter_kwargs = {'alpha': 0.6}
    if color_col:
        scatter_kwargs.update(c=df[color_col], cmap='viridis')

    plt.scatter(df[x_col], df[y_col], **scatter_kwargs)
    if color_col:
        plt.colorbar(label=color_col)

    plt.title(f'{x_col} vs {y_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)

def create_bar_chart(df, category_col, value_col):
    """Draw a bar chart of the mean of one column for each category.

    Args:
        df: DataFrame holding the data.
        category_col: Column to group by; one bar per distinct value.
        value_col: Column whose per-group mean sets the bar height.
    """
    per_category = df.groupby(category_col)[value_col]
    category_means = per_category.mean()
    category_means.plot(kind='bar', edgecolor='black')

    plt.title(f'Mean {value_col} by {category_col}')
    plt.xlabel(category_col)
    plt.ylabel(f'Mean {value_col}')
    # Tilt the category labels on the x-axis by 45 degrees.
    plt.xticks(rotation=45)

# Smoke-test Exercise 4: draw the three charts side by side in one figure.
plt.figure(figsize=(15, 4))
panels = (
    lambda: create_histogram(df, 'income'),
    lambda: create_scatter_plot(df, 'age', 'income', 'purchased'),
    lambda: create_bar_chart(df, 'city', 'income'),
)
# Subplot positions are 1-based: histogram, scatter, then bar chart.
for position, draw_panel in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw_panel()
plt.tight_layout()
plt.show()

## Exercise 5: Data Preprocessing - SOLUTION

In [None]:
def normalize_column(arr):
    """Min-max normalization to [0, 1].

    Args:
        arr: Numeric NumPy array (or another object exposing ``min``,
            ``max``, arithmetic and ``astype``, such as a pandas Series).

    Returns:
        ``(arr - min) / (max - min)``. When every value is identical the
        range is zero and the formula would be 0/0 (all NaN); in that case
        an all-zero float result of the same shape is returned instead.
    """
    lowest = arr.min()
    span = arr.max() - lowest
    shifted = arr - lowest
    if span == 0:
        # Constant input: `shifted` is already all zeros, so just make it
        # float to match the dtype family of the normal division path.
        return shifted.astype(float)
    return shifted / span

def standardize_column(arr):
    """Z-score standardization.

    Args:
        arr: Numeric NumPy array (or another object exposing ``mean``,
            ``std``, arithmetic and ``astype``, such as a pandas Series).

    Returns:
        ``(arr - mean) / std``, using the input's own ``mean()`` and
        ``std()`` with their default arguments. When the standard deviation
        is zero (constant input) the formula would be 0/0 (all NaN); in that
        case an all-zero float result of the same shape is returned instead.
    """
    centered = arr - arr.mean()
    spread = arr.std()
    if spread == 0:
        # Zero spread means every deviation from the mean is zero, so the
        # centered values are already the answer; only the dtype needs fixing.
        return centered.astype(float)
    return centered / spread

def one_hot_encode(df, column):
    """Replace one categorical column with indicator (dummy) columns.

    Args:
        df: Source DataFrame.
        column: Name of the column to expand.

    Returns:
        The DataFrame produced by ``pd.get_dummies`` with ``column`` as the
        only column requested for encoding.
    """
    columns_to_encode = [column]
    return pd.get_dummies(df, columns=columns_to_encode)

# Smoke-test Exercise 5 on a small evenly spaced array and the sample DataFrame.
test_data = np.arange(10, 51, 10)
print("Original:", test_data)
normalized = normalize_column(test_data)
print("Normalized:", normalized)

standardized = standardize_column(test_data)
print("Standardized:", standardized)
# After standardizing, the mean should print as ~0 and the std as ~1.
print(f"  Mean: {standardized.mean():.4f}, Std: {standardized.std():.4f}")

encoded = one_hot_encode(df, 'city')
# A newline separator puts the label and the preview table on separate lines.
print("\nOne-hot encoded columns:", encoded.head(), sep="\n")

## Checkpoint

Lab 1 complete! **Next:** Lab 2 - Machine Learning Basics