# Lab 2: Machine Learning Basics - SOLUTIONS

**Day 1 - Foundations**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Seed NumPy's global random generator so the random data drawn in the cells
# below is the same on every run.
np.random.seed(42)

## Exercise 1: Data Splitting - SOLUTION

In [None]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and targets into training and test partitions.

    Thin wrapper that forwards everything to ``train_test_split``.

    Args:
        X: Feature data.
        y: Targets aligned with ``X``.
        test_size: Passed through as ``train_test_split``'s ``test_size``.
        random_state: Passed through as ``train_test_split``'s
            ``random_state``.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs; the caller
        below unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return partitions

# Test: build a toy two-feature binary dataset and report the split sizes.
X = np.random.randn(100, 2)  # 100 samples x 2 features
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # label is 1 where the two features sum to > 0
# Uses split_data's defaults (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = split_data(X, y)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## Exercise 2: Linear Regression from Scratch - SOLUTION

In [None]:
class SimpleLinearRegression:
    """Least-squares fit of a straight line ``y = slope * x + intercept``.

    Supports a single feature only: every input is flattened to 1-D before
    use, so ``X`` may be passed as shape ``(n,)``, ``(n, 1)`` or a plain list.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.slope = None
        self.intercept = None

    def fit(self, X, y):
        """Estimate slope and intercept from paired samples.

        Args:
            X: Single-feature inputs (array-like, flattened to 1-D).
            y: Targets (array-like, flattened to 1-D).

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        x = np.asarray(X, dtype=float).ravel()
        # Flatten y as well: a column-vector y of shape (n, 1) would otherwise
        # broadcast against x to an (n, n) matrix and silently give slope 0.
        y = np.asarray(y, dtype=float).ravel()

        if x.size == 0:
            raise ValueError("Cannot fit a line to empty data")
        if x.size != y.size:
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {x.size} and {y.size})"
            )

        x_mean = x.mean()
        y_mean = y.mean()

        if x.max() == x.min():
            # All x values are identical, so the slope is undetermined (0/0).
            # Fall back to the horizontal line through the mean of y instead
            # of storing NaN.
            self.slope = 0.0
        else:
            x_centered = x - x_mean
            numerator = np.sum(x_centered * (y - y_mean))
            denominator = np.sum(x_centered ** 2)
            self.slope = numerator / denominator

        self.intercept = y_mean - self.slope * x_mean
        return self

    def predict(self, X):
        """Evaluate the fitted line at ``X``.

        Args:
            X: Inputs (array-like, flattened to 1-D). ``fit`` must have been
                called first.

        Returns:
            1-D array of predictions, one per input value.
        """
        x = np.asarray(X, dtype=float).ravel()
        return self.slope * x + self.intercept

# Test: synthetic data scattered around the line y = 2.5x + 5.
X_reg = np.random.rand(100, 1) * 10  # 100 inputs as a column, scaled to [0, 10)
y_reg = 2.5 * X_reg.flatten() + 5 + np.random.randn(100) * 2  # Gaussian noise scaled by 2

model = SimpleLinearRegression()
model.fit(X_reg, y_reg)
# The recovered parameters should land close to the generating values.
print(f"Slope: {model.slope:.4f} (expected ~2.5)")
print(f"Intercept: {model.intercept:.4f} (expected ~5.0)")

# Overlay the fitted line (predicted on the training inputs) on the raw points.
y_pred = model.predict(X_reg)
plt.scatter(X_reg, y_reg, alpha=0.5, label='Data')
plt.plot(X_reg, y_pred, 'r-', linewidth=2, label='Prediction')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

## Exercise 3: Sklearn Regression - SOLUTION

In [None]:
def train_sklearn_regression(X_train, y_train, X_test):
    """Fit a scikit-learn ``LinearRegression`` and predict on ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on after fitting.

    Returns:
        Tuple ``(model, predictions)``: the fitted estimator and its
        predictions for ``X_test``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return regressor, regressor.predict(X_test)

# Hold out 20% of the synthetic regression data (fixed seed for repeatability),
# then fit scikit-learn's LinearRegression on the remaining samples.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
sk_model, sk_pred = train_sklearn_regression(X_train_reg, y_train_reg, X_test_reg)
# X_reg has a single column, so coef_[0] is the slope of the fitted line.
print(f"Sklearn slope: {sk_model.coef_[0]:.4f}")
print(f"Sklearn intercept: {sk_model.intercept_:.4f}")

## Exercise 4: Classification - SOLUTION

In [None]:
def train_classifier(X_train, y_train):
    """Fit a ``LogisticRegression`` with default settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    return classifier

def get_predictions_and_probabilities(model, X_test):
    """Return predicted labels and the second probability column.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Features to score.

    Returns:
        Tuple ``(predictions, probabilities)`` where ``probabilities`` is
        column index 1 of ``model.predict_proba(X_test)``.
    """
    labels = model.predict(X_test)
    all_class_probabilities = model.predict_proba(X_test)
    # Keep only the column at index 1 (the second class).
    second_class_column = all_class_probabilities[:, 1]
    return labels, second_class_column

# Test: synthetic dataset of 200 samples with two informative features (no
# redundant ones) and one cluster per class, split 80/20 with a fixed seed.
X_clf, y_clf = make_classification(n_samples=200, n_features=2, n_redundant=0, 
                                   n_informative=2, n_clusters_per_class=1, random_state=42)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42)

clf = train_classifier(X_train_clf, y_train_clf)
# preds are the predicted labels; probs is column 1 of predict_proba.
preds, probs = get_predictions_and_probabilities(clf, X_test_clf)
print(f"First 5 predictions: {preds[:5]}")
print(f"First 5 probabilities: {probs[:5].round(3)}")

## Exercise 5: Model Evaluation - SOLUTION

In [None]:
def calculate_regression_metrics(y_true, y_pred):
    """Compute MSE, RMSE and MAE between true and predicted targets.

    Both inputs are converted to flat float arrays first, so plain lists
    work and a column vector is never broadcast against a 1-D array.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of
            elements as ``y_true``.

    Returns:
        Dict with keys ``'MSE'``, ``'RMSE'`` and ``'MAE'``.

    Raises:
        ValueError: If the two inputs contain a different number of values.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of values "
            f"(got {y_true.size} and {y_pred.size})"
        )

    residuals = y_true - y_pred
    mse = np.mean(residuals ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(residuals))
    return {'MSE': mse, 'RMSE': rmse, 'MAE': mae}

def calculate_classification_metrics(y_true, y_pred):
    """Compute accuracy and the confusion matrix for predicted labels.

    Inputs are converted to flat arrays before comparing. Without that,
    two plain lists compared with ``==`` give a single bool, and the
    accuracy would come out as 0.0 or 1.0.

    Args:
        y_true: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like) with the same number of
            elements as ``y_true``.

    Returns:
        Dict with keys ``'accuracy'`` (fraction of matching labels) and
        ``'confusion_matrix'`` (the result of ``confusion_matrix``).

    Raises:
        ValueError: If the two inputs contain a different number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of labels "
            f"(got {y_true.size} and {y_pred.size})"
        )

    accuracy = np.mean(y_true == y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    return {'accuracy': accuracy, 'confusion_matrix': conf_matrix}

def plot_confusion_matrix(conf_matrix, classes=None):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Works for any number of classes; the grid size is taken from the matrix
    itself rather than being fixed at 2x2.

    Args:
        conf_matrix: 2-D array-like of counts, rows = actual, columns =
            predicted (matching the axis labels set below).
        classes: Tick labels, one per class. Defaults to ``'Class 0'``,
            ``'Class 1'``, ... when omitted.
    """
    conf_matrix = np.asarray(conf_matrix)
    n_rows, n_cols = conf_matrix.shape
    if classes is None:
        # Built per call to avoid a shared mutable default argument.
        classes = [f'Class {k}' for k in range(n_rows)]

    plt.imshow(conf_matrix, cmap='Blues')
    plt.colorbar()
    plt.xticks(np.arange(n_cols), classes)
    plt.yticks(np.arange(n_rows), classes)
    # Write each count in the centre of its cell (x = column, y = row).
    for i in range(n_rows):
        for j in range(n_cols):
            plt.text(j, i, conf_matrix[i, j], ha='center', va='center', fontsize=14)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')

# Test: evaluate the scikit-learn regressor and the classifier from the
# earlier cells on their held-out test sets.
y_pred_reg = sk_model.predict(X_test_reg)
print("Regression Metrics:", calculate_regression_metrics(y_test_reg, y_pred_reg))
print("Classification Metrics:", calculate_classification_metrics(y_test_clf, preds))

# The classification metrics are computed a second time here just to pull out
# the confusion matrix for plotting.
plot_confusion_matrix(calculate_classification_metrics(y_test_clf, preds)['confusion_matrix'])
plt.show()

## Exercise 6: Overfitting Demonstration - SOLUTION

In [None]:
def demonstrate_overfitting():
    """Plot polynomial fits of degree 1, 4 and 15 to a noisy sine curve.

    Reseeds NumPy's global random generator with 42, draws 30 noisy samples
    of ``sin(1.5 * x)`` on [0, 4], fits one polynomial regression pipeline
    per degree, and shows the three fits side by side in a single figure.
    """
    np.random.seed(42)
    x_samples = np.linspace(0, 4, 30).reshape(-1, 1)
    noise = np.random.randn(30) * 0.3
    y_samples = np.sin(x_samples.flatten() * 1.5) + noise

    # Dense grid for drawing the fitted curves; it is the same for every panel.
    x_grid = np.linspace(0, 4, 100).reshape(-1, 1)

    # (polynomial degree, panel title), in left-to-right plotting order.
    panels = [
        (1, 'Underfitting (degree=1)'),
        (4, 'Good Fit (degree=4)'),
        (15, 'Overfitting (degree=15)'),
    ]

    plt.figure(figsize=(15, 4))
    for position, (degree, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)

        poly_model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        poly_model.fit(x_samples, y_samples)
        y_grid = poly_model.predict(x_grid)

        plt.scatter(x_samples, y_samples, color='blue', alpha=0.5, label='Data')
        plt.plot(x_grid, y_grid, 'r-', linewidth=2, label='Prediction')
        plt.title(title)
        plt.xlabel('X')
        plt.ylabel('y')
        plt.legend()
        # Fixed y-range so the three panels are directly comparable.
        plt.ylim(-2, 2)

    plt.tight_layout()
    plt.show()

# Render the three-panel underfitting / good fit / overfitting comparison.
demonstrate_overfitting()

## Checkpoint

Lab 2 complete! **Next:** Lab 3 - Neural Networks