# Lecture 11: Advanced Machine Learning Topics

## Exercise 1: Implement Stratified K-Fold Cross-Validation

**Task**: Write a function ```stratified_kfold(data, labels, k)``` that splits the data into k folds while maintaining the class distribution in each fold.

- Input:

    - data: List of data samples (e.g., [[1, 2], [3, 4], ...]).
    
    - labels: List of class labels (e.g., [0, 1, 1, 0, ...]).
    
    - k: Number of folds.

- Output: A list of k tuples. Each tuple contains (train_indices, test_indices).

**Example**:

```python
# Input
data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
labels = [0, 1, 0, 1, 0]
k = 2

# Output (one valid stratified split; the exact indices and their order depend on the implementation):
[([2, 3], [0, 1, 4]), ([0, 1, 4], [2, 3])]
```

In [None]:
from collections import defaultdict

def stratified_kfold(data, labels, k):
    """Split sample indices into ``k`` stratified cross-validation folds.

    Indices are grouped by class label and each class is dealt out
    round-robin across the folds, so the per-class counts of any two folds
    differ by at most one.

    Args:
        data: Sequence of samples; only its length is used.
        labels: Sequence of hashable class labels, one per sample.
        k: Number of folds (positive integer).

    Returns:
        A list of ``k`` ``(train_indices, test_indices)`` tuples. Fold ``i``
        is the test set of tuple ``i``; the other folds, concatenated in
        fold order, form its training set.

    Raises:
        ValueError: If ``k`` is less than 1, or if ``data`` and ``labels``
            have different lengths.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length, "
            f"got {len(data)} and {len(labels)}"
        )

    class_indices = defaultdict(list)
    for i, label in enumerate(labels):
        class_indices[label].append(i)

    folds = [[] for _ in range(k)]
    # Each class resumes dealing at the fold where the previous class
    # stopped. Restarting at fold 0 for every class would send all the
    # remainder samples to the first folds and skew the total fold sizes.
    start = 0
    for indices in class_indices.values():
        for offset, index in enumerate(indices):
            folds[(start + offset) % k].append(index)
        start = (start + len(indices)) % k

    result = []
    for i, test_indices in enumerate(folds):
        train_indices = [idx for j, fold in enumerate(folds) if j != i for idx in fold]
        result.append((train_indices, test_indices))
    return result

In [None]:
def test_stratified_kfold_balanced_classes():
    """Two equally sized classes: expect two folds with disjoint train/test indices."""
    samples = [[1, 2], [3, 4], [5, 6], [7, 8]]
    targets = [0, 1, 0, 1]
    splits = stratified_kfold(samples, targets, 2)
    assert len(splits) == 2
    # No index may appear on both sides of any split.
    assert all(set(train_idx).isdisjoint(test_idx) for train_idx, test_idx in splits)

def test_stratified_kfold_imbalanced_classes():
    """Uneven class sizes (3 vs 2) must still give complete, stratified folds."""
    data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    labels = [0, 1, 0, 1, 0]
    k = 2
    result = stratified_kfold(data, labels, k)
    assert len(result) == 2
    # Five samples over two folds split 2/3. Which fold receives the extra
    # sample is an implementation detail, so compare the sizes unordered.
    assert sorted(len(test) for _, test in result) == [2, 3]
    for train, test in result:
        # Train and test together cover every index exactly once.
        assert sorted(train + test) == list(range(len(labels)))
        # Stratification: each test fold contains both classes.
        assert {labels[i] for i in test} == {0, 1}

def test_stratified_kfold_single_class():
    """With one class of three samples, the first test fold should hold two of them."""
    samples = [[1, 2], [3, 4], [5, 6]]
    targets = [1] * 3
    splits = stratified_kfold(samples, targets, 2)
    assert len(splits) == 2
    first_test_fold = splits[0][1]
    assert len(first_test_fold) == 2

## Exercise 2: Implement SMOTE

**Task**: Write a function smote(data, labels, target_class) to generate synthetic samples for the target_class.

- Input:

    - data: List of data samples (e.g., [[1, 2], [3, 4], ...]).
    
    - labels: List of class labels (e.g., [0, 1, 1, 0, ...]).
    
    - target_class: The class for which synthetic samples will be generated.

- Output: Tuple of updated data and labels including synthetic samples.

**Example**:

```python
# Input
data = [[1, 2], [3, 4], [5, 6], [7, 8]]
labels = [0, 1, 0, 1]
target_class = 0

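# Output (one synthetic sample per non-target sample; each is the midpoint of two class-0 samples):
([[1, 2], [3, 4], [5, 6], [7, 8], [3.0, 4.0], [3.0, 4.0]], [0, 1, 0, 1, 0, 0])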
```

In [None]:
import random

def smote(data, labels, target_class):
    """Oversample ``target_class`` with synthetic midpoint samples.

    One synthetic sample is generated for every sample that does NOT belong
    to ``target_class``. Each synthetic sample is the feature-wise midpoint
    of two distinct, randomly chosen target-class samples. When the target
    class has only one sample, that sample is paired with itself, so the
    synthetic point is a copy of it with float coordinates.

    Args:
        data: List of samples, each a sequence of numeric features.
        labels: List of class labels, one per sample.
        target_class: Label of the class to oversample.

    Returns:
        A ``(data, labels)`` tuple of new lists holding the original samples
        followed by the synthetic ones. The inputs are not modified.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length, or if
            synthetic samples are needed but ``target_class`` does not occur
            in ``labels``.
    """
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length, "
            f"got {len(data)} and {len(labels)}"
        )

    target_indices = [i for i, label in enumerate(labels) if label == target_class]
    n_synthetic = len(labels) - len(target_indices)
    if n_synthetic and not target_indices:
        raise ValueError(f"target_class {target_class!r} does not occur in labels")

    synthetic_data = []
    for _ in range(n_synthetic):
        if len(target_indices) > 1:
            i, j = random.sample(target_indices, 2)
        else:
            # random.sample cannot draw two items from a single-element
            # population, so pair the only target sample with itself.
            i = j = target_indices[0]
        synthetic_data.append([(x + y) / 2 for x, y in zip(data[i], data[j])])

    synthetic_labels = [target_class] * len(synthetic_data)
    return list(data) + synthetic_data, list(labels) + synthetic_labels

In [None]:
def test_smote_balanced():
    """One sample per class: expect one extra class-0 sample after oversampling class 0."""
    samples = [[1, 2], [3, 4]]
    targets = [0, 1]
    new_samples, new_targets = smote(samples, targets, 0)
    assert len(new_samples) == 3
    assert new_targets.count(0) == 2

def test_smote_imbalanced():
    """Two class-0 samples and one class-1 sample: expect one synthetic class-0 sample."""
    samples = [[1, 2], [3, 4], [5, 6]]
    targets = [0, 1, 0]
    new_samples, new_targets = smote(samples, targets, 0)
    assert len(new_samples) == 4
    assert new_targets.count(0) == 3

def test_smote_multiple_classes():
    """Oversampling class 1 must grow both the data set and the class-1 label count."""
    samples = [[1, 2], [3, 4], [5, 6]]
    targets = [0, 1, 1]
    new_samples, new_targets = smote(samples, targets, 1)
    original_size = len(samples)
    original_ones = targets.count(1)
    assert len(new_samples) > original_size
    assert new_targets.count(1) > original_ones


## Exercise 3: Build a Data Pipeline

**Task**: Write a function build_pipeline() to create a pipeline that performs the following steps:

1. Imputes missing values.

2. Standardizes numerical data.

3. Encodes categorical data.

4. Trains a logistic regression model.

**Input and Output Format**:

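- Input: None required. The returned pipeline is later fitted on a Pandas DataFrame containing the `age`, `gender`, and `income` feature columns.

- Output: An unfitted pipeline; call `fit(X, y)` on it to obtain a trained pipeline ready for predictions.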

**Example**:

```python
# Input
data = pd.DataFrame({
    'age': [25, None, 30],
    'gender': ['M', 'F', 'M'],
    'income': [50000, 60000, None],
    'target': [1, 0, 1]
})

# Output: pipeline object; it is fitted after build_pipeline().fit(data[['age', 'gender', 'income']], data['target']).
```

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def build_pipeline(numeric_features=None, categorical_features=None):
    """Build an unfitted preprocessing + logistic-regression pipeline.

    Numeric columns are mean-imputed and then standardized. Categorical
    columns are imputed with their most frequent value and then one-hot
    encoded. The preprocessed features feed a ``LogisticRegression``
    classifier.

    Args:
        numeric_features: Names of the numeric columns. Defaults to
            ``['age', 'income']``.
        categorical_features: Names of the categorical columns. Defaults to
            ``['gender']``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocessor'`` and
        ``'classifier'``. Because columns are selected by name, fit and
        predict with a pandas DataFrame containing those columns.
    """
    # None sentinels avoid shared mutable default arguments.
    if numeric_features is None:
        numeric_features = ['age', 'income']
    if categorical_features is None:
        categorical_features = ['gender']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' keeps prediction from failing on a
        # category that was not present in the training data.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, list(numeric_features)),
            ('cat', categorical_transformer, list(categorical_features))
        ]
    )

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

In [None]:
def test_pipeline_fit():
    """Fitting on a small frame with missing values should complete without raising."""
    frame = pd.DataFrame({
        'age': [25, None, 30],
        'gender': ['M', 'F', 'M'],
        'income': [50000, 60000, None],
        'target': [1, 0, 1]
    })
    feature_columns = ['age', 'gender', 'income']
    model = build_pipeline()
    model.fit(frame[feature_columns], frame['target'])

def test_pipeline_transform():
    """After fitting, the preprocessing step should emit at least one output column."""
    model = build_pipeline()
    frame = pd.DataFrame({
        'age': [25, None, 30],
        'gender': ['M', 'F', 'M'],
        'income': [50000, 60000, None],
        'target': [1, 0, 1]
    })
    features = frame[['age', 'gender', 'income']]
    model.fit(features, frame['target'])
    preprocessor = model.named_steps['preprocessor']
    n_columns = preprocessor.transform(features).shape[1]
    assert n_columns > 0

def test_pipeline_prediction():
    """A fitted pipeline should return one prediction for a one-row input frame."""
    pipeline = build_pipeline()
    data = pd.DataFrame({
        'age': [25, None, 30],
        'gender': ['M', 'F', 'M'],
        'income': [50000, 60000, None],
        'target': [1, 0, 1]
    })
    features = ['age', 'gender', 'income']
    pipeline.fit(data[features], data['target'])
    # The pipeline's ColumnTransformer selects columns by name, which only
    # works on a DataFrame, so the new sample must carry the training
    # column names rather than being a bare list of lists.
    sample = pd.DataFrame([[30, 'F', 70000]], columns=features)
    predictions = pipeline.predict(sample)
    assert len(predictions) == 1