# Environment Verification Notebook

## So You Think You Can Data? - Model Validation Course

This notebook verifies that your environment is correctly set up for the course. Run the cells in order, from top to bottom, to make sure everything is working properly; later cells rely on the imports made in earlier ones.

## 1. Import Tests

First, let's verify that all required packages can be imported without errors.

In [None]:
# Core data stack: NumPy for arrays, pandas for tabular data
print("Testing core data libraries...")
import numpy as np
import pandas as pd

# Report both versions with a single print call, one version per line
print(
    f"✅ NumPy version: {np.__version__}",
    f"✅ Pandas version: {pd.__version__}",
    sep="\n",
)

In [None]:
# Visualization libraries
print("Testing visualization libraries...")
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

# __version__ is read from the top-level packages: the matplotlib.pyplot and
# plotly.express submodules do not define it, so asking them for it raises
# AttributeError and aborts the cell.
print(f"✅ Matplotlib version: {matplotlib.__version__}")
print(f"✅ Seaborn version: {sns.__version__}")
print(f"✅ Plotly version: {plotly.__version__}")

# Set default styles for later visualizations.
# Newer Matplotlib releases ship the seaborn styles as 'seaborn-v0_8-*', older
# ones as 'seaborn-*'; use whichever name this installation actually provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # A missing style sheet is cosmetic only, so warn instead of failing
    print("⚠️ No seaborn whitegrid style found; using Matplotlib's default style.")
sns.set_context('notebook')

In [None]:
# Machine learning libraries (scikit-learn), imports grouped by submodule
print("Testing machine learning libraries...")
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    mean_squared_error,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GroupKFold,
    KFold,
    StratifiedKFold,
    TimeSeriesSplit,
    cross_val_score,
    train_test_split,
)

# Reaching this line means every import above succeeded
print(f"✅ Scikit-learn version: {sklearn.__version__}")

In [None]:
# Statistical analysis libraries: SciPy and statsmodels
print("Testing statistical libraries...")
import scipy
import statsmodels.api as sm

# Report both versions with a single print call, one version per line
print(
    f"✅ SciPy version: {scipy.__version__}",
    f"✅ Statsmodels version: {sm.__version__}",
    sep="\n",
)

In [None]:
# Network analysis libraries
# The banner is printed before the import so that, if the import raises,
# the output already shows which group of libraries was being tested.
print("Testing network analysis libraries...")
import networkx as nx
# Reaching this line means the import succeeded; report the installed version.
print(f"✅ NetworkX version: {nx.__version__}")

In [None]:
# Time series libraries (Prophet is treated as optional)
print("Testing time series libraries...")
try:
    from prophet import Prophet
except ImportError:
    # A missing Prophet only produces a warning; the cell itself does not fail
    print("⚠️ Prophet is not installed correctly. Some time series examples may not work.")
    print("   This is not critical for most of the course.")
else:
    print("✅ Prophet is installed")

## 2. Basic Functionality Tests

Now let's check if we can perform some basic operations with these libraries.

In [None]:
print("Creating and manipulating a pandas DataFrame...")
# Build a five-row sample frame: two random numeric columns and one label column
sample_labels = ['foo', 'bar', 'baz', 'qux', 'quux']
col_a = np.random.randn(5)
col_b = np.random.randn(5)
df = pd.DataFrame({'A': col_a, 'B': col_b, 'C': sample_labels})

# Rich display of the frame itself
print("Sample DataFrame:")
display(df)

# describe() summarises the numeric columns
print("\nDataFrame summary statistics:")
summary_stats = df.describe()
display(summary_stats)

print("✅ Pandas operations working correctly")

In [None]:
print("Testing basic plotting functionality...")

# Matplotlib: sine and cosine curves drawn on one explicit Axes
x = np.linspace(0, 10, 100)
y = np.sin(x)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, y, label='sin(x)')
ax.plot(x, np.cos(x), label='cos(x)')
ax.set_title('Basic Matplotlib Plot')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend()
plt.show()

# Seaborn: histogram of 1000 normal draws with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(np.random.normal(size=1000), kde=True, ax=ax)
ax.set_title('Seaborn Histogram with KDE')
plt.show()

print("✅ Plotting functionality working correctly")

In [None]:
print("Testing basic machine learning functionality...")

from sklearn.datasets import make_classification

# Synthetic classification dataset: 1000 samples, 10 features, fixed seed
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

# fit() returns the estimator itself, so construction and training are chained
model = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)

# Score the held-out predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# 5-fold cross-validation over the full dataset
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")

print("✅ Machine learning functionality working correctly")

## 3. Model Validation Specific Tests

Let's test some of the specific validation techniques we'll use in the course.

In [None]:
print("Testing k-fold cross-validation...")

# Imported here as well so this cell does not depend on an import buried in
# an earlier cell's body.
from sklearn.datasets import make_classification

# Create data
X, y = make_classification(n_samples=100, n_features=5, random_state=42)

# Create KFold validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Collect the (train, test) index arrays of every fold and report their sizes
fold_indices = []
for i, (train_idx, test_idx) in enumerate(kf.split(X)):
    fold_indices.append((train_idx, test_idx))
    print(f"Fold {i+1}: Train size = {len(train_idx)}, Test size = {len(test_idx)}")

# Plot the train/test indices for the first 3 folds, one row per fold.
n_plotted = min(3, len(fold_indices))
fig, axes = plt.subplots(n_plotted, 1, figsize=(12, 6), squeeze=False)
for i, ax in enumerate(axes[:, 0]):
    train_idx, test_idx = fold_indices[i]

    # Train and test samples are drawn as two separate, labelled artists so
    # the legend gets one correctly coloured entry for each. A single scatter
    # with a per-point colour list yields only one legend handle, which
    # silently drops the 'Test' entry.
    ax.scatter(train_idx, np.full(len(train_idx), i + 1),
               c='blue', marker='|', s=100, label='Train')
    ax.scatter(test_idx, np.full(len(test_idx), i + 1),
               c='red', marker='|', s=100, label='Test')
    ax.set_title(f'Fold {i+1}')
    ax.set_yticks([])
    if i == n_plotted - 1:
        # Only the bottom panel carries the shared x-axis label
        ax.set_xlabel('Sample Index')
    ax.legend(loc='upper right')

fig.tight_layout()
plt.show()

print("✅ K-fold cross-validation is working correctly")

In [None]:
print("Testing time series cross-validation...")

# Create a synthetic time series dataset: a daily random walk over 2020-2021.
# A seeded generator keeps the plotted series identical across reruns,
# matching the fixed random_state=42 used by the other tests.
rng = np.random.default_rng(42)
dates = pd.date_range(start='2020-01-01', end='2021-12-31', freq='D')
values = np.cumsum(rng.standard_normal(len(dates)))
ts_data = pd.DataFrame({'date': dates, 'value': values})

# Prepare the data
X = np.arange(len(ts_data)).reshape(-1, 1)  # Simple index feature
y = ts_data['value'].values

# Set up time series cross-validation. n_splits is named once so the splitter
# and the y-axis tick labels below cannot drift apart.
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Visualize the time series splits: series on top, fold layout underneath
fig, (ax_series, ax_folds) = plt.subplots(2, 1, figsize=(12, 8))

# Plot the dataset
ax_series.plot(ts_data['date'], ts_data['value'])
ax_series.set_title('Time Series Dataset')
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Value')

# Plot the folds
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    train_start, train_end = train_idx.min(), train_idx.max()
    test_start, test_end = test_idx.min(), test_idx.max()

    # Only the first fold's lines carry legend labels; '_nolegend_' keeps the
    # remaining folds out of the legend so each colour is listed exactly once.
    ax_folds.plot([train_start, train_end], [i, i], 'b-', linewidth=2,
                  label='Training Set' if i == 0 else '_nolegend_')
    ax_folds.plot([test_start, test_end], [i, i], 'r-', linewidth=2,
                  label='Test Set' if i == 0 else '_nolegend_')

    print(f"Fold {i+1}: Train = [{train_start}:{train_end}], Test = [{test_start}:{test_end}]")

ax_folds.set_yticks(range(n_splits))
ax_folds.set_yticklabels([f'Fold {i+1}' for i in range(n_splits)])
ax_folds.set_xlabel('Sample Index')
ax_folds.set_title('Time Series Cross-Validation')
ax_folds.legend()
fig.tight_layout()
plt.show()

print("✅ Time series cross-validation is working correctly")

In [None]:
print("Testing group-based cross-validation...")

# Synthetic grouped dataset: 20 groups of 5 samples each
n_groups = 20
samples_per_group = 5
n_samples = n_groups * samples_per_group

# Each group id is repeated once per sample in that group
group_ids = np.repeat(np.arange(n_groups), samples_per_group)

# Random features and a random 0/1 target
X = np.random.randn(n_samples, 3)
y = np.random.randint(0, 2, n_samples)

# Group-aware splitter
gkf = GroupKFold(n_splits=5)

# Gather per-fold statistics, printing them as each fold is produced
fold_info = []
for i, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=group_ids)):
    train_groups = set(group_ids[train_idx])
    test_groups = set(group_ids[test_idx])
    # Number of groups present on both sides of the split
    n_shared = len(train_groups & test_groups)

    fold_record = {
        'fold': i+1,
        'train_samples': len(train_idx),
        'test_samples': len(test_idx),
        'train_groups': len(train_groups),
        'test_groups': len(test_groups),
        'overlap': n_shared,
    }
    fold_info.append(fold_record)

    print(f"Fold {i+1}: ")
    print(f"  Train: {len(train_idx)} samples from {len(train_groups)} groups")
    print(f"  Test: {len(test_idx)} samples from {len(test_groups)} groups")
    print(f"  Group overlap: {len(train_groups.intersection(test_groups))}")

# Tabular summary of all folds
fold_df = pd.DataFrame(fold_info)
display(fold_df)

# The check passes only when no fold shares a group between train and test
has_leak = fold_df['overlap'].sum() != 0
if has_leak:
    print("❌ Group-based cross-validation has issues")
    print("   Some groups appear in both training and testing sets")
else:
    print("✅ Group-based cross-validation is working correctly")
    print("   No group appears in both training and testing sets")

## 4. Environment Summary

If you've run all cells without errors, your environment is set up correctly for the course!

In [None]:
import platform
import sys

# Imported locally so this cell can report Matplotlib's version: the pyplot
# alias (plt) does not define __version__, only the top-level package does.
import matplotlib

# Interpreter and operating system details
print("Environment Summary")
print("===================\n")
print(f"Python version: {platform.python_version()}")
print(f"Operating System: {platform.system()} {platform.release()}")

# Versions of the packages imported by the earlier cells of this notebook
print("\nKey Packages:")
print(f"NumPy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Scikit-learn: {sklearn.__version__}")
print(f"Matplotlib: {matplotlib.__version__}")
print(f"Seaborn: {sns.__version__}")
print(f"SciPy: {scipy.__version__}")
print(f"Statsmodels: {sm.__version__}")
print(f"NetworkX: {nx.__version__}")

print("\n🎉 Congratulations! Your environment is set up correctly for 'So You Think You Can Data?' course!")
print("You're ready to begin the model validation journey.")

## Next Steps

Now that your environment is verified, you're ready to move on to the course content!

1. Check out the introductory notebook in the `/notebooks` folder
2. Watch the course videos in sequence
3. Complete the exercises as you go

If you encountered any issues, please check the course Q&A section or post in the discussion forum for help.