# Data Analysis Example Notebook

This notebook demonstrates a typical data analysis workflow that can be used to test conversion functionality.

## Setup and Data Import

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn-*' to
# 'seaborn-v0_8-*'. Use whichever whitegrid name this installation ships so
# the notebook also runs on older Matplotlib releases.
whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
whitegrid_style = next(
    (name for name in whitegrid_candidates if name in plt.style.available),
    whitegrid_candidates[0],  # neither is known: let plt.style.use raise as before
)
plt.style.use(whitegrid_style)
sns.set_palette('Set2')

In [None]:
# Build a reproducible synthetic regression dataset
np.random.seed(42)
num_samples = 1000

# Three numeric predictors plus one categorical label. The draws are listed in
# the order they consume the seeded global generator, so keep this order fixed.
synthetic_columns = dict(
    feature1=np.random.normal(0, 1, num_samples),
    feature2=np.random.normal(5, 2, num_samples),
    feature3=np.random.exponential(2, num_samples),
    categorical=np.random.choice(['A', 'B', 'C', 'D'], num_samples),
)
data = pd.DataFrame(synthetic_columns)

# Target = linear combination of the numeric predictors + Gaussian noise
target_signal = (2 * data['feature1']
                 - 0.5 * data['feature2']
                 + 0.7 * data['feature3'])
target_noise = np.random.normal(0, 2, num_samples)
data['target'] = target_signal + target_noise

# Preview the first rows
data.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics as reported by DataFrame.describe()
summary_stats = data.describe()
summary_stats

In [None]:
# Count missing entries per column
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# How many rows fall into each category
category_counts = data['categorical'].value_counts()
print("Categorical distribution:")
category_counts

In [None]:
# Histogram with KDE overlay for each numeric column, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (column, panel title) pairs, in row-major panel order
histogram_panels = [
    ('feature1', 'Feature 1 Distribution'),
    ('feature2', 'Feature 2 Distribution'),
    ('feature3', 'Feature 3 Distribution'),
    ('target', 'Target Distribution'),
]

for ax, (column, panel_title) in zip(axes.flat, histogram_panels):
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of the target within each category
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='categorical', y='target', data=data, ax=box_ax)
box_ax.set_title('Target Distribution by Category')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_data = data.select_dtypes(include=np.number)
correlation_matrix = numeric_data.corr()

heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# One scatter panel per numeric feature against the target, coloured by category
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for feature_number, ax in enumerate(axes, start=1):
    sns.scatterplot(
        x=f'feature{feature_number}',
        y='target',
        hue='categorical',
        data=data,
        ax=ax,
    )
    ax.set_title(f'Feature {feature_number} vs Target')

plt.tight_layout()
plt.show()

## Data Preprocessing

In [None]:
# One-hot encode the categorical column; drop_first=True omits the first level
columns_to_encode = ['categorical']
data_encoded = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)
data_encoded.head()

In [None]:
# Scale numerical features
from sklearn.preprocessing import StandardScaler

features = ['feature1', 'feature2', 'feature3']
scaler = StandardScaler()
data_encoded[features] = scaler.fit_transform(data_encoded[features])

data_encoded.head()

In [None]:
# Separate features and target
X = data_encoded.drop('target', axis=1)
y = data_encoded['target']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## Model Training and Evaluation

In [None]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

In [None]:
# Function to evaluate model performance
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        the object that is passed in, and the same object is then used to
        predict.
    X_train, X_test
        Feature data handed to ``fit`` / ``predict``.
    y_train, y_test
        True target values for the corresponding split.

    Returns
    -------
    dict
        ``training_time`` (seconds spent inside ``fit``), ``train_rmse``,
        ``test_rmse``, ``train_mae``, ``test_mae``, ``train_r2``, ``test_r2``
        and ``y_test_pred`` (the predictions for ``X_test``).
    """
    # perf_counter() is the clock meant for timing short intervals; time.time()
    # is wall-clock time, which can be adjusted and may have coarse resolution,
    # so fast fits could otherwise be reported as 0.0 s.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_started

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics (RMSE is the square root of the mean squared error)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    return {
        'training_time': training_time,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'y_test_pred': y_test_pred
    }

In [None]:
# Candidate regressors, keyed by the label used in printed output and plots
models = {}

# Linear family
models['Linear Regression'] = LinearRegression()
models['Ridge Regression'] = Ridge(alpha=1.0)
models['Lasso Regression'] = Lasso(alpha=0.1)

# Tree ensembles (seeded)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit and score every candidate, keeping the metrics and the test predictions
results, predictions = {}, {}

for model_name, estimator in models.items():
    print(f"Training {model_name}...")
    metrics = evaluate_model(estimator, X_train, X_test, y_train, y_test)

    results[model_name] = metrics
    predictions[model_name] = metrics['y_test_pred']

    print(f"  RMSE: {metrics['test_rmse']:.4f}")
    print(f"  R²: {metrics['test_r2']:.4f}")
    print()

In [None]:
# Create a results DataFrame: one row per model, one column per reported metric
# Maps each key of a per-model metrics dict to its column label in the table
metric_columns = {
    'training_time': 'Training Time (s)',
    'train_rmse': 'Train RMSE',
    'test_rmse': 'Test RMSE',
    'train_r2': 'Train R²',
    'test_r2': 'Test R²',
}

result_rows = []
for model_name, metrics in results.items():
    row = {'Model': model_name}
    for metric_key, column_label in metric_columns.items():
        row[column_label] = metrics[metric_key]
    result_rows.append(row)

# Passing the columns explicitly fixes their order
results_df = pd.DataFrame(result_rows, columns=['Model', *metric_columns.values()])

results_df.sort_values('Test RMSE')

In [None]:
# Side-by-side bar charts comparing the models on test RMSE and test R²
plt.figure(figsize=(12, 6))

for panel_number, metric in enumerate(('Test RMSE', 'Test R²'), start=1):
    plt.subplot(1, 2, panel_number)
    sns.barplot(x='Model', y=metric, data=results_df)
    plt.title(f'{metric} by Model')
    # Slant the model names so they do not overlap
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Pick the model with the lowest test RMSE and plot its predictions
ranked_models = results_df.sort_values('Test RMSE')
best_model = ranked_models.iloc[0]['Model']
print(f"Best model: {best_model}")

pred_fig, pred_ax = plt.subplots(figsize=(10, 6))
pred_ax.scatter(y_test, predictions[best_model], alpha=0.5)

# Dashed red diagonal marks where predicted == actual
actual_range = [y_test.min(), y_test.max()]
pred_ax.plot(actual_range, actual_range, 'r--')

pred_ax.set_xlabel('Actual')
pred_ax.set_ylabel('Predicted')
pred_ax.set_title(f'Actual vs Predicted Values ({best_model})')
plt.show()

## Feature Importance

In [None]:
# Collect one importance vector per model: feature_importances_ when the model
# has it, otherwise the absolute value of coef_
feature_importance = {}

for model_label, estimator in models.items():
    if hasattr(estimator, 'feature_importances_'):
        feature_importance[model_label] = estimator.feature_importances_
        continue
    if hasattr(estimator, 'coef_'):
        feature_importance[model_label] = np.abs(estimator.coef_)
        continue
    print(f"{model_label} doesn't provide feature importance")

In [None]:
# One horizontal bar chart per model, stacked vertically
feature_names = X.columns
panel_count = len(feature_importance)

plt.figure(figsize=(12, 8))
for panel_number, (name, importance) in enumerate(feature_importance.items(), start=1):
    plt.subplot(panel_count, 1, panel_number)

    # Ascending order, so the largest importance is drawn at the top
    ascending_order = np.argsort(importance)
    bar_positions = np.arange(ascending_order.shape[0])

    plt.barh(bar_positions, importance[ascending_order], align='center')
    plt.yticks(bar_positions, feature_names[ascending_order])
    plt.title(f'Feature Importance - {name}')

plt.tight_layout()
plt.show()

## Conclusion

In this notebook, we performed a typical data analysis workflow:

1. Generated and explored a synthetic dataset
2. Performed data preprocessing (encoding, scaling)
3. Trained several regression models
4. Evaluated and compared model performance
5. Analyzed feature importance

This notebook serves as a good example for testing conversion functionality as it contains:
- Markdown cells with different levels of headings
- Code cells of varying computational complexity
- Intermediate expressions
- Visualizations (which generate image outputs)
- Tabular data outputs
- A mix of print statements and display outputs