In [None]:
# SaaS Pricing Model - Machine Learning Models
# Student: Sanskriti Avinash Dabhade (1225131)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error,
                            accuracy_score, precision_score, recall_score, f1_score,
                            confusion_matrix, classification_report, silhouette_score)

# Global look-and-feel: seaborn grid theme, a wide default figure size, and
# no cap on the number of DataFrame columns shown when printing.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.set_option('display.max_columns', None)

# Opening banner for the run log (rule, title, rule)
print("\n".join([
    "=" * 80,
    "SaaS PRICING MODEL SELECTION - MACHINE LEARNING MODELS",
    "=" * 80,
]))

# ============================================================================
# SECTION 1: LOAD AND PREPARE DATA
# ============================================================================

print("\n1. LOADING CLEANED DATASETS...")

# All three inputs are CSV files that live in the same folder
cleaned_dir = Path('Cleaned Data')
ravenstack = pd.read_csv(cleaned_dir / 'ravenstack_cleaned.csv')
cac_ltv = pd.read_csv(cleaned_dir / 'cac_ltv_cleaned.csv')
saas_businesses = pd.read_csv(cleaned_dir / 'saas_businesses_cleaned.csv')

# Report (rows, columns) of each table as a quick sanity check
for dataset_label, dataset in (
    ("Ravenstack", ravenstack),
    ("CAC-LTV", cac_ltv),
    ("SaaS Businesses", saas_businesses),
):
    print(f"✓ {dataset_label}: {dataset.shape}")

# Create the results directory; exist_ok makes re-runs harmless
results_dir = Path('Model Results')
results_dir.mkdir(exist_ok=True)

# ============================================================================
# SECTION 2: FEATURE ENGINEERING
# ============================================================================

print("\n" + "=" * 80)
print("2. FEATURE ENGINEERING")
print("=" * 80)

# 2.1 Ravenstack - Prepare features for modeling
print("\n2.1 Preparing Ravenstack features...")

# Numeric columns are listed BEFORE the '*_encoded' columns are created below,
# so this list holds only the numeric columns present at load time.
numerical_features = ravenstack.select_dtypes(include=[np.number]).columns.tolist()
print(f"  - Numerical features: {len(numerical_features)}")

# Text (object-dtype) columns are the candidates for label encoding
categorical_features = ravenstack.select_dtypes(include=['object']).columns.tolist()
print(f"  - Categorical features: {len(categorical_features)}")

# Label-encode every categorical column with fewer than 50 distinct values.
# Each result is stored in a new '<col>_encoded' column and the fitted encoder
# is kept so the integer codes can be mapped back to the original labels.
label_encoders = {}
for col in categorical_features:
    if ravenstack[col].nunique() < 50:  # Only encode if not too many unique values
        le = LabelEncoder()
        # astype(str) turns missing values into the literal string 'nan',
        # which is then encoded as a category of its own
        ravenstack[f'{col}_encoded'] = le.fit_transform(ravenstack[col].astype(str))
        label_encoders[col] = le

# 2.2 CAC-LTV - Feature engineering
print("\n2.2 Engineering CAC-LTV features...")

# Locate the CAC and LTV columns by case-insensitive substring match;
# the first match of each kind is the one used.
cac_cols = [col for col in cac_ltv.columns if 'cac' in col.lower()]
ltv_cols = [col for col in cac_ltv.columns if 'ltv' in col.lower()]

if cac_cols and ltv_cols:
    # A zero CAC makes the ratio undefined. Mask those rows as NaN rather than
    # adding a constant to the denominator, which would bias EVERY ratio and
    # shift rows across the >= 3 health threshold.
    safe_cac = cac_ltv[cac_cols[0]].replace(0, np.nan)
    cac_ltv['ltv_cac_ratio'] = cac_ltv[ltv_cols[0]] / safe_cac
    # NaN >= 3 evaluates to False, so rows with an undefined ratio are
    # flagged as not healthy (0)
    cac_ltv['is_healthy'] = (cac_ltv['ltv_cac_ratio'] >= 3).astype(int)
    print("  ✓ Created LTV/CAC ratio and health indicator")
else:
    # Make the skip visible instead of silently producing no features
    print("  ⚠ No CAC/LTV columns found - skipped LTV/CAC ratio")

# ============================================================================
# SECTION 3: MODEL 1 - LINEAR REGRESSION (Price Optimization)
# ============================================================================

print("\n" + "=" * 80)
print("3. MODEL 1: LINEAR REGRESSION - PRICE OPTIMIZATION")
print("=" * 80)

# Identify target and features for pricing model
print("\n3.1 Preparing data for Linear Regression...")

# Candidate targets are columns whose name mentions price or revenue.
# Only numeric columns qualify: searching every column could select a text
# column first, which is not a valid regression target and would make the
# fit fail. Later sections use `price_cols` to decide whether the regression
# variables exist.
price_cols = [col for col in numerical_features
              if 'price' in col.lower() or 'revenue' in col.lower()]
if len(price_cols) > 0:
    target_col = price_cols[0]
    print(f"  Target variable: {target_col}")

    # Predictors: up to the first 10 numeric columns other than the target
    feature_cols = [col for col in numerical_features if col != target_col][:10]

    # Keep only rows that are complete in both target and predictors
    lr_data = ravenstack[[target_col] + feature_cols].dropna()

    print(f"  Features: {len(feature_cols)}")
    print(f"  Samples: {len(lr_data)}")

    # 80/20 train/test split with a fixed seed for reproducibility
    X = lr_data[feature_cols]
    y = lr_data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize using statistics learned from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("\n3.2 Training Linear Regression model...")
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)

    # Predictions on both splits so train and test error can be compared
    y_pred_train = lr_model.predict(X_train_scaled)
    y_pred_test = lr_model.predict(X_test_scaled)

    # Evaluation
    print("\n3.3 Linear Regression Results:")
    print(f"  Train R²: {r2_score(y_train, y_pred_train):.4f}")
    print(f"  Test R²: {r2_score(y_test, y_pred_test):.4f}")
    print(f"  Train RMSE: {np.sqrt(mean_squared_error(y_train, y_pred_train)):.4f}")
    print(f"  Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.4f}")
    print(f"  Test MAE: {mean_absolute_error(y_test, y_pred_test):.4f}")

    # 5-fold cross-validated R² on the training split
    cv_scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='r2')
    print(f"  Cross-validation R² (mean ± std): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Rank predictors by absolute coefficient; inputs are standardized, so
    # coefficient magnitudes are on a common scale
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': lr_model.coef_
    }).sort_values('Coefficient', key=abs, ascending=False)

    print("\n3.4 Top 5 Most Important Features:")
    print(feature_importance.head())

    # Visualization: 2x2 diagnostic grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Plot 1: Actual vs Predicted, with the y = x reference line
    ax1 = axes[0, 0]
    ax1.scatter(y_test, y_pred_test, alpha=0.5, color='blue')
    ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    ax1.set_xlabel('Actual Price')
    ax1.set_ylabel('Predicted Price')
    ax1.set_title('Linear Regression: Actual vs Predicted', fontweight='bold')
    ax1.grid(alpha=0.3)

    # Plot 2: Residuals (actual - predicted) against the prediction
    ax2 = axes[0, 1]
    residuals = y_test - y_pred_test
    ax2.scatter(y_pred_test, residuals, alpha=0.5, color='green')
    ax2.axhline(y=0, color='r', linestyle='--', lw=2)
    ax2.set_xlabel('Predicted Price')
    ax2.set_ylabel('Residuals')
    ax2.set_title('Residual Plot', fontweight='bold')
    ax2.grid(alpha=0.3)

    # Plot 3: Feature importance as absolute coefficient size
    ax3 = axes[1, 0]
    top_features = feature_importance.head(10)
    ax3.barh(range(len(top_features)), top_features['Coefficient'].abs())
    ax3.set_yticks(range(len(top_features)))
    ax3.set_yticklabels(top_features['Feature'])
    ax3.set_xlabel('Absolute Coefficient Value')
    ax3.set_title('Feature Importance (Top 10)', fontweight='bold')
    ax3.grid(alpha=0.3)

    # Plot 4: Distribution of residuals around zero
    ax4 = axes[1, 1]
    ax4.hist(residuals, bins=50, edgecolor='black', color='orange')
    ax4.set_xlabel('Residual')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Distribution of Residuals', fontweight='bold')
    ax4.axvline(x=0, color='r', linestyle='--', lw=2)
    ax4.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('Model Results/linear_regression_results.png', dpi=300, bbox_inches='tight')
    print("\n✓ Saved: Model Results/linear_regression_results.png")
    plt.show()

else:
    print("  ⚠ No price column found in Ravenstack dataset")

# ============================================================================
# SECTION 4: MODEL 2 - POLYNOMIAL REGRESSION
# ============================================================================

print("\n" + "=" * 80)
print("4. MODEL 2: POLYNOMIAL REGRESSION")
print("=" * 80)

# Reuses the scaled train/test split and the linear-model predictions from the
# linear regression section, which exist only when a price target was found.
if price_cols:
    print("\n4.1 Training Polynomial Regression (degree=2)...")

    # Degree-2 expansion of the scaled predictors; the bias column is left out
    # because LinearRegression fits its own intercept
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)

    # Ordinary least squares on the expanded feature set
    poly_model = LinearRegression().fit(X_train_poly, y_train)

    y_pred_train_poly = poly_model.predict(X_train_poly)
    y_pred_test_poly = poly_model.predict(X_test_poly)

    # Compute each metric once, then report
    poly_r2_train = r2_score(y_train, y_pred_train_poly)
    poly_r2_test = r2_score(y_test, y_pred_test_poly)
    poly_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_poly))
    poly_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_poly))
    poly_mae_test = mean_absolute_error(y_test, y_pred_test_poly)
    linear_r2_test = r2_score(y_test, y_pred_test)

    print("\n4.2 Polynomial Regression Results:")
    print(f"  Train R²: {poly_r2_train:.4f}")
    print(f"  Test R²: {poly_r2_test:.4f}")
    print(f"  Train RMSE: {poly_rmse_train:.4f}")
    print(f"  Test RMSE: {poly_rmse_test:.4f}")
    print(f"  Test MAE: {poly_mae_test:.4f}")

    # Positive improvement means the polynomial model has the higher test R²
    print("\n4.3 Comparison: Linear vs Polynomial Regression")
    print(f"  Linear R²: {linear_r2_test:.4f}")
    print(f"  Polynomial R²: {poly_r2_test:.4f}")
    print(f"  Improvement: {poly_r2_test - linear_r2_test:.4f}")

# ============================================================================
# SECTION 5: MODEL 3 - DECISION TREE (Churn Classification)
# ============================================================================

print("\n" + "=" * 80)
print("5. MODEL 3: DECISION TREE - CHURN PREDICTION")
print("=" * 80)

# Find churn-related columns; the first match is used as the target
churn_cols = [col for col in ravenstack.columns if 'churn' in col.lower()]

if len(churn_cols) > 0:
    churn_col = churn_cols[0]
    print(f"\n5.1 Preparing data for Decision Tree (target: {churn_col})...")

    # Predictors: up to 15 numeric columns. EVERY churn-named column is
    # excluded, not just the target - a sibling churn column would leak the
    # label into the features and inflate the scores.
    dt_feature_cols = [col for col in numerical_features if col not in churn_cols][:15]
    dt_data = ravenstack[[churn_col] + dt_feature_cols].dropna()

    if not dt_feature_cols:
        print("  ⚠ No numerical features available for Decision Tree")
    elif dt_data[churn_col].nunique() == 2:
        # Binary target confirmed
        X_dt = dt_data[dt_feature_cols]
        y_dt = dt_data[churn_col]

        # Stratified 80/20 split so both splits keep the class balance
        X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
            X_dt, y_dt, test_size=0.2, random_state=42, stratify=y_dt
        )

        # Standardize using statistics learned from the training split only
        scaler_dt = StandardScaler()
        X_train_dt_scaled = scaler_dt.fit_transform(X_train_dt)
        X_test_dt_scaled = scaler_dt.transform(X_test_dt)

        print(f"  Features: {len(dt_feature_cols)}")
        print(f"  Samples: {len(dt_data)}")
        print(f"  Class distribution: {y_dt.value_counts().to_dict()}")

        print("\n5.2 Training Decision Tree Classifier...")
        # Depth and minimum-split limits keep the tree small
        dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=20, random_state=42)
        dt_model.fit(X_train_dt_scaled, y_train_dt)

        # Predictions on both splits
        y_pred_train_dt = dt_model.predict(X_train_dt_scaled)
        y_pred_test_dt = dt_model.predict(X_test_dt_scaled)

        # Evaluation (precision/recall/F1 are class-frequency weighted)
        print("\n5.3 Decision Tree Results:")
        print(f"  Train Accuracy: {accuracy_score(y_train_dt, y_pred_train_dt):.4f}")
        print(f"  Test Accuracy: {accuracy_score(y_test_dt, y_pred_test_dt):.4f}")
        print(f"  Test Precision: {precision_score(y_test_dt, y_pred_test_dt, average='weighted', zero_division=0):.4f}")
        print(f"  Test Recall: {recall_score(y_test_dt, y_pred_test_dt, average='weighted', zero_division=0):.4f}")
        print(f"  Test F1-Score: {f1_score(y_test_dt, y_pred_test_dt, average='weighted', zero_division=0):.4f}")

        # 5-fold cross-validated accuracy on the training split
        cv_scores_dt = cross_val_score(dt_model, X_train_dt_scaled, y_train_dt, cv=5, scoring='accuracy')
        print(f"  Cross-validation Accuracy (mean ± std): {cv_scores_dt.mean():.4f} ± {cv_scores_dt.std():.4f}")

        print("\n5.4 Classification Report:")
        print(classification_report(y_test_dt, y_pred_test_dt, zero_division=0))

        # Importances from the fitted tree, highest first
        dt_feature_importance = pd.DataFrame({
            'Feature': dt_feature_cols,
            'Importance': dt_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        print("\n5.5 Top 5 Most Important Features:")
        print(dt_feature_importance.head())

        # Visualizations: 2x2 grid
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Plot 1: Confusion Matrix
        ax1 = axes[0, 0]
        cm = confusion_matrix(y_test_dt, y_pred_test_dt)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1)
        ax1.set_xlabel('Predicted')
        ax1.set_ylabel('Actual')
        ax1.set_title('Confusion Matrix', fontweight='bold')

        # Plot 2: Feature Importance
        ax2 = axes[0, 1]
        top_dt_features = dt_feature_importance.head(10)
        ax2.barh(range(len(top_dt_features)), top_dt_features['Importance'])
        ax2.set_yticks(range(len(top_dt_features)))
        ax2.set_yticklabels(top_dt_features['Feature'])
        ax2.set_xlabel('Importance')
        ax2.set_title('Feature Importance (Top 10)', fontweight='bold')
        ax2.grid(alpha=0.3)

        # Plot 3: Decision Tree Visualization (top levels only).
        # feature_names must list ALL training features in training order;
        # a truncated list can mislabel nodes or raise IndexError when a
        # split uses a feature past the cut-off.
        ax3 = axes[1, 0]
        plot_tree(dt_model, max_depth=2, feature_names=dt_feature_cols,
                  filled=True, ax=ax3, fontsize=8)
        ax3.set_title('Decision Tree Structure (depth=2)', fontweight='bold')

        # Plot 4: Class distribution, actual vs predicted.
        # The two count series are aligned on class label; a class that is
        # never predicted would otherwise appear as NaN, so fill with 0.
        ax4 = axes[1, 1]
        class_counts = pd.DataFrame({
            'Actual': y_test_dt.value_counts(),
            'Predicted': pd.Series(y_pred_test_dt).value_counts()
        }).fillna(0)
        class_counts.plot(kind='bar', ax=ax4, color=['steelblue', 'coral'])
        ax4.set_xlabel('Class')
        ax4.set_ylabel('Count')
        ax4.set_title('Actual vs Predicted Class Distribution', fontweight='bold')
        ax4.tick_params(axis='x', rotation=0)
        ax4.legend()
        ax4.grid(alpha=0.3)

        plt.tight_layout()
        plt.savefig('Model Results/decision_tree_results.png', dpi=300, bbox_inches='tight')
        print("\n✓ Saved: Model Results/decision_tree_results.png")
        plt.show()
    else:
        print(f"  ⚠ Churn column has {dt_data[churn_col].nunique()} unique values (expected 2 for binary classification)")
else:
    print("  ⚠ No churn column found in Ravenstack dataset")

# ============================================================================
# SECTION 6: MODEL 4 - K-MEANS CLUSTERING (Customer Segmentation)
# ============================================================================

print("\n" + "=" * 80)
print("6. MODEL 4: K-MEANS CLUSTERING - CUSTOMER SEGMENTATION")
print("=" * 80)

print("\n6.1 Preparing data for K-Means clustering...")

# Cluster on the first ten numeric columns of ravenstack as it stands at this
# point in the script, using only rows with no missing values in them
numeric_columns = ravenstack.select_dtypes(include=[np.number]).columns
cluster_features = list(numeric_columns[:10])
cluster_data = ravenstack[cluster_features].dropna()

print(f"  Features: {len(cluster_features)}")
print(f"  Samples: {len(cluster_data)}")

# Standardize the clustering inputs
scaler_kmeans = StandardScaler()
X_cluster = scaler_kmeans.fit_transform(cluster_data)

# Sweep k = 2..10, recording inertia (elbow method) and silhouette score
print("\n6.2 Finding optimal number of clusters...")
K_range = range(2, 11)
inertias = []
silhouette_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_cluster, kmeans.labels_))

# The final k is fixed by hand; the sweep above only feeds the plots
optimal_k = 4  # Can be adjusted based on elbow plot
print(f"\n6.3 Training K-Means with k={optimal_k}...")

kmeans_model = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans_model.fit_predict(X_cluster)

# Attach the assignments to the (unscaled) rows and count members per cluster
cluster_data['Cluster'] = clusters
cluster_counts = pd.Series(clusters).value_counts().sort_index()

# Evaluation
print("\n6.4 K-Means Clustering Results:")
print(f"  Optimal K: {optimal_k}")
print(f"  Inertia: {kmeans_model.inertia_:.2f}")
print(f"  Silhouette Score: {silhouette_score(X_cluster, clusters):.4f}")

print("\n6.5 Cluster Distribution:")
print(cluster_counts)

# Per-cluster mean of every clustering feature, in original units
print("\n6.6 Cluster Characteristics:")
cluster_summary = cluster_data.groupby('Cluster')[cluster_features].mean()
print(cluster_summary)

# Visualizations: 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax1, ax2 = axes[0]
ax3, ax4 = axes[1]

# Plots 1 and 2: elbow curve and silhouette curve share the same layout,
# each with a dashed marker at the chosen k
diagnostic_panels = (
    (ax1, inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (ax2, silhouette_scores, 'go-', 'Silhouette Score', 'Silhouette Score Analysis'),
)
for panel, series, line_style, y_label, panel_title in diagnostic_panels:
    panel.plot(K_range, series, line_style)
    panel.set_xlabel('Number of Clusters (K)')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontweight='bold')
    panel.grid(alpha=0.3)
    panel.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal K={optimal_k}')
    panel.legend()

# Plot 3: Cluster visualization (first 2 PCA components)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster)
# Project the centroids into the same 2-D space once and reuse for both axes
centroids_pca = pca.transform(kmeans_model.cluster_centers_)
pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]

scatter = ax3.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
ax3.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
            c='red', marker='X', s=200, edgecolors='black', label='Centroids')
ax3.set_xlabel(f'PC1 ({pc1_share:.2%} variance)')
ax3.set_ylabel(f'PC2 ({pc2_share:.2%} variance)')
ax3.set_title('Customer Segments (PCA Visualization)', fontweight='bold')
ax3.legend()
plt.colorbar(scatter, ax=ax3, label='Cluster')

# Plot 4: Cluster sizes
ax4.bar(cluster_counts.index, cluster_counts.values, color='teal')
ax4.set_xlabel('Cluster')
ax4.set_ylabel('Number of Customers')
ax4.set_title('Cluster Size Distribution', fontweight='bold')
ax4.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('Model Results/kmeans_clustering_results.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: Model Results/kmeans_clustering_results.png")
plt.show()

# ============================================================================
# SECTION 7: INDUSTRY SEGMENTATION (SaaS Businesses Dataset)
# ============================================================================

print("\n" + "=" * 80)
print("7. INDUSTRY SEGMENTATION - SAAS BUSINESSES")
print("=" * 80)

print("\n7.1 Clustering SaaS companies by characteristics...")

# Prepare numerical features from SaaS businesses
business_numerical = saas_businesses.select_dtypes(include=[np.number]).columns.tolist()

if len(business_numerical) > 0:
    # Only rows with no missing values in any numeric column
    business_cluster_data = saas_businesses[business_numerical].dropna()

    # Allow roughly one cluster per ten companies, capped at the k used for
    # customer segmentation
    n_business_clusters = min(optimal_k, len(business_cluster_data) // 10)

    # KMeans needs at least 1 cluster and silhouette_score at least 2 labels,
    # so with fewer than 20 usable rows the section is skipped rather than
    # crashing on 0 or 1 clusters.
    if n_business_clusters >= 2:
        # Dedicated scaler: re-fitting the customer-segmentation scaler here
        # would overwrite its fitted statistics
        scaler_business = StandardScaler()
        X_business = scaler_business.fit_transform(business_cluster_data)

        # Cluster
        business_kmeans = KMeans(n_clusters=n_business_clusters,
                                 random_state=42, n_init=10)
        business_clusters = business_kmeans.fit_predict(X_business)

        print(f"  Features: {len(business_numerical)}")
        print(f"  Samples: {len(business_cluster_data)}")
        print(f"  Silhouette Score: {silhouette_score(X_business, business_clusters):.4f}")

        print("\n7.2 Business Cluster Distribution:")
        print(pd.Series(business_clusters).value_counts().sort_index())
    else:
        print("  ⚠ Insufficient samples for clustering")
else:
    print("  ⚠ No numerical features found in SaaS businesses dataset")

# ============================================================================
# SECTION 8: MODEL COMPARISON SUMMARY
# ============================================================================

print("\n" + "=" * 80)
print("8. MODEL PERFORMANCE SUMMARY")
print("=" * 80)

# Column-oriented accumulator: one list per output column, filled row by row
summary_columns = ('Model', 'Task', 'Primary Metric', 'Score', 'Status')
summary_data = {column: [] for column in summary_columns}


def _add_summary_row(model, task, metric, score):
    """Append one finished-model row; the score is rendered with 4 decimals."""
    row = (model, task, metric, f"{score:.4f}", '✓ Complete')
    for column, value in zip(summary_columns, row):
        summary_data[column].append(value)


# Regression rows: their variables exist only when a price target was found
if price_cols:
    _add_summary_row('Linear Regression', 'Price Prediction', 'Test R²',
                     r2_score(y_test, y_pred_test))
    _add_summary_row('Polynomial Regression', 'Price Prediction', 'Test R²',
                     r2_score(y_test, y_pred_test_poly))

# Decision tree row: needs a churn column AND a tree that was actually trained
# (y_test_dt is only created when the churn target was binary)
if churn_cols and 'y_test_dt' in locals():
    _add_summary_row('Decision Tree', 'Churn Classification', 'Test Accuracy',
                     accuracy_score(y_test_dt, y_pred_test_dt))

# K-Means always runs, so its row is unconditional
_add_summary_row('K-Means Clustering', 'Customer Segmentation', 'Silhouette Score',
                 silhouette_score(X_cluster, clusters))

summary_df = pd.DataFrame(summary_data)
print("\n" + summary_df.to_string(index=False))

# Save summary
summary_df.to_csv('Model Results/model_summary.csv', index=False)
print("\n✓ Saved: Model Results/model_summary.csv")

print("\n" + "=" * 80)
print("✓ ALL MODELS TRAINED AND EVALUATED SUCCESSFULLY!")
print("=" * 80)

# Closing notes for the reader of the run log
print("\nKey Findings:")
for finding in (
    "  1. Pricing models help identify optimal price points",
    "  2. Churn prediction identifies at-risk customers",
    "  3. Customer segmentation reveals distinct user groups",
    "  4. Industry clustering shows pricing strategy variations",
):
    print(finding)

print("\nNext Steps:")
for next_step in (
    "  • Validate findings with CAC-LTV financial metrics",
    "  • Deploy best-performing models",
    "  • Create pricing recommendations framework",
    "  • Monitor model performance over time",
):
    print(next_step)

SaaS PRICING MODEL SELECTION - MACHINE LEARNING MODELS

1. LOADING CLEANED DATASETS...
✓ Ravenstack: (5000, 26)
✓ CAC-LTV: (7057, 15)
✓ SaaS Businesses: (126, 21)

2. FEATURE ENGINEERING

2.1 Preparing Ravenstack features...
  - Numerical features: 7
  - Categorical features: 12

2.2 Engineering CAC-LTV features...

3. MODEL 1: LINEAR REGRESSION - PRICE OPTIMIZATION

3.1 Preparing data for Linear Regression...
  ⚠ No price column found in Ravenstack dataset

4. MODEL 2: POLYNOMIAL REGRESSION

5. MODEL 3: DECISION TREE - CHURN PREDICTION

5.1 Preparing data for Decision Tree (target: churn_flag_x)...


NameError: name 'scaler' is not defined