# Financial Analysis with Data Science & Machine Learning - Part 4
## Predictive Modeling with Supervised Learning

This notebook applies supervised learning techniques to identify key factors that determine financial performance and predict financial variables.

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import graphviz
from sklearn import tree

# Plot appearance: ggplot style sheet first, then seaborn's whitegrid theme,
# and finally the default figure size (order matters, later calls override earlier ones)
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

# DataFrame display limits: no cap on columns, at most 100 rows
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

In [None]:
# Load the data produced by the previous notebooks.
# Candidates are tried in order of preference (clustered data first, then data
# with ratios, then the cleaned data as a last resort); the first file found wins.
data_candidates = [
    ('financial_data_with_clusters.csv', "Successfully loaded data"),
    ('financial_data_with_ratios.csv', "Loaded data with ratios instead"),
    ('cleaned_financial_data.csv', "Loaded cleaned data instead"),
]

# Reset first so a stale `data` left over from an earlier run of this cell
# can never be picked up silently when no file is available any more.
data = None
for candidate_path, loaded_message in data_candidates:
    try:
        data = pd.read_csv(candidate_path)
    except FileNotFoundError:
        # Fall through to the next, less enriched, candidate file
        continue
    print(f"{loaded_message} with {data.shape[0]} rows and {data.shape[1]} columns")
    break
else:
    # Loop finished without a successful read: `data` stays None
    print("No data files found. Please run the previous notebooks to generate the necessary data.")

## 2. Feature Selection and Target Definition

In [None]:
# Define target variables and features for prediction
def prepare_features_targets(df, target_col):
    """Prepare scaled train/test features and the target for supervised learning.

    Every numeric column is used as a feature except the target itself, the
    'Company Name', 'Ticker', 'Sector' and 'Cluster' columns, and any column
    whose name starts with 'PC' (treated as PCA components). Rows with a
    missing target are dropped. Missing feature values are imputed with the
    per-column median of the training split and features are standardized
    with a scaler fitted on the training split, so no test-set statistics
    leak into training. The input DataFrame is not modified.

    Parameters:
    -----------
    df : pandas DataFrame
        Dataset containing financial data
    target_col : str
        Name of the target column to predict

    Returns:
    --------
    tuple : (X_train, X_test, y_train, y_test, feature_names)
        - X_train, X_test : standardized feature arrays (80% / 20% split)
        - y_train, y_test : target values for the matching rows
        - feature_names : list of the feature column names, in column order

    Raises:
    -------
    ValueError
        If the target column is missing, no feature columns remain after
        filtering, or fewer than 2 rows have a non-missing target.
    """
    # Check if target column exists
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in the dataset")

    # Columns that must never be used as features
    exclude_cols = {
        'Company Name', 'Ticker', 'Sector',  # Identifiers
        target_col,                          # Target
        'Cluster',                           # Cluster assignment (if exists)
    }

    # Also exclude PCA components if they exist. The isinstance guard keeps
    # non-string column labels from crashing on .startswith.
    exclude_cols.update(
        col for col in df.columns
        if isinstance(col, str) and col.startswith('PC')
    )

    # Select feature columns (all numeric columns except excluded ones)
    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = [col for col in numeric_cols if col not in exclude_cols]

    if not feature_cols:
        raise ValueError("No valid feature columns found after filtering")

    print(f"Selected {len(feature_cols)} features for predicting {target_col}")

    # Drop rows with missing values in target column (returns a new frame,
    # so the caller's DataFrame is left untouched)
    df_ml = df.dropna(subset=[target_col])
    if len(df_ml) < 2:
        raise ValueError(
            f"Need at least 2 rows with a non-missing '{target_col}' to split the data, "
            f"found {len(df_ml)}"
        )

    # Prepare features and target
    X = df_ml[feature_cols]
    y = df_ml[target_col]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Impute missing feature values with medians learned from the training
    # split only. Plain (non in-place) fillna on the frame avoids chained
    # assignment, which is a silent no-op under pandas Copy-on-Write.
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Scale the features (fit on training data, reuse the same transform on test data)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Training set: {X_train_scaled.shape[0]} samples")
    print(f"Testing set: {X_test_scaled.shape[0]} samples")

    return X_train_scaled, X_test_scaled, y_train, y_test, feature_cols

In [None]:
# Financial metric the models will predict.
# Typical choices: 'Net Income', 'ROE', 'Market Capitalization'
target_variable = 'Net Income'  # Change this as needed

try:
    target_is_missing = target_variable not in data.columns
    if target_is_missing:
        print(f"Target variable '{target_variable}' not found in the dataset")

        # Show the numeric columns so the reader can pick a valid target
        numeric_column_names = list(data.select_dtypes(include=np.number).columns)
        print("Available numerical columns for prediction:")
        print(numeric_column_names)

        # Fall back to the first well-known metric that is present in the data
        fallback_candidates = ['Net Income', 'ROE', 'Operating Income', 'Gross Profit', 'Total Revenue']
        fallback = next(
            (name for name in fallback_candidates if name in data.columns),
            None,
        )
        if fallback is not None:
            target_variable = fallback
            print(f"Using '{target_variable}' as the target variable instead")

    # Build the scaled train/test split used by every model below
    prepared = prepare_features_targets(data, target_variable)
    X_train, X_test, y_train, y_test, feature_names = prepared
except Exception as err:
    print(f"Error preparing data for modeling: {err}")

## 3. Decision Tree Analysis

In [None]:
# Train a shallow Decision Tree, report its test-set error and visualize the tree
try:
    # Initialize and train a Decision Tree model (depth capped at 4 to keep the plot readable)
    dt_model = DecisionTreeRegressor(max_depth=4, random_state=42)
    dt_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_dt = dt_model.predict(X_test)

    # Evaluate the model
    mse_dt = mean_squared_error(y_test, y_pred_dt)
    rmse_dt = np.sqrt(mse_dt)
    r2_dt = r2_score(y_test, y_pred_dt)

    print(f"Decision Tree Performance for predicting {target_variable}:")
    print(f"Root Mean Squared Error: {rmse_dt:.2f}")
    print(f"R² Score: {r2_dt:.4f}")

    # Visualize the Decision Tree
    plt.figure(figsize=(20, 10))
    plot_tree(dt_model, feature_names=feature_names, filled=True, rounded=True, fontsize=10)
    plt.title(f"Decision Tree for {target_variable} Prediction")
    plt.tight_layout()
    plt.show()

    # Create a more detailed visualization with GraphViz
    dot_data = tree.export_graphviz(
        dt_model,
        out_file=None,
        feature_names=feature_names,
        filled=True,
        rounded=True
    )
    graph = graphviz.Source(dot_data)
    # Jupyter only auto-renders a trailing top-level expression; inside a
    # try block the object must be displayed explicitly. `display` is
    # provided by the IPython kernel without an import.
    display(graph)
except Exception as e:
    print(f"Error training Decision Tree model: {e}")

## 4. Random Forest Analysis

In [None]:
# Train a Random Forest, report its test-set error and show the most important features
try:
    # Initialize and train a Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_rf = rf_model.predict(X_test)

    # Evaluate the model
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, y_pred_rf)

    print(f"Random Forest Performance for predicting {target_variable}:")
    print(f"Root Mean Squared Error: {rmse_rf:.2f}")
    print(f"R² Score: {r2_rf:.4f}")

    # Extract feature importance, most important first
    feature_importance = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    # Display feature importance. A bare expression inside a try block is
    # not rendered by Jupyter, so the table is displayed explicitly
    # (`display` is provided by the IPython kernel without an import).
    print("\nFeature importance:")
    display(feature_importance_df.head(10))  # Show top 10 features

    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(10))
    plt.title(f"Top 10 Features for Predicting {target_variable}")
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error training Random Forest model: {e}")

## 5. Ridge and Lasso Regression

In [None]:
# Tune Ridge and Lasso regression with 5-fold cross-validation, evaluate both
# on the test set and compare their coefficients
try:
    # Initialize and train Ridge Regression model with cross-validation
    ridge_params = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
    ridge_cv = GridSearchCV(Ridge(random_state=42), ridge_params, cv=5, scoring='neg_mean_squared_error')
    ridge_cv.fit(X_train, y_train)

    # Best Ridge model
    best_ridge = ridge_cv.best_estimator_
    ridge_alpha = ridge_cv.best_params_['alpha']
    y_pred_ridge = best_ridge.predict(X_test)

    # Initialize and train Lasso Regression model with cross-validation
    lasso_params = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
    lasso_cv = GridSearchCV(Lasso(random_state=42), lasso_params, cv=5, scoring='neg_mean_squared_error')
    lasso_cv.fit(X_train, y_train)

    # Best Lasso model
    best_lasso = lasso_cv.best_estimator_
    lasso_alpha = lasso_cv.best_params_['alpha']
    y_pred_lasso = best_lasso.predict(X_test)

    # Evaluate the models
    print("\nRidge Regression:")
    print(f"Best alpha: {ridge_alpha}")
    ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
    ridge_r2 = r2_score(y_test, y_pred_ridge)
    print(f"RMSE: {ridge_rmse:.2f}")
    print(f"R²: {ridge_r2:.4f}")

    print("\nLasso Regression:")
    print(f"Best alpha: {lasso_alpha}")
    lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
    lasso_r2 = r2_score(y_test, y_pred_lasso)
    print(f"RMSE: {lasso_rmse:.2f}")
    print(f"R²: {lasso_r2:.4f}")

    # Compare coefficients
    ridge_coef = pd.DataFrame({
        'Feature': feature_names,
        'Ridge_Coefficient': best_ridge.coef_
    })

    lasso_coef = pd.DataFrame({
        'Feature': feature_names,
        'Lasso_Coefficient': best_lasso.coef_
    })

    # Merge coefficients
    coef_df = pd.merge(ridge_coef, lasso_coef, on='Feature')

    # Sort by absolute Ridge coefficient (helper column dropped after sorting)
    coef_df['Abs_Ridge'] = np.abs(coef_df['Ridge_Coefficient'])
    coef_df = coef_df.sort_values('Abs_Ridge', ascending=False).drop('Abs_Ridge', axis=1)

    # Display top coefficients. A bare expression inside a try block is not
    # rendered by Jupyter, so the table is displayed explicitly
    # (`display` is provided by the IPython kernel without an import).
    print("\nTop coefficients:")
    display(coef_df.head(10))

    # Visualize coefficients
    top_coef = coef_df.head(10).copy()

    # Melt the dataframe for easier plotting
    top_coef_melted = pd.melt(
        top_coef, 
        id_vars=['Feature'], 
        value_vars=['Ridge_Coefficient', 'Lasso_Coefficient'],
        var_name='Model', 
        value_name='Coefficient'
    )

    # Create the plot. catplot is a figure-level function that creates its
    # own figure (sized via height/aspect), so no plt.figure() call precedes
    # it; one would only leave an empty extra figure in the output.
    g = sns.catplot(
        data=top_coef_melted, 
        kind='bar',
        x='Coefficient', 
        y='Feature', 
        hue='Model',
        height=8, 
        aspect=1.5
    )

    plt.title(f"Top 10 Coefficients for Predicting {target_variable}")
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error training regression models: {e}")

## 6. Model Comparison and Interpretation

In [None]:
# Compare the test-set performance of all four models.
# Relies on the metric variables created by the model cells above.
try:
    # Create a dataframe of model performance metrics
    model_comparison = pd.DataFrame({
        'Model': ['Decision Tree', 'Random Forest', 'Ridge Regression', 'Lasso Regression'],
        'RMSE': [rmse_dt, rmse_rf, ridge_rmse, lasso_rmse],
        'R²': [r2_dt, r2_rf, ridge_r2, lasso_r2]
    })

    # Sort by R² (higher is better)
    model_comparison = model_comparison.sort_values('R²', ascending=False)

    # A bare expression inside a try block is not rendered by Jupyter, so
    # the table is displayed explicitly (`display` is provided by the
    # IPython kernel without an import).
    print("Model performance comparison:")
    display(model_comparison)

    # Visualize model comparison
    plt.figure(figsize=(12, 10))
    sns.barplot(x='R²', y='Model', data=model_comparison)
    plt.title(f"Model Performance Comparison for {target_variable} Prediction (R²)")
    plt.tight_layout()
    plt.show()

    # RMSE comparison (lower is better, so sorted ascending)
    plt.figure(figsize=(12, 10))
    sns.barplot(x='RMSE', y='Model', data=model_comparison.sort_values('RMSE'))
    plt.title(f"Model Performance Comparison for {target_variable} Prediction (RMSE)")
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error comparing models: {e}")

## 7. Key Financial Drivers Analysis

In [None]:
# Extract and analyze key financial drivers from the models.
# Relies on rf_model, best_ridge and best_lasso fitted in the cells above.
try:
    # Combine importance/coefficients from all models
    # Random Forest feature importance
    rf_importance = pd.DataFrame({
        'Feature': feature_names,
        'RF_Importance': rf_model.feature_importances_
    })

    # Ridge and Lasso coefficients (absolute values, so sign does not matter)
    ridge_importance = pd.DataFrame({
        'Feature': feature_names,
        'Ridge_Importance': np.abs(best_ridge.coef_)
    })

    lasso_importance = pd.DataFrame({
        'Feature': feature_names,
        'Lasso_Importance': np.abs(best_lasso.coef_)
    })

    # Merge all metrics
    drivers_df = rf_importance.merge(ridge_importance, on='Feature')
    drivers_df = drivers_df.merge(lasso_importance, on='Feature')

    # Rescale each importance/coefficient column so it sums to 1, putting
    # the three models on a comparable scale
    importance_cols = ['RF_Importance', 'Ridge_Importance', 'Lasso_Importance']
    for col in importance_cols:
        col_total = drivers_df[col].sum()
        if col_total > 0:  # Avoid division by zero
            drivers_df[col] = drivers_df[col] / col_total

    # Calculate average importance across models
    drivers_df['Average_Importance'] = drivers_df[importance_cols].mean(axis=1)

    # Sort by average importance
    drivers_df = drivers_df.sort_values('Average_Importance', ascending=False)

    # A bare expression inside a try block is not rendered by Jupyter, so
    # the table is displayed explicitly (`display` is provided by the
    # IPython kernel without an import).
    print("\nTop financial drivers across models:")
    display(drivers_df.head(10))

    # Visualize top drivers
    top_drivers = drivers_df.head(10)

    plt.figure(figsize=(14, 10))

    # Melt the dataframe for easier plotting
    top_drivers_melted = pd.melt(
        top_drivers, 
        id_vars=['Feature'], 
        value_vars=importance_cols,
        var_name='Model', 
        value_name='Importance'
    )

    # Create the plot
    sns.barplot(x='Importance', y='Feature', hue='Model', data=top_drivers_melted)
    plt.title(f"Top 10 Financial Drivers for {target_variable}")
    plt.legend(title='Model')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error analyzing financial drivers: {e}")

## Summary of Findings

In this notebook, we have:
1. Prepared financial data for supervised learning
2. Built and evaluated predictive models for financial performance
3. Identified key financial drivers using various machine learning techniques
4. Compared the performance of different modeling approaches

Key insights:
- [Placeholder: replace these bullets with insights drawn from the actual results once the notebook has been run]
- [For example: The most important factors for predicting Net Income might be ...]
- [For example: Model performance might suggest that Random Forest provides the best balance of accuracy and interpretability; confirm against the model comparison in Section 6]
- [The identified financial drivers align with established financial theory by showing ...]

## Next Steps

In the next notebook, we will:
1. Combine insights from clustering and predictive modeling
2. Develop comprehensive economic interpretations of our findings
3. Generate actionable recommendations based on the analysis