# Financial Analysis with Data Science & Machine Learning - Part 3
## Clustering Analysis and Principal Component Analysis (PCA)

This notebook applies unsupervised learning techniques to identify patterns and group companies based on their financial characteristics.

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from yellowbrick.cluster import KElbowVisualizer
import plotly.express as px
import plotly.graph_objects as go

# Set visualization style.
# NOTE(review): sns.set_theme() is applied after plt.style.use('ggplot') and
# presumably overrides most of the ggplot settings - confirm whether the
# 'ggplot' call is still needed.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")
# Default figure size for plots that do not pass an explicit figsize
plt.rcParams["figure.figsize"] = (12, 8)

# Show every column of a DataFrame, and at most 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [None]:
# Load the data with financial ratios from the previous notebook.
# If the ratios file is missing, fall back to the cleaned data set.
try:
    data = pd.read_csv('financial_data_with_ratios.csv')
    print(f"Successfully loaded data with {data.shape[0]} rows and {data.shape[1]} columns")
except FileNotFoundError:
    print("Financial data with ratios file not found. Please run the previous notebooks first.")
    # Try to load the cleaned data if the ratios file is not available
    try:
        data = pd.read_csv('cleaned_financial_data.csv')
        print(f"Loaded cleaned data instead with {data.shape[0]} rows and {data.shape[1]} columns")
    except FileNotFoundError as exc:
        # Every later cell needs `data`. Stop here with a clear message instead
        # of leaving `data` undefined and letting later cells fail with NameError.
        raise FileNotFoundError(
            "No data files found. Please run the previous notebooks to generate the necessary data."
        ) from exc

## 2. Data Preparation for Machine Learning

In [None]:
# Function to prepare data for clustering and PCA
def prepare_data_for_ml(df):
    """Prepare financial data for machine learning analysis.

    The known feature columns that exist in ``df`` are coerced to numeric,
    originally-missing values are imputed with the column median, rows that
    contain infinite or non-numeric feature values are dropped, and the
    remaining features are standardized with ``StandardScaler``.

    Parameters:
    -----------
    df : pandas DataFrame
        Dataframe containing financial data and ratios

    Returns:
    --------
    tuple : (X_scaled, feature_names, df_prepared)
        - X_scaled: Scaled features for ML (numpy array, one row per kept row)
        - feature_names: List of feature names actually used (column order of X_scaled)
        - df_prepared: Identifier columns ('Company Name', 'Ticker', 'Sector' when
          present) for the kept rows, carrying the original index of ``df``

    Raises:
    -------
    ValueError
        If none of the expected feature columns is usable, or if no rows
        remain after cleaning.
    """
    # Work on a copy so the caller's dataframe is never modified
    df_ml = df.copy()

    # Identifier / categorical columns are optional: keep only those present
    id_cols = [col for col in ['Company Name', 'Ticker'] if col in df_ml.columns]
    cat_cols = ['Sector'] if 'Sector' in df_ml.columns else []

    # List of columns that should be available in the dataset
    potential_feature_cols = [
        'Total Revenue', 'Gross Profit', 'Operating Income', 'Net Income',
        'Total Assets', 'Total Liabilities', 'Equity', 'Cash and Cash Equivalents',
        'Market Capitalization', 'ROA', 'ROE', 'Net_Margin', 'Gross_Margin',
        'Operating_Margin', 'Debt_to_Equity', 'Debt_Ratio', 'Asset_Turnover'
    ]

    # Select only features that exist in the dataframe
    feature_cols = [col for col in potential_feature_cols if col in df_ml.columns]

    if not feature_cols:
        raise ValueError("No valid feature columns found in the dataset")

    print(f"Selected {len(feature_cols)} features for analysis")

    # Coerce to numeric BEFORE imputing: SimpleImputer(strategy='median')
    # rejects text and infinite values, so they must be neutralized first.
    raw_features = df_ml[feature_cols]
    X = raw_features.apply(pd.to_numeric, errors='coerce').astype(float)

    # Entries that were present but are unusable: text that failed to parse,
    # or +/-inf (e.g. a ratio with a zero denominator). Rows containing them
    # are dropped below; they are masked now so they do not distort the medians.
    unusable = (X.isnull() & raw_features.notnull()) | np.isinf(X)
    X = X.mask(unusable)

    # A feature without a single usable value cannot be imputed or scaled
    # (SimpleImputer would drop the column and change the shape of the result).
    if not X.empty:
        empty_features = [col for col in feature_cols if X[col].isnull().all()]
        if empty_features:
            print(f"Dropping features with no usable values: {empty_features}")
            feature_cols = [col for col in feature_cols if col not in empty_features]
            if not feature_cols:
                raise ValueError("No valid feature columns found in the dataset")
            X = X[feature_cols]
            raw_features = raw_features[feature_cols]
            unusable = unusable[feature_cols]

    # Check for missing values in the selected features
    missing_values = raw_features.isnull().sum()
    features_with_missing = missing_values[missing_values > 0]

    if not features_with_missing.empty:
        print("Features with missing values:")
        print(features_with_missing)

        # Impute missing values using median
        imputer = SimpleImputer(strategy='median')
        X = pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)
        print("Missing values imputed with median")

    # Drop rows that held infinite / non-numeric feature values. A boolean
    # mask keeps X and df_ml aligned even if the index has duplicate labels.
    rows_to_drop = unusable.any(axis=1)
    n_dropped = int(rows_to_drop.sum())
    if n_dropped:
        print(f"Dropped {n_dropped} rows with NaN values")
        X = X.loc[~rows_to_drop]
        df_ml = df_ml.loc[~rows_to_drop]

    if X.empty:
        raise ValueError("No rows with usable feature values remain after cleaning")

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"Prepared data shape: {X_scaled.shape}")

    # Identifier columns for the kept rows (index matches the rows of X_scaled)
    id_df = df_ml[id_cols + cat_cols]

    return X_scaled, feature_cols, id_df

In [None]:
# Prepare data for machine learning.
# Binds three notebook-level names used by later cells:
#   X_scaled      - scaled feature matrix
#   feature_names - names of the feature columns, in X_scaled column order
#   companies     - identifier columns for the rows kept in X_scaled
try:
    X_scaled, feature_names, companies = prepare_data_for_ml(data)
    print(f"Data prepared successfully with {len(feature_names)} features for {X_scaled.shape[0]} companies")
except Exception as e:
    # On failure the three names above stay undefined and later cells will fail too
    print(f"Error preparing data: {e}")

## 3. Principal Component Analysis (PCA)

In [None]:
# Perform PCA to reduce dimensionality
def perform_pca(X_scaled, feature_names, n_components=0.95):
    """Fit a PCA model on scaled financial data and project the data onto it.

    Parameters:
    -----------
    X_scaled : numpy array
        Scaled feature matrix to decompose
    feature_names : list
        Names of the features (accepted for interface symmetry; not used here)
    n_components : int or float, optional (default=0.95)
        Passed straight to ``PCA``: a component count, or the fraction of
        variance to retain

    Returns:
    --------
    tuple : (pca, X_pca)
        - pca: Fitted PCA model
        - X_pca: Data transformed into principal-component space
    """
    model = PCA(n_components=n_components)
    projected = model.fit_transform(X_scaled)

    # Report how much of the total variance the retained components capture
    explained_pct = model.explained_variance_ratio_.sum() * 100
    print(f"PCA with {model.n_components_} components explains {explained_pct:.2f}% of the variance")

    return model, projected

In [None]:
# Perform PCA
try:
    pca, X_pca = perform_pca(X_scaled, feature_names)

    # 1-based component numbers shared by both charts
    component_numbers = range(1, pca.n_components_ + 1)

    # Bar chart: share of variance explained by each component
    fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
    ax_bar.bar(component_numbers, pca.explained_variance_ratio_)
    ax_bar.set_xlabel('Principal Component')
    ax_bar.set_ylabel('Explained Variance Ratio')
    ax_bar.set_title('Explained Variance by Principal Component')
    ax_bar.set_xticks(component_numbers)
    fig_bar.tight_layout()
    plt.show()

    # Line chart: running total of explained variance, with the 95% reference line
    fig_cum, ax_cum = plt.subplots(figsize=(10, 6))
    ax_cum.plot(component_numbers, np.cumsum(pca.explained_variance_ratio_), marker='o')
    ax_cum.set_xlabel('Number of Components')
    ax_cum.set_ylabel('Cumulative Explained Variance')
    ax_cum.set_title('Cumulative Explained Variance by Component')
    ax_cum.axhline(y=0.95, color='r', linestyle='-', label='95% Explained Variance')
    ax_cum.set_xticks(component_numbers)
    ax_cum.legend()
    fig_cum.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error performing PCA: {e}")

In [None]:
# Analyze PCA components
try:
    # Component loadings: one row per feature, one column per principal component
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=[f'PC{i+1}' for i in range(pca.n_components_)],
        index=feature_names
    )

    # Display loadings for the first few principal components.
    # A bare expression inside a try block is never rendered by Jupyter, so
    # the table is shown explicitly (display() is provided by the IPython kernel).
    print("PCA Component Loadings (first 3 components):")
    display(loadings.iloc[:, :3])

    # Visualize component loadings for top 2 components
    plt.figure(figsize=(12, 10))
    sns.heatmap(loadings.iloc[:, :2], annot=True, cmap='coolwarm', fmt='.3f')
    plt.title('Feature Loadings for First Two Principal Components')
    plt.tight_layout()
    plt.show()

    # Identify important features for each component
    for i in range(min(3, pca.n_components_)):
        component = loadings[f'PC{i+1}']
        # Rank by absolute loading, but report the signed value
        top_features = component.abs().sort_values(ascending=False).head(5).index

        print(f"\nPC{i+1} - Top 5 features with highest absolute loadings:")
        for feature in top_features:
            print(f"{feature}: {component[feature]:.3f}")
except Exception as e:
    print(f"Error analyzing PCA components: {e}")

## 4. K-Means Clustering

In [None]:
# Determine the optimal number of clusters using the Elbow Method
try:
    # Create figure for the elbow method visualizer
    plt.figure(figsize=(10, 6))

    # Instantiate the KElbowVisualizer with KMeans.
    # n_init matches the final KMeans fit so the elbow is computed under the
    # same settings that are used for the actual clustering.
    visualizer = KElbowVisualizer(KMeans(random_state=42, n_init=10), k=(2, 10))

    # Fit the visualizer
    visualizer.fit(X_scaled)
    visualizer.show()

    # Get the optimal number of clusters. The visualizer reports None when it
    # cannot locate an elbow; route that case to the default below instead of
    # passing None on to KMeans in the next cell.
    optimal_k = visualizer.elbow_value_
    if optimal_k is None:
        raise ValueError("no clear elbow detected")
    optimal_k = int(optimal_k)
    print(f"Optimal number of clusters (K): {optimal_k}")
except Exception as e:
    print(f"Error determining optimal clusters: {e}")
    # Set a default value if the elbow method fails
    optimal_k = 3
    print(f"Using default number of clusters (K): {optimal_k}")

In [None]:
# Perform K-Means clustering with the optimal number of clusters
try:
    # Cluster the scaled features
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)

    # Identifier frame extended with a 'Cluster' label column
    companies_with_clusters = companies.assign(Cluster=clusters)

    # Cluster sizes, ordered by cluster label
    cluster_counts = companies_with_clusters['Cluster'].value_counts().sort_index()
    print("\nCompanies per cluster:")
    for label, size in cluster_counts.items():
        print(f"Cluster {label}: {size} companies")

    n_pcs = X_pca.shape[1]

    # 2D view: first two principal components, colored by cluster
    if n_pcs >= 2:
        fig_2d, ax = plt.subplots(figsize=(12, 8))
        points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.7)
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.set_title('K-Means Clusters in PCA Space')
        fig_2d.colorbar(points, ax=ax, label='Cluster')

        # Centroids are computed in scaled-feature space; project them into PCA space
        centers_pca = pca.transform(kmeans.cluster_centers_)
        ax.scatter(centers_pca[:, 0], centers_pca[:, 1], s=200, marker='X', c='red', label='Centroids')
        ax.legend()
        fig_2d.tight_layout()
        plt.show()

    # 3D view: only when a third principal component exists
    if n_pcs >= 3:
        fig = px.scatter_3d(
            x=X_pca[:, 0], y=X_pca[:, 1], z=X_pca[:, 2],
            color=clusters.astype(str),
            title='K-Means Clusters in 3D PCA Space',
            labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3'}
        )
        fig.update_traces(marker=dict(size=5))
        fig.show()
except Exception as e:
    print(f"Error performing clustering: {e}")

## 5. Cluster Analysis and Interpretation

In [None]:
# Analyze clusters by looking at the average values of features for each cluster
try:
    # Combine original data with cluster labels
    data_with_clusters = data.loc[companies.index].copy()
    # The raw columns may still hold text or +/-inf; clean them so the
    # aggregations below operate on plain numeric values.
    data_with_clusters[feature_names] = (
        data_with_clusters[feature_names]
        .apply(pd.to_numeric, errors='coerce')
        .replace([np.inf, -np.inf], np.nan)
    )
    data_with_clusters['Cluster'] = clusters

    # Calculate average feature values for each cluster
    cluster_profiles = data_with_clusters.groupby('Cluster')[feature_names].mean()

    # Display cluster profiles. A bare expression inside a try block is never
    # rendered by Jupyter, so the table is shown explicitly (display() is
    # provided by the IPython kernel).
    print("Cluster profiles (average values of features):")
    display(cluster_profiles)

    # Visualize key features across clusters
    plt.figure(figsize=(14, 10))
    # Standardize the values (across clusters) for better visualization
    cluster_profiles_scaled = (cluster_profiles - cluster_profiles.mean()) / cluster_profiles.std()
    sns.heatmap(cluster_profiles_scaled, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Standardized Feature Values by Cluster')
    plt.tight_layout()
    plt.show()

    # Radar chart for clusters (using a subset of features for clarity).
    # Features have very different units (currency amounts vs. ratios), so raw
    # means are not comparable: ranking or plotting them lets the largest-
    # magnitude features dominate. Express each cluster mean as a z-score
    # relative to the whole sample instead.
    sample_features = data_with_clusters[feature_names]
    cluster_profiles_z = (cluster_profiles - sample_features.mean()) / sample_features.std()

    # Select top features based on variance between clusters (in z-score units)
    features_variance = cluster_profiles_z.var()
    top_features = features_variance.sort_values(ascending=False).head(6).index.tolist()

    # Close each polygon by repeating the first category / value
    categories_closed = top_features + top_features[:1]
    fig = go.Figure()

    for cluster in cluster_profiles_z.index:
        values = cluster_profiles_z.loc[cluster, top_features].tolist()

        fig.add_trace(go.Scatterpolar(
            r=values + values[:1],
            theta=categories_closed,
            name=f'Cluster {cluster}'
        ))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True)),
        title='Key Features by Cluster - Radar Chart (standardized values)',
        showlegend=True
    )
    fig.show()
except Exception as e:
    print(f"Error analyzing clusters: {e}")

In [None]:
# Analyze sectors by cluster (if sector information is available)
if 'Sector' in companies.columns:
    try:
        # Create a dataframe with sector and cluster information
        sector_cluster = companies.copy()
        sector_cluster['Cluster'] = clusters

        # Cross-tabulation of sectors (rows) and clusters (columns)
        sector_cluster_counts = pd.crosstab(sector_cluster['Sector'], sector_cluster['Cluster'])

        # Calculate percentages within each cluster (each column sums to 100)
        sector_cluster_pct = sector_cluster_counts.div(sector_cluster_counts.sum(axis=0), axis=1) * 100

        # A bare expression inside a try block is never rendered by Jupyter, so
        # the table is shown explicitly (display() is provided by the IPython kernel).
        print("Sector distribution by cluster (percentages):")
        display(sector_cluster_pct)

        # Visualize sector distribution by cluster
        plt.figure(figsize=(14, 10))
        sns.heatmap(sector_cluster_pct, annot=True, cmap='YlGnBu', fmt='.1f')
        plt.title('Sector Distribution by Cluster (%)')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Error analyzing sectors by cluster: {e}")
else:
    print("No sector information available in the dataset.")

## 6. Save Clustered Data for Further Analysis

In [None]:
# Save the data with cluster assignments
try:
    # Up to three leading principal components, keyed by output column name
    pc_columns = {
        f'PC{i+1}': X_pca[:, i]
        for i in range(min(3, X_pca.shape[1]))
    }

    # Rows kept for modeling, plus the cluster label and the PCA coordinates
    full_data_with_clusters = data.loc[companies.index].assign(
        Cluster=clusters,
        **pc_columns
    )

    # Write the result without the dataframe index
    full_data_with_clusters.to_csv('financial_data_with_clusters.csv', index=False)
    print("Saved data with cluster assignments to 'financial_data_with_clusters.csv'")
except Exception as e:
    print(f"Error saving clustered data: {e}")

## Summary of Findings

In this notebook, we have:
1. Prepared the financial data for machine learning analysis
2. Applied Principal Component Analysis (PCA) to identify the key dimensions of financial variation
3. Used K-Means clustering to segment companies into groups with similar financial characteristics
4. Analyzed and interpreted the resulting clusters

Key insights (to be written after running the notebook on the actual data; the bullets below are illustrative placeholders, not results):
- [Replace with findings drawn from the cluster profiles and the sector distribution above]
- [For example: Cluster 0 might represent high-growth companies with high ROE and low debt]
- [Cluster 1 might represent stable companies with moderate growth and low leverage]
- [Cluster 2 might represent companies with higher debt ratios but good profit margins]

## Next Steps

In the next notebook, we will:
1. Apply supervised learning techniques (Decision Trees, Random Forests, Regression models)
2. Predict important financial variables
3. Identify the key factors that determine financial performance