# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# --- 1. Interpretation of K-Means Clusters ---
def analyze_clusters(kmeans_model, vectorizer, final_df, K=5):
    """Print the cluster sizes and the top-weighted features of each cluster center.

    Args:
        kmeans_model: Fitted clustering model exposing ``cluster_centers_``;
            each row is indexed with the same positions as the vectorizer's
            feature names.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        final_df: DataFrame with a ``'cluster'`` column holding the assigned
            cluster label of every row.
        K: Maximum number of clusters to describe. If the model has fewer
            centers than ``K``, only the available centers are printed.

    Returns:
        None. All results are written to standard output.
    """
    print("\n--- Cluster Analysis: Customer Segments ---")
    print("Cluster Distribution:")
    print(final_df['cluster'].value_counts())

    vectorizer_names = vectorizer.get_feature_names_out()
    cluster_centers = kmeans_model.cluster_centers_

    # Clamp to the centers the model actually has: K is only a default and
    # may not match the number of clusters the model was fitted with.
    n_described = min(K, len(cluster_centers))

    print("\nTop words (features) for each cluster:")
    for i in range(n_described):
        # Indices of the (up to) 10 largest weights in this center, largest first.
        top_indices = cluster_centers[i].argsort()[-10:][::-1]
        top_features = [vectorizer_names[j] for j in top_indices]
        print(f"Cluster {i}: {', '.join(top_features)}")
    

# --- 2. Interpretation of Logistic Regression Coefficients ---
def analyze_feature_importance(lr_model, X_train, y_train, vectorizer):
    """Print the ten strongest features for the 'Positive' and 'Negative' classes.

    Args:
        lr_model: Logistic-regression-like estimator. If it has no ``coef_``
            attribute yet, it is fitted on ``X_train`` / ``y_train`` first.
        X_train: Training features, used only when the model must be fitted.
        y_train: Training labels, used only when the model must be fitted.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``lr_model.coef_``.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If ``'Positive'`` or ``'Negative'`` is not one of
            ``lr_model.classes_``.
    """
    # Fit the model if the provided one doesn't have coefficients attached.
    if not hasattr(lr_model, 'coef_'):
        print("Model coefficients not found. Retraining Logistic Regression...")
        lr_model.fit(X_train, y_train)

    feature_names = vectorizer.get_feature_names_out()
    classes = list(lr_model.classes_)
    coef = lr_model.coef_

    print("\n--- Sentiment Feature Importance Analysis (Logistic Regression) ---")

    # Positive features: largest coefficients for the 'Positive' class first.
    positive_weights = _class_coefficient_row(coef, classes, 'Positive')
    top_positive_indices = positive_weights.argsort()[-10:][::-1]
    top_positive_words = [feature_names[i] for i in top_positive_indices]
    print(f"Top 10 Words Driving POSITIVE Sentiment: \n{', '.join(top_positive_words)}")

    # Negative features: largest coefficients for the 'Negative' class first.
    negative_weights = _class_coefficient_row(coef, classes, 'Negative')
    top_negative_indices = negative_weights.argsort()[-10:][::-1]
    top_negative_words = [feature_names[i] for i in top_negative_indices]
    print(f"\nTop 10 Words Driving NEGATIVE Sentiment: \n{', '.join(top_negative_words)}")


def _class_coefficient_row(coef, classes, label):
    """Return the coefficient vector whose large values favor ``label``."""
    if label not in classes:
        raise ValueError(f"Class {label!r} not found in model classes: {classes}")
    class_index = classes.index(label)

    # A binary model stores a single coefficient row that scores classes[1];
    # evidence for classes[0] is the negation of that same row. Indexing the
    # matrix by class position would raise IndexError in this case.
    if len(coef) == 1 and len(classes) == 2:
        return coef[0] if class_index == 1 else -coef[0]

    return coef[class_index]