In [None]:
import transformers
transformers.logging.set_verbosity_error()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch
from textblob import TextBlob
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import time

from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize
def safe_sent_tokenize(text: str):
    """Split *text* into sentences, degrading gracefully when NLTK data is missing.

    The NLTK sentence tokenizer is tried first. If it raises ``LookupError``,
    a download of ``punkt_tab`` and ``punkt`` is attempted and the tokenizer
    is retried. If the retry fails as well, a punctuation-based regex split
    is used instead.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings. The regex fallback drops empty pieces, so
        blank input yields ``[]`` rather than ``['']``.
    """
    try:
        return _nltk_sent_tokenize(text)
    except LookupError:
        for pkg in ("punkt_tab", "punkt"):
            try:
                nltk.download(pkg, quiet=True)
            except Exception:
                # Best effort only. ``Exception`` (not a bare ``except``) so
                # Ctrl-C / SystemExit during a download still propagate.
                continue
        try:
            return _nltk_sent_tokenize(text)
        except Exception:
            # Last resort: split after ., ! or ? followed by whitespace.
            pieces = re.split(r'(?<=[.!?])\s+', text.strip())
            return [piece for piece in pieces if piece]

# Fetch the NLTK resources used by this notebook. Each download is attempted
# on its own so that one failing resource does not skip the remaining ones.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    try:
        nltk.download(_nltk_resource, quiet=True)
    except Exception as _nltk_error:
        # Still best effort, but report the problem instead of hiding it, and
        # let KeyboardInterrupt / SystemExit through.
        print(f"NLTK resource '{_nltk_resource}' could not be downloaded: {_nltk_error}")

print("Advanced Multi-Model Employee Review Analysis Pipeline Initialized")

print("Phase 1: Loading and Exploring New Data Structure")

def load_and_explore_data(file_path):
    """Load a review CSV and print a short overview of its contents.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file cannot be read or
        has no ``Company`` column. The reason is printed in that case.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully: {len(df)} reviews")
        # 'Company' is required by the per-company analysis later in this
        # file; a missing column raises KeyError here and is reported below.
        print(f"Companies: {df['Company'].nunique()}")

        # The date and country columns only feed this overview, so their
        # absence must not cause an otherwise usable file to be rejected.
        if 'As Of Date' in df.columns:
            print(f"Date range: {df['As Of Date'].min()} to {df['As Of Date'].max()}")
        if 'Author Country' in df.columns:
            print(f"Countries: {df['Author Country'].nunique()}")

        print(f"\nColumns ({len(df.columns)}):")
        for i, col in enumerate(df.columns, start=1):
            print(f"  {i:2d}. {col}")

        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


In [None]:
# Progress banner printed when this cell runs, before the HCF analyzer class is defined.
print("\nPhase 2: Enhanced HCF Framework & Dimension-Specific Analysis")

class EnhancedHCFAnalyzer:
    def __init__(self):
        """Define the seven HCF dimension configurations and load the models.

        Sets ``self.hcf_dimensions`` (dimension name -> configuration dict)
        and ``self.models`` (the dict returned by ``_initialize_models``).
        """
        # Every dimension configuration carries the same keys:
        #   description         - prose definition; copied into analysis results and
        #                         (first 200 characters) into the summarization prompt
        #   rating_columns      - review columns whose values feed the rating part of
        #                         the strength score
        #   text_columns        - free-text review columns scanned for this dimension
        #   keywords            - terms matched as case-insensitive substrings
        #   sentiment_threshold - 0.1 for every dimension; not read by any method of
        #                         this class
        #   focus_areas         - phrases matched the same way as keywords
        self.hcf_dimensions = {
            "Direct_Management": {
                "description": """A supervisor-subordinate relationship is inherently hierarchal, which also
                means that it is inherently complex with multiple potential points of failure. Getting instructions
                about what to do and how to do it are never easy as it decreases the sense of autonomy in
                multiple ways. In this dimension we assess the positive qualities of these relationships with an
                emphasis on the working relationship, the support of employee development and on general
                caring for the employee. In general, positive manager-employee relationships encourages
                productivity and collaboration among teams. When there's mutual respect, care, and
                communications between a manager and an employee, there's more willingness on both ends to
                offer support and perform well.""",
                "rating_columns": ["Rating: Senior Management"],
                "text_columns": ["PROs", "CONs", "Advice to Management"],
                "keywords": ["management", "supervisor", "leadership", "boss", "manager", "director", "team lead", 
                           "executive", "1:1", "performance review", "feedback", "guidance", "support", "mentoring"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["supervision quality", "employee development support", "manager-employee relationships", 
                              "communication effectiveness", "decision-making processes", "career guidance"]
            },
            "Organizational_Alignment": {
                "description": """Organizational alignment is a shared understanding of the positive
                mission, philosophy and approaches that underlie the path and methods of any company. It
                allows all members of an organization, from entry-level positions to executive managers, to
                share common goals and vision for the organization and to be proud of their joint mission. In
                this dimension we assess the connection employees have with the values of the company, the
                sense of mission, and the meaning that they get from their workplace. In general, good
                organizational alignment helps with both motivation toward common organizational goals, a
                shared understanding of how to treat each other to accomplish goals, and the coordination of all
                actions and actors toward that goal.""",
                "rating_columns": ["Rating: Culture & Values"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["mission", "vision", "goals", "strategy", "alignment", "purpose", "direction", 
                           "objectives", "values", "philosophy", "approach", "company culture", "shared vision"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["company mission clarity", "strategic alignment", "shared values understanding", 
                              "organizational goals", "company direction", "cultural philosophy"]
            },
            "Engagement": {
                "description": """Engagement is the level of commitment and emotional investment that employees
                have toward their job and their organization. Engagement is not just about job satisfaction or
                happiness, it is also about the level of involvement with their job, colleagues, and organization.
                Engaged employees are enthusiastic about their work, are willing to go above and beyond what
                is expected of them and are more likely to be loyal to their employer. In this dimension we
                assess motivation, connection with the company, and the ability of the employee to reach their
                potential to address the difficulties present in any job. In general, high employee engagement is
                crucial for creating a positive and productive workplace culture that fosters growth and success
                for both employees and the company.""",
                "rating_columns": ["Rating: Overall"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["engagement", "motivation", "commitment", "involvement", "passion", "enthusiasm", 
                           "dedication", "satisfaction", "fulfillment", "excitement", "job satisfaction"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["work motivation", "job satisfaction", "employee involvement", "career commitment", 
                              "daily excitement", "emotional investment"]
            },
            "Innovation": {
                "description": """Innovation is the process of creating something new or improved. It involves
                finding, testing, and improving novel and creative solutions to problems or challenges. It also
                involves developing new ideas, products, or services. Innovation is an important driver of
                progress and growth for all companies. In this dimension we assess the company's approach to
                innovation and the degree to which it implicitly and explicitly encourages or discourages it, from
                a company's treatment of mistakes to the outright acceptance of new ideas. In general, the more
                innovative a company is, the more likely it is to be successful over time.""",
                "rating_columns": ["Rating: Overall"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["innovation", "creative", "experimental", "new ideas", "cutting-edge", "technology", 
                           "research", "experimentation", "breakthrough", "inventive", "novel solutions"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["creativity encouragement", "new technology development", "experimentation support", 
                              "forward-thinking culture", "research projects", "mistake tolerance"]
            },
            "Organizational_Effectiveness": {
                "description": """Organizational effectiveness is the ability of a group to achieve
                its goals and objectives efficiently and with little waste (inputs, effort, time, energy, attention).
                Organizational effectiveness is a reflection of how well an organization uses its human and non-human
                resources to achieve its mission and objectives. In this dimension we assess whether
                people are given the tools they need for their jobs, the degree of collaboration, and the level of
                bureaucratic burden. In general, organizations with a high degree of organizational
                effectiveness are like well-oiled machines, providing a high level of output with inputs available.""",
                "rating_columns": ["Rating: Work/Life Balance"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["efficiency", "productivity", "process", "workflow", "tools", "systems", "organization", 
                           "work-life balance", "flexibility", "resources", "collaboration", "bureaucracy"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["operational efficiency", "work processes", "resource management", "productivity tools", 
                              "work-life balance", "bureaucratic burden"]
            },
            "Emotional_Connection": {
                "description": """Emotional connection is the feeling of being emotionally attached,
                invested, and engaged with the workplace and co-workers. When employees feel emotionally
                connected, they are more likely to invest time, energy, and resources. Emotional connection can
                also foster trust, loyalty, and commitment. In this dimension we assess the desire to stay at the
                workplace, see their futures as intertwined, and the desire to recommend the workplace to others.
                In general, high emotional connection brings a sense of connection, satisfaction, productivity,
                purpose, and fulfillment.""",
                "rating_columns": ["Rating: Culture & Values"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["culture", "values", "pride", "loyalty", "belonging", "family", "community", 
                           "atmosphere", "amazing", "love", "passion", "excitement", "emotional attachment"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["company pride", "emotional attachment", "workplace satisfaction", "team bonding", 
                              "company culture", "future commitment"]
            },
            "Extrinsic_Rewards": {
                "description": """Extrinsic rewards refer to rewards that are external to an individual such as
                money, prizes, or recognition. These rewards are intended to motivate the individual to perform
                better or continue engaging in a certain behavior, but do they?
                Extrinsic rewards have been shown to be effective in motivating individuals in the short term.
                However, extrinsic rewards have some limitations and are often ineffective in motivating
                individuals in the long term, as they do not address intrinsic motivation or interest in the work
                itself. In some cases, extrinsic rewards can have a negative effect on intrinsic motivation as
                individuals become over-focused on the reward itself. In this dimension we assess
                compensation, opportunities for advancement, and benefits. In general, when we think about
                rewards, we usually think about a set of extrinsic rewards.""",
                "rating_columns": ["Rating: Comp & Benefits", "Rating: Career Opportunities"],
                "text_columns": ["PROs", "CONs", "Summary"],
                "keywords": ["salary", "benefits", "compensation", "bonus", "promotion", "career", "pay", 
                           "rewards", "perks", "stock", "advancement", "growth", "recognition"],
                "sentiment_threshold": 0.1,
                "focus_areas": ["compensation structure", "benefits package", "career growth opportunities", 
                              "financial rewards", "promotion pathways", "recognition systems"]
            }
        }
        
        # Model loading happens at construction time; models that fail to load
        # are simply absent from the resulting dict.
        self.models = self._initialize_models()
        
    def _initialize_models(self):
        """Initialize different summarization and analysis models"""
        models = {}
        
        try:
            models['bart'] = pipeline("summarization", model="facebook/bart-large-cnn")
            print("BART model loaded")
        except:
            print("BART model not available")
            
        try:
            models['distilbart'] = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
            print("DistilBART model loaded")
        except:
            print("DistilBART model not available")
            
        try:
            models['t5'] = pipeline("summarization", model="t5-base")
            print("T5 model loaded")
        except:
            print("T5 model not available")
            
        try:
            models['sentence_bert'] = SentenceTransformer('all-MiniLM-L6-v2')
            print("✅ Sentence-BERT model loaded")
        except:
            print("⚠️ Sentence-BERT model not available")
            
        return models
    
    def analyze_company_dimensions(self, company_data):
        """Analyze a company across all HCF dimensions"""
        analysis = {}
        
        for dimension, config in self.hcf_dimensions.items():
            dimension_analysis = self._analyze_single_dimension(company_data, dimension, config)
            analysis[dimension] = dimension_analysis
            
        return analysis
    
    def _analyze_single_dimension(self, company_data, dimension, config):
        """Analyze a single HCF dimension for a company"""
        analysis = {
            'dimension': dimension,
            'description': config['description'],
            'rating_scores': [],
            'text_analysis': {},
            'summary': '',
            'sentiment': 'neutral',
            'strength_score': 0.0,
            'dimension_specific_insights': [],
            'model_comparison': {} 
        }
        
        for rating_col in config['rating_columns']:
            if rating_col in company_data.columns:
                ratings = company_data[rating_col].dropna()
                if len(ratings) > 0:
                    analysis['rating_scores'].extend(ratings.tolist())
        
        for text_col in config['text_columns']:
            if text_col in company_data.columns:
                text_data = company_data[text_col].dropna()
                if len(text_data) > 0:
                    text_analysis = self._analyze_text_content_dimension_specific(text_data, config)
                    analysis['text_analysis'][text_col] = text_analysis
        
        analysis['summary'], analysis['model_comparison'] = self._generate_dimension_specific_summary_with_comparison(company_data, dimension, config)
        
        analysis['strength_score'] = self._calculate_strength_score(analysis)
        analysis['sentiment'] = self._classify_sentiment(analysis['strength_score'])
        
        return analysis
    
    def _analyze_text_content_dimension_specific(self, text_data, config):
        """Analyze text content with dimension-specific focus"""
        analysis = {
            'keyword_matches': [],
            'sentiment_scores': [],
            'text_length': 0,
            'key_themes': [],
            'dimension_relevant_sentences': [],
            'focus_area_coverage': {}
        }
        
        for text in text_data:
            if pd.notna(text) and isinstance(text, str):
                text_lower = text.lower()
                for keyword in config['keywords']:
                    if keyword.lower() in text_lower:
                        analysis['keyword_matches'].append(keyword)
                
                for focus_area in config['focus_areas']:
                    if focus_area.lower() in text_lower:
                        if focus_area not in analysis['focus_area_coverage']:
                            analysis['focus_area_coverage'][focus_area] = []
                        analysis['focus_area_coverage'][focus_area].append(text)
                
                blob = TextBlob(text)
                analysis['sentiment_scores'].append(blob.sentiment.polarity)
                
                analysis['text_length'] += len(text)
                
                relevance_score = self._calculate_text_relevance(text, config)
                if relevance_score > 0.2:
                    analysis['dimension_relevant_sentences'].append((text, relevance_score))
        
        if analysis['sentiment_scores']:
            analysis['key_themes'] = self._extract_dimension_specific_themes(text_data, config)
            
        return analysis
    
    def _calculate_text_relevance(self, text, config):
        """Calculate how relevant a text is to a specific dimension"""
        text_lower = text.lower()
        relevance_score = 0.0
        
        keyword_matches = sum(1 for keyword in config['keywords'] if keyword.lower() in text_lower)
        relevance_score += (keyword_matches / len(config['keywords'])) * 0.5
        
        focus_matches = sum(1 for focus in config['focus_areas'] if focus.lower() in text_lower)
        relevance_score += (focus_matches / len(config['focus_areas'])) * 0.3
        
        length_score = min(len(text) / 200.0, 1.0)
        relevance_score += length_score * 0.2
        
        return relevance_score
    
    def _extract_dimension_specific_themes(self, text_data, config):
        """Return up to ten frequent words, favouring dimension-related ones.

        All non-null texts are joined, lower-cased and tokenized. Tokens that
        are not alphanumeric, have three or fewer characters, or are English
        stop words are dropped. Each remaining occurrence counts 3 when the
        word contains, or is contained in, any keyword or focus area of the
        dimension, and 1 otherwise.

        Args:
            text_data: Iterable of review texts.
            config: Dimension configuration providing ``keywords`` and
                ``focus_areas``.

        Returns:
            list[str]: The ten highest-weighted words, highest first. Ties
            keep the order of first appearance.
        """
        all_text = " ".join([str(t) for t in text_data if pd.notna(t)])
        words = word_tokenize(all_text.lower())

        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word.isalnum() and len(word) > 3 and word not in stop_words]

        # Lower-case the dimension terms once instead of once per word.
        terms = [term.lower() for term in config['keywords'] + config['focus_areas']]

        # Relatedness depends only on the word itself, so each distinct word
        # is checked against the term list a single time.
        weight_by_word = {}
        word_freq = {}
        for word in words:
            weight = weight_by_word.get(word)
            if weight is None:
                probe = word.lower()
                is_related = any(term in probe or probe in term for term in terms)
                weight = 3 if is_related else 1
                weight_by_word[word] = weight
            word_freq[word] = word_freq.get(word, 0) + weight

        sorted_themes = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, _ in sorted_themes[:10]]
    
    def _generate_dimension_specific_summary_with_comparison(self, company_data, dimension, config):
        """Generate dimension-specific summary using multiple approaches with model comparison"""
        summaries = []
        model_comparison = {}
        
        dimension_text = self._extract_dimension_specific_text(company_data, dimension, config)
        
        if not dimension_text.strip():
            return f"No dimension-specific data available for {dimension}", {}
        
        focused_summary = self._create_focused_extractive_summary(dimension_text, dimension, config)
        summaries.append(f"Focused: {focused_summary}")
        
        if len(dimension_text) > 100:
            model_comparison = self._compare_summarization_models(dimension_text, dimension, config)
            if model_comparison:
                for model_name, result in model_comparison.items():
                    if result.get('summary'):
                        summaries.append(f"{model_name}: {result['summary']}")
        
        keyword_summary = self._create_keyword_based_summary(dimension_text, dimension, config)
        summaries.append(f"Keyword-based: {keyword_summary}")
        
        return " | ".join(summaries), model_comparison
    
    def _compare_summarization_models(self, text, dimension, config):
        """Compare different summarization models"""
        comparison = {}
        
        prompt = f"Summarize employee feedback about {dimension.replace('_',' ').lower()} focusing on: {config['description'][:200]}... "
        input_text = prompt + text[:1024]
        
        max_len = min(80, len(input_text.split()) // 2)
        max_len = max(max_len, 20)
        
        if 'bart' in self.models:
            try:
                start_time = time.time()
                bart_summary = self.models['bart'](
                    input_text,
                    max_length=max_len,
                    min_length=20
                )[0]['summary_text']
                bart_time = time.time() - start_time
                
                comparison['BART'] = {
                    'summary': bart_summary,
                    'time': bart_time,
                    'input_length': len(input_text.split()),
                    'output_length': len(bart_summary.split()),
                    'compression_ratio': len(bart_summary.split()) / max(1, len(input_text.split()))
                }
            except Exception as e:
                comparison['BART'] = {'error': str(e)}
        
        if 'distilbart' in self.models:
            try:
                start_time = time.time()
                distilbart_summary = self.models['distilbart'](
                    input_text,
                    max_length=max_len,
                    min_length=20
                )[0]['summary_text']
                distilbart_time = time.time() - start_time
                
                comparison['DistilBART'] = {
                    'summary': distilbart_summary,
                    'time': distilbart_time,
                    'input_length': len(input_text.split()),
                    'output_length': len(distilbart_summary.split()),
                    'compression_ratio': len(distilbart_summary.split()) / max(1, len(input_text.split()))
                }
            except Exception as e:
                comparison['DistilBART'] = {'error': str(e)}
        
        if 't5' in self.models:
            try:
                start_time = time.time()
                t5_summary = self.models['t5'](
                    input_text,
                    max_length=max_len,
                    min_length=20
                )[0]['summary_text']
                t5_time = time.time() - start_time
                
                comparison['T5'] = {
                    'summary': t5_summary,
                    'time': t5_time,
                    'input_length': len(input_text.split()),
                    'output_length': len(t5_summary.split()),
                    'compression_ratio': len(t5_summary.split()) / max(1, len(input_text.split()))
                }
            except Exception as e:
                comparison['T5'] = {'error': str(e)}
        
        return comparison
    
    def _extract_dimension_specific_text(self, company_data, dimension, config):
        """Extract text that is most relevant to a specific dimension"""
        relevant_texts = []
        
        for text_col in config['text_columns']:
            if text_col in company_data.columns:
                text_data = company_data[text_col].dropna()
                for text in text_data:
                    if pd.notna(text) and isinstance(text, str):
                        relevance = self._calculate_text_relevance(text, config)
                        if relevance > 0.1:
                            relevant_texts.append((text, relevance))
        
        relevant_texts.sort(key=lambda x: x[1], reverse=True)
        top_texts = relevant_texts[:8]
        
        return " ".join([text[0] for text in top_texts])
    
    def _create_focused_extractive_summary(self, text, dimension, config):
        """Create an extractive summary from the sentences that mention the dimension.

        Each sentence is scored by the number of keywords and focus areas it
        contains (case-insensitive substring match). The four best-scoring
        sentences are returned as one string.

        Args:
            text: Dimension-specific review text.
            dimension: Name of the dimension, used in the fallback message.
            config: Dimension configuration providing ``keywords`` and
                ``focus_areas``.

        Returns:
            str: The selected sentences separated by single spaces, or a
            fixed notice when no sentence mentions the dimension.
        """
        terms = [term.lower() for term in config['keywords'] + config['focus_areas']]

        dimension_sentences = []
        for sentence in safe_sent_tokenize(text):
            s_low = sentence.lower()
            relevance_score = sum(1 for term in terms if term in s_low)
            if relevance_score > 0:
                dimension_sentences.append((sentence.strip(), relevance_score))

        dimension_sentences.sort(key=lambda x: x[1], reverse=True)
        top_sentences = [s for s, _ in dimension_sentences[:4]]

        if not top_sentences:
            return f"Limited {dimension.replace('_',' ').lower()} specific feedback available"

        # Tokenized sentences normally keep their own terminal punctuation, so
        # joining with ". " produced "pay.. Nice team". Add a period only where
        # a sentence lacks one (closing quotes and brackets are ignored).
        closed = [
            s if s.rstrip('"\')]}').endswith(('.', '!', '?')) else s + '.'
            for s in top_sentences
        ]
        return " ".join(closed)
    
    def _create_keyword_based_summary(self, text, dimension, config):
        """Create a summary from the sentences containing dimension keywords or focus areas.

        Sentences are ranked by how many keywords and focus areas they contain
        (case-insensitive substring match). The three best are returned.

        Args:
            text: Dimension-specific review text.
            dimension: Name of the dimension, used in the fallback message.
            config: Dimension configuration providing ``keywords`` and
                ``focus_areas``.

        Returns:
            str: The selected sentences separated by single spaces, or a
            fixed notice when no sentence matches.
        """
        terms = [term.lower() for term in config['keywords'] + config['focus_areas']]

        keyword_sentences = []
        for sentence in safe_sent_tokenize(text):
            s_low = sentence.lower()
            match_count = sum(1 for term in terms if term in s_low)
            if match_count:
                keyword_sentences.append((sentence.strip(), match_count))

        if not keyword_sentences:
            return f"No {dimension.replace('_',' ').lower()} specific content identified"

        keyword_sentences.sort(key=lambda x: x[1], reverse=True)
        top_sentences = [s for s, _ in keyword_sentences[:3]]

        # Tokenized sentences normally keep their own terminal punctuation, so
        # joining with ". " produced "pay.. Nice team". Add a period only where
        # a sentence lacks one (closing quotes and brackets are ignored).
        closed = [
            s if s.rstrip('"\')]}').endswith(('.', '!', '?')) else s + '.'
            for s in top_sentences
        ]
        return " ".join(closed)
    
    def _calculate_strength_score(self, analysis):
        """Calculate overall strength score for a dimension"""
        score = 0.0
        
        if analysis['rating_scores']:
            avg_rating = np.mean(analysis['rating_scores'])
            score += (avg_rating / 5.0) * 0.4
        
        if analysis['text_analysis']:
            all_sentiments = []
            for text_analysis in analysis['text_analysis'].values():
                if text_analysis['sentiment_scores']:
                    all_sentiments.extend(text_analysis['sentiment_scores'])
            
            if all_sentiments:
                avg_sentiment = np.mean(all_sentiments)
                score += (avg_sentiment + 1) * 0.15
        
        total_relevant_content = sum(len(ta['dimension_relevant_sentences']) for ta in analysis['text_analysis'].values())
        content_score = min(total_relevant_content / 5.0, 1.0)
        score += content_score * 0.3
        
        return score
    
    def _classify_sentiment(self, score):
        """Classify sentiment based on strength score"""
        if score >= 0.7:
            return 'positive'
        elif score <= 0.3:
            return 'negative'
        else:
            return 'neutral'


In [None]:
# Progress banner printed when this cell runs, before the company analyzer class is defined.
print("\nPhase 3: Company Analysis and Reporting")

class CompanyAnalyzer:
    def __init__(self, hcf_analyzer):
        """Store the dimension analyzer used for all per-company work.

        Args:
            hcf_analyzer: Object whose ``analyze_company_dimensions`` and
                ``_classify_sentiment`` methods are called by this class.
        """
        self.hcf_analyzer = hcf_analyzer
    
    def analyze_all_companies(self, df):
        """Analyze all companies in the dataset"""
        companies = df['Company'].unique()
        company_analyses = {}
        
        print(f"Analyzing {len(companies)} companies...")
        
        for i, company in enumerate(companies, 1):
            print(f"  {i}/{len(companies)}: Analyzing {company}")
            company_data = df[df['Company'] == company]
            company_analysis = self.hcf_analyzer.analyze_company_dimensions(company_data)
            company_analyses[company] = company_analysis
        
        return company_analyses
    
    def generate_company_report(self, company_name, company_analysis):
        """Generate comprehensive report for a single company"""
        report = f"\nCOMPANY ANALYSIS REPORT: {company_name}\n"
        report += "=" * 80 + "\n"
        
        overall_scores = [analysis['strength_score'] for analysis in company_analysis.values()]
        overall_score = np.mean(overall_scores)
        overall_sentiment = self.hcf_analyzer._classify_sentiment(overall_score)
        
        report += f" Overall COMPANY SCORE: {overall_score:.3f} ({overall_sentiment.upper()})\n\n"
        
        for dimension, analysis in company_analysis.items():
            report += f"   Score: {analysis['strength_score']:.3f} ({analysis['sentiment']})\n"
            report += f"   Description: {analysis['description'][:100]}...\n"
            
            if analysis['rating_scores']:
                report += f"   Rating Average: {np.mean(analysis['rating_scores']):.2f}/5.0\n"
            
            if analysis['summary']:
                report += f"   Summary: {analysis['summary']}\n"
            
            if analysis['model_comparison']:
                report += f"   Model Comparison:\n"
                for model_name, result in analysis['model_comparison'].items():
                    if 'error' not in result:
                        report += f"     {model_name}: {result['time']:.3f}s, {result['compression_ratio']:.2f} ratio\n"
                    else:
                        report += f"     {model_name}: Error - {result['error']}\n"
            
            if analysis['text_analysis']:
                total_relevant = sum(len(ta['dimension_relevant_sentences']) for ta in analysis['text_analysis'].values())
                report += f"   Relevant Content: {total_relevant} sentences\n"
            
            report += "\n"
        
        return report
    
    def generate_model_comparison_summary(self, company_analyses):
        """Generate summary of model performance across all companies"""
        print("\nMODEL COMPARISON SUMMARY:")
        print("=" * 50)
        
        model_stats = {}
        
        for company_name, company_analysis in company_analyses.items():
            for dimension, analysis in company_analysis.items():
                if 'model_comparison' in analysis and analysis['model_comparison']:
                    for model_name, result in analysis['model_comparison'].items():
                        if 'error' not in result:
                            if model_name not in model_stats:
                                model_stats[model_name] = {
                                    'total_time': 0,
                                    'total_compression': 0,
                                    'count': 0,
                                    'errors': 0
                                }
                            
                            model_stats[model_name]['total_time'] += result['time']
                            model_stats[model_name]['total_compression'] += result['compression_ratio']
                            model_stats[model_name]['count'] += 1
                        else:
                            if model_name not in model_stats:
                                model_stats[model_name] = {'errors': 0, 'count': 0}
                            model_stats[model_name]['errors'] += 1
        
        # Display model performance summary
        for model_name, stats in model_stats.items():
            if 'count' in stats and stats['count'] > 0:
                avg_time = stats['total_time'] / stats['count']
                avg_compression = stats['total_compression'] / stats['count']
                print(f"\n{model_name}:")
                print(f"   Average Time: {avg_time:.3f}s")
                print(f"   Average Compression: {avg_compression:.2f}")
                print(f"   Success Rate: {stats['count']}/{stats['count'] + stats.get('errors', 0)}")
            else:
                print(f"\n{model_name}: No successful runs")



In [None]:
# Progress banner printed when this cell runs, before the pipeline entry point is defined.
print("\nPhase 4: Main Execution Pipeline")

def main():
    """Run the whole pipeline: load data, analyze all companies, print reports.

    The candidate CSV paths are tried in order and only the first one that
    loads successfully is analyzed. If none loads, a notice is printed and the
    function returns without further work.

    Output, all on stdout: reports for the three companies with the highest
    mean strength score, the model comparison summary, and the average
    strength score per dimension.
    """

    data_paths = [
        '/content/drive/MyDrive/finM/textData2/gd_sample_sy2017.csv',
        '/content/drive/MyDrive/finM/textData2/gd_sample_sy2018.csv',
        '/content/drive/MyDrive/finM/textData2/gd_sample_sy2019.csv',
        '/content/drive/MyDrive/finM/textData2/gd_sample_sy2020.csv'
    ]

    df = None
    for path in data_paths:
        print(f"🔍 Trying to load data from: {path}")
        try:
            df = load_and_explore_data(path)
        except Exception as exc:
            # Move on to the next candidate, but say why this one was skipped.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            print(f"Skipping {path}: {exc}")
            continue
        if df is not None:
            print(f"Successfully loaded data from: {path}")
            break

    if df is None:
        print("Could not load data from any expected location")
        print(" Please ensure your data file is in the correct path")
        return

    print("\nInitializing HCF Analyzer...")
    hcf_analyzer = EnhancedHCFAnalyzer()

    company_analyzer = CompanyAnalyzer(hcf_analyzer)

    print("\nStarting company analysis...")
    company_analyses = company_analyzer.analyze_all_companies(df)

    print("\nGenerating company reports...")
    # Rank companies by their mean strength score across all dimensions.
    top_companies = sorted(
        company_analyses.keys(),
        key=lambda x: np.mean([a['strength_score'] for a in company_analyses[x].values()]),
        reverse=True
    )[:3]

    for company in top_companies:
        report = company_analyzer.generate_company_report(company, company_analyses[company])
        print(report)

    company_analyzer.generate_model_comparison_summary(company_analyses)

    print("\nANALYSIS SUMMARY:")
    print("=" * 40)
    print(f"Total Companies Analyzed: {len(company_analyses)}")
    print(f"Total Reviews Processed: {len(df)}")

    print("\nDIMENSION PERFORMANCE SUMMARY:")
    print("-" * 40)
    for dimension in hcf_analyzer.hcf_dimensions:
        dimension_scores = [
            company_analysis[dimension]['strength_score']
            for company_analysis in company_analyses.values()
            if dimension in company_analysis
        ]

        if dimension_scores:
            avg_score = np.mean(dimension_scores)
            print(f"{dimension}: {avg_score:.3f}")

    print("\nKey Improvements with Model Comparison:")
    print("• Added DistilBART for faster inference (50% smaller, ~60% faster)")
    print("• Implemented comprehensive model comparison system")
    print("• Performance metrics: time, compression ratio, success rate")
    print("• Side-by-side summary quality comparison")
    print("• Error handling and model availability checking")

# Run the pipeline only when this module is the entry point (__name__ is "__main__").
if __name__ == "__main__":
    main()