# ESG Report Sentiment Analysis: Detecting Greenwashing and Industry Priorities

Install necessary packages and libraries

In [2]:
!pip install PyPDF2 nltk textblob pandas numpy matplotlib seaborn scikit-learn wordcloud vaderSentiment openpyxl

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp313-cp313-win_amd64.whl.metadata (3.5 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 3.6 MB/s eta 0:00:00
Downloading wordcloud-1.9.4-cp313-cp313-win_amd64.whl (300 kB)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: PyPDF2, vaderSentiment, wordcloud, textblob

   ---------------------------------------- 0/4 [PyPDF2]
   ---------------------------------------- 0/4 [

In [5]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# For PDF processing
import PyPDF2

# For NLP
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

# For Sentiment Analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# For TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Downloading required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Simple function to grab synonyms from WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ').lower()
            synonyms.add(synonym)
    return synonyms

# Add synonyms to our keyword list
def expand_keywords_with_synonyms(keywords_list, max_synonyms_per_word=3):
    expanded = set(keywords_list)
    
    for keyword in keywords_list:
        synonyms = get_synonyms(keyword)
        for syn in list(synonyms)[:max_synonyms_per_word]:
            if len(syn) > 2:  # skip super short words
                expanded.add(syn)
    
    return sorted(list(expanded))

# Starting keywords for each ESG category
BASE_ESG_KEYWORDS = {
    'Environmental': [
        'climate', 'carbon', 'emission', 'renewable', 'energy', 
        'sustainability', 'environmental', 'waste', 'recycling', 'pollution',
        'biodiversity', 'ecosystem', 'water', 'greenhouse', 
        'deforestation', 'conservation', 'footprint',
        'sustainable', 'ecological', 'nature', 'ocean', 'forest', 'plastic',
        'solar', 'wind', 'fossil', 'green'
    ],
    'Social': [
        'employee', 'diversity', 'inclusion', 'equity', 'workforce',
        'labor', 'community', 'safety', 'health', 'wellbeing',
        'training', 'development', 'gender', 'equality', 'discrimination',
        'workplace', 'welfare', 'social', 'stakeholder', 'engagement',
        'philanthropy', 'volunteering', 'hiring', 'retention', 'culture',
        'benefits', 'human', 'rights'
    ],
    'Governance': [
        'governance', 'board', 'director', 'ethics', 'compliance',
        'transparency', 'accountability', 'audit', 'risk', 'management',
        'shareholder', 'executive', 'compensation', 'integrity',
        'corruption', 'bribery', 'policy', 'regulation', 'regulatory',
        'oversight', 'independent', 'committee', 'disclosure', 'reporting',
        'ethical', 'leadership'
    ]
}

# Expand keywords with synonyms
print("Expanding keywords with synonyms:\n")
ESG_KEYWORDS = {}

for pillar, keywords in BASE_ESG_KEYWORDS.items():
    expanded = expand_keywords_with_synonyms(keywords, max_synonyms_per_word=2)
    ESG_KEYWORDS[pillar] = expanded
    print(f"{pillar}: {len(keywords)} → {len(expanded)} keywords")

# Add multi-word phrases (WordNet doesn't handle these well)
ADDITIONAL_PHRASES = {
    'Environmental': [
        'renewable energy', 'clean energy', 'net zero', 'decarbonization',
        'circular economy', 'resource efficiency', 'climate change',
        'carbon footprint', 'carbon neutral', 'carbon dioxide', 'co2',
        'ghg emissions', 'global warming', 'environmental impact'
    ],
    'Social': [
        'human rights', 'customer satisfaction', 'supply chain', 'fair trade',
        'living wage', 'local communities', 'work life balance', 'employee engagement',
        'diversity and inclusion', 'pay equity', 'occupational health', 'labor rights',
        'community engagement', 'social responsibility', 'fair labor'
    ],
    'Governance': [
        'corporate governance', 'board of directors', 'risk management',
        'executive compensation', 'conflict of interest', 'code of conduct',
        'internal controls', 'compliance program', 'board independence',
        'shareholder rights', 'corporate ethics', 'whistleblower protection',
        'anti corruption', 'data privacy', 'cyber security'
    ]
}

for pillar, phrases in ADDITIONAL_PHRASES.items():
    ESG_KEYWORDS[pillar].extend(phrases)
    ESG_KEYWORDS[pillar] = sorted(list(set(ESG_KEYWORDS[pillar])))

print(f"\nFinal counts:")
for pillar, keywords in ESG_KEYWORDS.items():
    print(f"  {pillar}: {len(keywords)} keywords")

# Preview some keywords
print(f"\nSample keywords per pillar:")
for pillar, keywords in ESG_KEYWORDS.items():
    print(f"\n{pillar}:")
    print(f"  {', '.join(keywords[:15])}...")

# Words that signal vague commitments (possible greenwashing)
GREENWASHING_INDICATORS = [
    'committed', 'commitment', 'dedication', 'dedicated', 'passionate', 'leading',
    'strive', 'striving', 'world-class', 'best-in-class', 'innovative', 'excellence',
    'endeavor', 'endeavoring', 'working towards', 'aiming', 'planning', 'intend',
    'aspire', 'aspiring', 'believe', 'proud', 'excited', 'promising', 'exploring',
    'journey', 'vision', 'ambition', 'passionate about', 'hope', 'hoping', 'desire',
    'seek', 'seeking', 'continue to', 'ongoing', 'long term', 'future'
]

GREENWASHING_INDICATORS = expand_keywords_with_synonyms(GREENWASHING_INDICATORS, max_synonyms_per_word=2)
print(f"\nGreenwashing indicators: {len(GREENWASHING_INDICATORS)} terms")

# Words that signal concrete action
SUBSTANTIVE_WORDS = [
    'achieved', 'reduced', 'increased', 'implemented', 'completed', 'delivered',
    'measured', 'reported', 'certified', 'audited', 'verified', 'reached',
    'target', 'goal', 'metric', 'data', 'performance', 'result', 'outcome',
    'baseline', 'benchmark', 'kpi', 'indicator', 'quantified', 'tracked',
    'invested', 'spent', 'allocated', 'million', 'billion', 'percent', 'percentage',
    'launched', 'established', 'created', 'installed', 'deployed', 'executed'
]

SUBSTANTIVE_WORDS = expand_keywords_with_synonyms(SUBSTANTIVE_WORDS, max_synonyms_per_word=2)
print(f"Substantive action words: {len(SUBSTANTIVE_WORDS)} terms")

# Save to file for later reference
with open('expanded_esg_keywords.txt', 'w') as f:
    for pillar, keywords in ESG_KEYWORDS.items():
        f.write(f"\n{pillar.upper()} ({len(keywords)} keywords)\n")
        f.write("="*60 + "\n")
        f.write(', '.join(keywords) + '\n')
    
    f.write(f"\nGREENWASHING INDICATORS ({len(GREENWASHING_INDICATORS)} terms)\n")
    f.write("="*60 + "\n")
    f.write(', '.join(GREENWASHING_INDICATORS) + '\n')
    
    f.write(f"\nSUBSTANTIVE WORDS ({len(SUBSTANTIVE_WORDS)} terms)\n")
    f.write("="*60 + "\n")
    f.write(', '.join(SUBSTANTIVE_WORDS) + '\n')

print(f"\nKeywords saved to 'expanded_esg_keywords.txt'")

Expanding keywords with synonyms:

Environmental: 27 → 53 keywords
Social: 28 → 67 keywords
Governance: 26 → 62 keywords

Final counts:
  Environmental: 67 keywords
  Social: 82 keywords
  Governance: 77 keywords

Sample keywords per pillar:

Environmental:
  atomic number 6, biodiversity, bionomic, carbon, carbon dioxide, carbon footprint, carbon neutral, circular economy, clean energy, climate, climate change, clime, co2, conservation, contamination...

Social:
  benefit, benefits, booking, breeding, community, community engagement, comprehension, condom, culture, customer satisfaction, development, discrimination, diverseness, diversity, diversity and inclusion...

Governance:
  accountability, administrator, answerability, anti corruption, audit, board, board independence, board of directors, brass, bribery, citizens committee, code of conduct, committee, compensation, compliance...

Greenwashing indicators: 80 terms
Substantive action words: 94 terms

Keywords saved to 'expanded_e

In [9]:
# Extract text from PDF file
def extract_text_from_pdf(pdf_path):

    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            
            for page_num in range(num_pages):
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    text += page_text + " "
        
        return text, num_pages
    
    except Exception as e:
        print(f"Error reading {pdf_path}: {str(e)}")
        return "", 0

# Clean up extracted text
def clean_text(text):
    # Fix spacing issues
    text = re.sub(r'\s+', ' ', text)
    # Keep letters, numbers, periods, commas, percent signs
    text = re.sub(r'[^\w\s\.\,\%]', ' ', text)
    # Remove standalone numbers (but keep percentages)
    text = re.sub(r'\b\d+\b', '', text)
    return text.strip()

In [10]:
# Analyze sentiment using VADER and TextBlob
def analyze_sentiment(text):  
    # VADER sentiment scores
    vader = SentimentIntensityAnalyzer()
    vader_scores = vader.polarity_scores(text)
    
    # TextBlob sentiment
    blob = TextBlob(text)
    textblob_polarity = blob.sentiment.polarity
    textblob_subjectivity = blob.sentiment.subjectivity
    
    # Sentence-level analysis
    sentences = sent_tokenize(text)
    
    if len(sentences) > 0:
        sentence_sentiments = [TextBlob(sent).sentiment.polarity for sent in sentences]
        positive_sentences = sum(1 for s in sentence_sentiments if s > 0.1)
        negative_sentences = sum(1 for s in sentence_sentiments if s < -0.1)
        neutral_sentences = len(sentences) - positive_sentences - negative_sentences
        avg_sentiment = np.mean(sentence_sentiments)
        sentiment_std = np.std(sentence_sentiments)
    else:
        sentence_sentiments = [0]
        positive_sentences = 0
        negative_sentences = 0
        neutral_sentences = 0
        avg_sentiment = 0
        sentiment_std = 0
    
    return {
        'vader_compound': vader_scores['compound'],
        'vader_positive': vader_scores['pos'],
        'vader_negative': vader_scores['neg'],
        'vader_neutral': vader_scores['neu'],
        'textblob_polarity': textblob_polarity,
        'textblob_subjectivity': textblob_subjectivity,
        'avg_sentence_sentiment': avg_sentiment,
        'sentence_sentiment_std': sentiment_std,
        'total_sentences': len(sentences),
        'positive_sentences': positive_sentences,
        'negative_sentences': negative_sentences,
        'neutral_sentences': neutral_sentences,
        'positive_sentences_pct': (positive_sentences / len(sentences) * 100) if len(sentences) > 0 else 0
    }

In [11]:
# Detect greenwashing by comparing aspirational vs concrete language
def detect_greenwashing(text):
   
    text_lower = text.lower()
    words = word_tokenize(text_lower)
    total_words = len(words)
    
    if total_words == 0:
        return {
            'greenwashing_indicators': 0,
            'substantive_words': 0,
            'greenwashing_density': 0,
            'substantive_density': 0,
            'greenwashing_ratio': 0,
            'risk_level': 'Unknown',
            'risk_score': 0
        }
    
    # Count vague/aspirational language
    greenwashing_count = 0
    for indicator in GREENWASHING_INDICATORS:
        greenwashing_count += text_lower.count(indicator)
    
    # Count concrete actions/metrics
    substantive_count = 0
    for word in SUBSTANTIVE_WORDS:
        substantive_count += text_lower.count(word)
    
    # Calculate density per 1000 words
    greenwashing_density = (greenwashing_count / total_words) * 1000
    substantive_density = (substantive_count / total_words) * 1000
    
    # Calculate ratio: high ratio = more fluff than substance
    if substantive_count > 0:
        greenwashing_ratio = greenwashing_count / substantive_count
    else:
        greenwashing_ratio = greenwashing_count if greenwashing_count > 0 else 0
    
    # Risk assessment
    risk_score = (greenwashing_ratio * 0.6) + (greenwashing_density * 0.4)
    
    if greenwashing_ratio > 1.5 or greenwashing_density > 15:
        risk_level = 'High'
    elif greenwashing_ratio > 0.8 or greenwashing_density > 8:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'
    
    return {
        'greenwashing_indicators': greenwashing_count,
        'substantive_words': substantive_count,
        'greenwashing_density': round(greenwashing_density, 2),
        'substantive_density': round(substantive_density, 2),
        'greenwashing_ratio': round(greenwashing_ratio, 2),
        'risk_level': risk_level,
        'risk_score': round(risk_score, 2)
    }

In [13]:
# Calculate which ESG pillars the company focuses on
def calculate_esg_importance(text, company_name="Company"):
   
    text_lower = text.lower()
    
    # Count keyword mentions for each pillar
    esg_counts = {}
    for pillar, keywords in ESG_KEYWORDS.items():
        count = 0
        for keyword in keywords:
            count += text_lower.count(keyword.lower())
        esg_counts[pillar] = count
    
    total_keywords = sum(esg_counts.values())
    
    if total_keywords == 0:
        return {
            'Environmental_count': 0,
            'Social_count': 0,
            'Governance_count': 0,
            'Environmental_pct': 0,
            'Social_pct': 0,
            'Governance_pct': 0,
            'dominant_pillar': 'None',
            'total_esg_keywords': 0
        }
    
    # Calculate percentages
    env_pct = (esg_counts['Environmental'] / total_keywords) * 100
    soc_pct = (esg_counts['Social'] / total_keywords) * 100
    gov_pct = (esg_counts['Governance'] / total_keywords) * 100
    
    # Find dominant pillar
    dominant_pillar = max(esg_counts.items(), key=lambda x: x[1])[0]
    
    return {
        'Environmental_count': esg_counts['Environmental'],
        'Social_count': esg_counts['Social'],
        'Governance_count': esg_counts['Governance'],
        'Environmental_pct': round(env_pct, 2),
        'Social_pct': round(soc_pct, 2),
        'Governance_pct': round(gov_pct, 2),
        'dominant_pillar': dominant_pillar,
        'total_esg_keywords': total_keywords
    }

In [14]:
# Using TF-IDF to compare ESG focus across companies
def tfidf_esg_analysis(texts_dict):
   
    companies = list(texts_dict.keys())
    documents = list(texts_dict.values())
    
    results = []
    
    for pillar, keywords in ESG_KEYWORDS.items():
        # TF-IDF with ESG keywords as vocabulary
        vectorizer = TfidfVectorizer(
            vocabulary=keywords,
            lowercase=True,
            token_pattern=r'\b\w+\b'
        )
        
        try:
            tfidf_matrix = vectorizer.fit_transform(documents)
            
            # Get scores for each company
            for idx, company in enumerate(companies):
                doc_scores = tfidf_matrix[idx].toarray().flatten()
                mean_tfidf = np.mean(doc_scores) if len(doc_scores) > 0 else 0
                max_tfidf = np.max(doc_scores) if len(doc_scores) > 0 else 0
                
                results.append({
                    'company': company,
                    'pillar': pillar,
                    'tfidf_mean': round(mean_tfidf, 4),
                    'tfidf_max': round(max_tfidf, 4)
                })
        
        except Exception as e:
            print(f"TF-IDF issue for {pillar}: {e}")
            for company in companies:
                results.append({
                    'company': company,
                    'pillar': pillar,
                    'tfidf_mean': 0,
                    'tfidf_max': 0
                })
    
    return pd.DataFrame(results)

In [15]:
def analyze_single_esg_report(pdf_path, company_name, industry, controversy_level="Low"):
    """Run complete analysis on one ESG report"""
    
    print(f"\n{'='*60}")
    print(f"Analyzing: {company_name}")
    print(f"Industry: {industry} | Controversy: {controversy_level}")
    print(f"{'='*60}")
    
    # Extract text
    text, num_pages = extract_text_from_pdf(pdf_path)
    if not text or len(text) < 100:
        print("Failed to extract text")
        return None
    
    clean_text_content = clean_text(text)
    word_count = len(word_tokenize(clean_text_content))
    
    print(f"Extracted {len(text):,} characters from {num_pages} pages")
    print(f"Word count: {word_count:,}")
    
    # Sentiment analysis
    sentiment = analyze_sentiment(clean_text_content)
    print(f"\nSentiment:")
    print(f"  VADER: {sentiment['vader_compound']:.3f}")
    print(f"  TextBlob: {sentiment['textblob_polarity']:.3f}")
    print(f"  Positive sentences: {sentiment['positive_sentences_pct']:.1f}%")
    
    # Greenwashing detection
    greenwashing = detect_greenwashing(clean_text_content)
    print(f"\nGreenwashing:")
    print(f"  Risk: {greenwashing['risk_level']}")
    print(f"  Ratio: {greenwashing['greenwashing_ratio']:.2f}")
    print(f"  Aspirational words: {greenwashing['greenwashing_indicators']}")
    print(f"  Substantive words: {greenwashing['substantive_words']}")
    
    # ESG pillar analysis
    esg_importance = calculate_esg_importance(clean_text_content, company_name)
    print(f"\nESG Focus:")
    print(f"  Environmental: {esg_importance['Environmental_pct']:.1f}%")
    print(f"  Social: {esg_importance['Social_pct']:.1f}%")
    print(f"  Governance: {esg_importance['Governance_pct']:.1f}%")
    print(f"  Dominant: {esg_importance['dominant_pillar']}")
    
    # Compile results
    results = {
        'company_name': company_name,
        'industry': industry,
        'controversy_level': controversy_level,
        'num_pages': num_pages,
        'word_count': word_count,
        'text_content': clean_text_content,
    }
    
    results.update(sentiment)
    results.update(greenwashing)
    results.update(esg_importance)
    
    print(f"\nAnalysis complete for {company_name}\n")
    
    return results

In [16]:
#Analyze all ESG reports for one industry
def analyze_industry_reports(pdf_folder, industry_name, company_info):
   
    print(f"\n{'#'*60}")
    print(f"INDUSTRY: {industry_name}")
    print(f"{'#'*60}")
    
    all_results = []
    texts_for_tfidf = {}
    
    for filename, (company_name, controversy) in company_info.items():
        pdf_path = os.path.join(pdf_folder, filename)
        
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            continue
        
        # Analyze report
        result = analyze_single_esg_report(pdf_path, company_name, industry_name, controversy)
        
        if result:
            all_results.append(result)
            texts_for_tfidf[company_name] = result['text_content']
    
    # Create results dataframe
    df = pd.DataFrame(all_results)
    
    # TF-IDF analysis across companies
    if len(texts_for_tfidf) > 1:
        print(f"\nRunning TF-IDF analysis across {len(texts_for_tfidf)} companies...")
        tfidf_df = tfidf_esg_analysis(texts_for_tfidf)
    else:
        tfidf_df = pd.DataFrame()
    
    print(f"\n{'='*60}")
    print(f"Industry analysis complete: {industry_name}")
    print(f"Companies analyzed: {len(all_results)}")
    print(f"{'='*60}\n")
    
    return df, tfidf_df

In [18]:
# Analyze all ESG reports for one industry
def analyze_industry_reports(pdf_folder, industry_name, company_info):
   
    print(f"\n{'#'*60}")
    print(f"INDUSTRY: {industry_name}")
    print(f"{'#'*60}")
    
    all_results = []
    texts_for_tfidf = {}
    
    for filename, (company_name, controversy) in company_info.items():
        pdf_path = os.path.join(pdf_folder, filename)
        
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            continue
        
        # Analyze report
        result = analyze_single_esg_report(pdf_path, company_name, industry_name, controversy)
        
        if result:
            all_results.append(result)
            texts_for_tfidf[company_name] = result['text_content']
    
    # Create results dataframe
    df = pd.DataFrame(all_results)
    
    # TF-IDF analysis across companies
    if len(texts_for_tfidf) > 1:
        print(f"\nRunning TF-IDF analysis across {len(texts_for_tfidf)} companies...")
        tfidf_df = tfidf_esg_analysis(texts_for_tfidf)
    else:
        tfidf_df = pd.DataFrame()
    
    print(f"\n{'='*60}")
    print(f"Industry analysis complete: {industry_name}")
    print(f"Companies analyzed: {len(all_results)}")
    print(f"{'='*60}\n")
    
    return df, tfidf_df


In [None]:
# ============================================
# CONFIGURE YOUR PROJECT HERE
# ============================================

# Your industry name
INDUSTRY_NAME = "Fashion Retail"

# Path to folder with PDF files
PDF_FOLDER = "./Fashion Retail"  # Update this to your actual path

# Map PDF filenames to (company_name, controversy_level)
# controversy_level: "Low", "Medium", or "High"
COMPANY_INFO = {
    "Aritzia.pdf": ("Aritzia", "Low"),
    "FastRetailing.pdf": ("Fast Retailing (Uniqlo)", "Medium"),
    "GAPInc.pdf": ("GAP Inc", "Medium"),
    "H&M.pdf": ("H&M", "High"),
    "Inditex.pdf": ("Inditex (Zara)", "Medium"),
    "Levis.pdf": ("Levi's", "Low"),
    "LMVH.pdf": ("LVMH", "Low"),
    "Lululemon.pdf": ("Lululemon", "Low"),
    "M&S.pdf": ("Marks & Spencer", "Low"),
    "Patagonia.pdf": ("Patagonia", "Low"),
    "Prada.pdf": ("Prada", "Low"),
    "RalphLauren.pdf": ("Ralph Lauren", "Low"),
    "Shein.pdf": ("Shein", "High"),
    "TJX.pdf": ("TJX Companies", "Medium"),
    "VSCo.pdf": ("Victoria's Secret", "Medium"),
}

print("Configuration:")
print(f"  Industry: {INDUSTRY_NAME}")
print(f"  PDF folder: {PDF_FOLDER}")
print(f"  Companies: {len(COMPANY_INFO)}")
print(f"\nCompanies to analyze:")
for filename, (company, controversy) in COMPANY_INFO.items():
    print(f"  - {company} ({controversy} controversy)")

In [None]:
# Run complete analysis
results_df, tfidf_df = analyze_industry_reports(PDF_FOLDER, INDUSTRY_NAME, COMPANY_INFO)

# Show results preview
print("\n" + "="*60)
print("RESULTS PREVIEW")
print("="*60)
display(results_df.head())

# Show key columns
print("\nKey Metrics:")
key_cols = ['company_name', 'controversy_level', 'vader_compound', 
            'risk_level', 'greenwashing_ratio', 'dominant_pillar']
display(results_df[key_cols])