**IMPORTS AND SETUP**

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import re
import json
from collections import Counter
import warnings
import os
import chardet

warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

**DETECT FILE ENCODING**

In [7]:
print("üîç Detecting file encoding...")

# First, let's detect the encoding
with open('all-data.csv', 'rb') as f:
    raw_data = f.read()
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence']
    
print(f"   Detected encoding: {encoding} (confidence: {confidence:.2%})")
print(f"   File size: {len(raw_data)} bytes")

# Try different encodings if needed
encodings_to_try = [encoding, 'latin-1', 'ISO-8859-1', 'cp1252', 'utf-8']

for enc in encodings_to_try:
    try:
        print(f"\nTrying encoding: {enc}")
        df = pd.read_csv('all-data.csv', names=['sentiment', 'text'], encoding=enc)
        print(f"   Success! Loaded {len(df)} rows")
        break
    except Exception as e:
        print(f"   Failed: {str(e)[:100]}...")

üîç Detecting file encoding...
   Detected encoding: Windows-1252 (confidence: 73.00%)
   File size: 672006 bytes

Trying encoding: Windows-1252
   Success! Loaded 4846 rows


**DATA LOADING WITH PROPER ENCODING**

In [8]:
print("\nüìä Loading housing sentiment data...")

# Use latin-1 which handles most Western European characters
try:
    df = pd.read_csv('all-data.csv', names=['sentiment', 'text'], encoding='latin-1')
    print(f"‚úÖ Dataset loaded successfully with latin-1 encoding!")
except:
    try:
        df = pd.read_csv('all-data.csv', names=['sentiment', 'text'], encoding='ISO-8859-1')
        print(f"‚úÖ Dataset loaded successfully with ISO-8859-1 encoding!")
    except:
        df = pd.read_csv('all-data.csv', names=['sentiment', 'text'], encoding='utf-8', errors='ignore')
        print(f"‚úÖ Dataset loaded successfully with utf-8 encoding (errors ignored)!")

print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns.tolist()}")
print(f"\nFirst 3 rows:")
print(df.head(3))
print(f"\nSentiment distribution:")
print(df['sentiment'].value_counts())


üìä Loading housing sentiment data...
‚úÖ Dataset loaded successfully with latin-1 encoding!
   Shape: (4846, 2)
   Columns: ['sentiment', 'text']

First 3 rows:
  sentiment                                               text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...

Sentiment distribution:
sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64


**DATA CLEANING**

In [9]:
def clean_text(text):
    """Clean text data"""
    if not isinstance(text, str):
        return ""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove quotes and special characters
    text = text.strip('"\'').strip()
    return text

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_text)

# Map sentiment to numeric scores
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
df['sentiment_score'] = df['sentiment'].map(label_map)

print(f"‚úÖ Data cleaning completed!")
print(f"   Cleaned {len(df)} articles")

‚úÖ Data cleaning completed!
   Cleaned 4846 articles


**ADD TEMPORAL FEATURES**

In [10]:
# Create simulated dates (since dataset doesn't have real dates)
np.random.seed(42)
start_date = datetime(2018, 1, 1)
end_date = datetime(2020, 12, 31)
date_range = pd.date_range(start=start_date, end=end_date, periods=len(df))
df['date'] = date_range

# Add temporal features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.to_period('M')
df['week'] = df['date'].dt.to_period('W')
df['quarter'] = df['date'].dt.to_period('Q')
df['day_of_week'] = df['date'].dt.day_name()

print(f"‚úÖ Temporal features added!")
print(f"   Date range: {df['date'].min().date()} to {df['date'].max().date()}")

‚úÖ Temporal features added!
   Date range: 2018-01-01 to 2020-12-31


**HOUSING KEYWORD DETECTION**

In [11]:
# Define housing and finance keywords
housing_keywords = [
    'housing', 'mortgage', 'real estate', 'property', 'rent', 
    'apartment', 'house', 'construction', 'loan', 'interest rate',
    'investment', 'market', 'price', 'sale', 'buy', 'sell',
    'development', 'building', 'home', 'residential', 'commercial',
    'lease', 'tenant', 'landlord', 'finance', 'bank', 'credit',
    'equity', 'foreclosure', 'refinance', 'down payment', 'zoning',
    'affordable housing', 'rental', 'vacancy', 'appraisal', 'title',
    'closing', 'escrow', 'homeowner', 'condominium', 'townhouse'
]

# Detect housing-related articles
def contains_housing_keywords(text):
    """Check if text contains housing/finance keywords"""
    text_lower = text.lower()
    for keyword in housing_keywords:
        if keyword in text_lower:
            return True
    return False

df['contains_housing'] = df['cleaned_text'].apply(contains_housing_keywords)
df['housing_keywords'] = df['cleaned_text'].apply(
    lambda x: [k for k in housing_keywords if k in x.lower()]
)

# Count keyword occurrences
keyword_counts = {}
for keywords in df['housing_keywords']:
    for keyword in keywords:
        keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

print(f"‚úÖ Housing keyword analysis completed!")
print(f"   Housing-related articles: {df['contains_housing'].sum()} "
      f"({df['contains_housing'].mean():.1%})")
print(f"\nTop 10 housing keywords:")
for keyword, count in sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"   {keyword}: {count}")

‚úÖ Housing keyword analysis completed!
   Housing-related articles: 1664 (34.3%)

Top 10 housing keywords:
   sale: 481
   market: 307
   rent: 141
   bank: 130
   investment: 124
   price: 107
   construction: 104
   lease: 99
   development: 92
   building: 84


**SENTIMENT DISTRIBUTION ANALYSIS**

In [14]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Overall Sentiment Distribution', 
                    'Sentiment by Housing Context',
                    'Monthly Sentiment Trend',
                    'Sentiment Composition Over Time'),
    specs=[[{'type': 'pie'}, {'type': 'bar'}],
           [{'type': 'scatter'}, {'type': 'scatter'}]]
)

# 1. Overall sentiment pie chart
sentiment_counts = df['sentiment'].value_counts()
fig.add_trace(
    go.Pie(labels=sentiment_counts.index, 
           values=sentiment_counts.values,
           marker_colors=['#4CAF50', '#FFC107', '#F44336'],
           name="Overall"),
    row=1, col=1
)

# 2. Sentiment by housing context
housing_sentiment = df[df['contains_housing']]['sentiment'].value_counts()
non_housing_sentiment = df[~df['contains_housing']]['sentiment'].value_counts()

fig.add_trace(
    go.Bar(x=housing_sentiment.index, y=housing_sentiment.values,
           name='Housing Articles', marker_color='#2196F3'),
    row=1, col=2
)

fig.add_trace(
    go.Bar(x=non_housing_sentiment.index, y=non_housing_sentiment.values,
           name='Non-Housing Articles', marker_color='#9E9E9E'),
    row=1, col=2
)

# 3. Monthly sentiment trend
monthly_sentiment = df.groupby('month')['sentiment_score'].mean().reset_index()
monthly_sentiment['month_date'] = monthly_sentiment['month'].dt.to_timestamp()

fig.add_trace(
    go.Scatter(x=monthly_sentiment['month_date'], y=monthly_sentiment['sentiment_score'],
               mode='lines+markers', name='Monthly HSI',
               line=dict(color='#E91E63', width=3),
               marker=dict(size=6)),
    row=2, col=1
)

# 4. Sentiment composition over time
weekly_sentiment = df.groupby('week').agg({
    'sentiment': lambda x: (x == 'positive').mean()
}).reset_index()
weekly_sentiment['week_date'] = weekly_sentiment['week'].dt.to_timestamp()

fig.add_trace(
    go.Scatter(x=weekly_sentiment['week_date'], y=weekly_sentiment['sentiment'],
               mode='lines', name='Positive Ratio',
               line=dict(color='#8BC34A', width=2),
               fill='tozeroy'),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=800,
    title_text="Housing Sentiment Analysis Dashboard",
    showlegend=True,
    template='plotly_white'
)

fig.update_xaxes(title_text="Date", row=2, col=1)
fig.update_xaxes(title_text="Date", row=2, col=2)
fig.update_yaxes(title_text="Sentiment Score", row=2, col=1)
fig.update_yaxes(title_text="Positive Ratio", row=2, col=2)

fig.show()

print("üìà Interactive dashboard generated!")

üìà Interactive dashboard generated!


**HOUSING SENTIMENT INDEX (HSI) CALCULATION**

In [15]:
# Calculate weekly HSI
weekly_hsi = df.groupby('week').agg({
    'sentiment_score': ['mean', 'std', 'count'],
    'sentiment': lambda x: {
        'positive': (x == 'positive').sum(),
        'neutral': (x == 'neutral').sum(),
        'negative': (x == 'negative').sum()
    }
}).reset_index()

weekly_hsi.columns = ['week', 'hsi_mean', 'hsi_std', 'article_count', 'sentiment_distribution']
weekly_hsi['week_date'] = weekly_hsi['week'].dt.to_timestamp()
weekly_hsi['hsi_rolling'] = weekly_hsi['hsi_mean'].rolling(window=4, center=True).mean()

# Calculate monthly HSI
monthly_hsi = df.groupby('month').agg({
    'sentiment_score': ['mean', 'std', 'count'],
    'contains_housing': 'sum'
}).reset_index()
monthly_hsi.columns = ['month', 'hsi_mean', 'hsi_std', 'article_count', 'housing_articles']
monthly_hsi['month_date'] = monthly_hsi['month'].dt.to_timestamp()

print(f"‚úÖ HSI calculated!")
print(f"   Weekly HSI points: {len(weekly_hsi)}")
print(f"   Monthly HSI points: {len(monthly_hsi)}")

‚úÖ HSI calculated!
   Weekly HSI points: 157
   Monthly HSI points: 36


**ADVANCED VISUALIZATION - HSI TRENDS**

In [17]:
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=('Housing Sentiment Index (HSI) - Weekly',
                    'HSI Volatility and Article Volume',
                    'Housing vs Non-Housing Sentiment'),
    vertical_spacing=0.1,
    row_heights=[0.4, 0.3, 0.3]
)

# 1. Weekly HSI with confidence intervals
fig.add_trace(
    go.Scatter(x=weekly_hsi['week_date'], y=weekly_hsi['hsi_mean'],
               mode='lines', name='Weekly HSI',
               line=dict(color='#2196F3', width=2),
               fillcolor='rgba(33, 150, 243, 0.3)',
               fill='tozeroy'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=weekly_hsi['week_date'], y=weekly_hsi['hsi_rolling'],
               mode='lines', name='4-Week Moving Avg',
               line=dict(color='#1976D2', width=3)),
    row=1, col=1
)

# Add confidence intervals
fig.add_trace(
    go.Scatter(
        x=weekly_hsi['week_date'],
        y=weekly_hsi['hsi_mean'] + weekly_hsi['hsi_std'],
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip'
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(
        x=weekly_hsi['week_date'],
        y=weekly_hsi['hsi_mean'] - weekly_hsi['hsi_std'],
        mode='lines',
        line=dict(width=0),
        fillcolor='rgba(33, 150, 243, 0.2)',
        fill='tonexty',
        showlegend=False,
        hoverinfo='skip'
    ),
    row=1, col=1
)

# 2. Article volume and volatility
fig.add_trace(
    go.Bar(x=weekly_hsi['week_date'], y=weekly_hsi['article_count'],
           name='Article Count', marker_color='#607D8B'),
    row=2, col=1
)

fig.add_trace(
    go.Scatter(x=weekly_hsi['week_date'], y=weekly_hsi['hsi_std'],
               mode='lines', name='HSI Volatility',
               line=dict(color='#F44336', width=2),
               yaxis='y2'),
    row=2, col=1
)

# 3. Housing vs non-housing sentiment
housing_weekly = df[df['contains_housing']].groupby('week')['sentiment_score'].mean().reset_index()
non_housing_weekly = df[~df['contains_housing']].groupby('week')['sentiment_score'].mean().reset_index()

housing_weekly['week_date'] = housing_weekly['week'].dt.to_timestamp()
non_housing_weekly['week_date'] = non_housing_weekly['week'].dt.to_timestamp()

fig.add_trace(
    go.Scatter(x=housing_weekly['week_date'], y=housing_weekly['sentiment_score'],
               mode='lines', name='Housing Articles',
               line=dict(color='#4CAF50', width=2)),
    row=3, col=1
)

fig.add_trace(
    go.Scatter(x=non_housing_weekly['week_date'], y=non_housing_weekly['sentiment_score'],
               mode='lines', name='Non-Housing Articles',
               line=dict(color='#9E9E9E', width=2)),
    row=3, col=1
)

# Update layout
fig.update_layout(
    height=1000,
    title_text="Advanced HSI Analysis Dashboard",
    showlegend=True,
    template='plotly_white',
    hovermode='x unified'
)

# Update axes
fig.update_xaxes(title_text="Date", row=3, col=1)
fig.update_yaxes(title_text="HSI Score", row=1, col=1)
fig.update_yaxes(title_text="Article Count", row=2, col=1)
fig.update_yaxes(title_text="Volatility", row=2, col=1, secondary_y=True)
fig.update_yaxes(title_text="Sentiment Score", row=3, col=1)

fig.show()

**TOP ARTICLES ANALYSIS**

In [18]:
# Extract top positive and negative articles
top_positive = df.nlargest(10, 'sentiment_score')[['date', 'sentiment', 'cleaned_text', 'contains_housing']]
top_negative = df.nsmallest(10, 'sentiment_score')[['date', 'sentiment', 'cleaned_text', 'contains_housing']]

# Create DataFrames for display
top_articles_df = pd.concat([
    top_positive.assign(type='positive'),
    top_negative.assign(type='negative')
])

# Create interactive table
fig = go.Figure(data=[go.Table(
    header=dict(
        values=['Type', 'Date', 'Sentiment', 'Housing-Related', 'Text Preview'],
        fill_color='#2196F3',
        font=dict(color='white', size=12),
        align='left'
    ),
    cells=dict(
        values=[
            top_articles_df['type'],
            top_articles_df['date'].dt.date,
            top_articles_df['sentiment'],
            top_articles_df['contains_housing'].map({True: 'Yes', False: 'No'}),
            top_articles_df['cleaned_text'].str[:100] + '...'
        ],
        fill_color=['white', 'white', 'white', 'white', 'white'],
        align='left',
        font=dict(size=11)
    )
)])

fig.update_layout(
    title_text="Top Positive and Negative Articles",
    height=600
)

fig.show()

print("\nüìã Top Articles Summary:")
print(f"   Positive articles: {len(top_positive)}")
print(f"   Negative articles: {len(top_negative)}")
print(f"   Housing-related in top positive: {top_positive['contains_housing'].sum()}")
print(f"   Housing-related in top negative: {top_negative['contains_housing'].sum()}")


üìã Top Articles Summary:
   Positive articles: 10
   Negative articles: 10
   Housing-related in top positive: 6
   Housing-related in top negative: 5


**SEASONAL AND TREND ANALYSIS**

In [19]:
# Seasonal analysis
df['month_num'] = df['date'].dt.month
seasonal_analysis = df.groupby('month_num').agg({
    'sentiment_score': ['mean', 'std', 'count'],
    'contains_housing': 'mean'
}).reset_index()

seasonal_analysis.columns = ['month', 'sentiment_mean', 'sentiment_std', 
                             'article_count', 'housing_ratio']

# Create seasonal visualization
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Monthly Sentiment Pattern',
                    'Housing Article Ratio by Month',
                    'Year-over-Year Comparison',
                    'Weekly Seasonality'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'heatmap'}]]
)

# 1. Monthly sentiment pattern
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig.add_trace(
    go.Bar(x=month_names, y=seasonal_analysis['sentiment_mean'],
           error_y=dict(type='data', array=seasonal_analysis['sentiment_std']),
           name='Monthly Sentiment',
           marker_color='#FF9800'),
    row=1, col=1
)

# 2. Housing article ratio by month
fig.add_trace(
    go.Bar(x=month_names, y=seasonal_analysis['housing_ratio'],
           name='Housing Article Ratio',
           marker_color='#4CAF50'),
    row=1, col=2
)

# 3. Year-over-year comparison
yearly_sentiment = df.groupby('year')['sentiment_score'].agg(['mean', 'count']).reset_index()

fig.add_trace(
    go.Bar(x=yearly_sentiment['year'].astype(str), 
           y=yearly_sentiment['mean'],
           name='Yearly Sentiment',
           marker_color=['#F44336', '#2196F3', '#4CAF50']),
    row=2, col=1
)

# Add article count as text
for i, row in yearly_sentiment.iterrows():
    fig.add_annotation(
        x=row['year'],
        y=row['mean'],
        text=f"{int(row['count'])}",
        showarrow=False,
        yshift=10,
        row=2, col=1
    )

# 4. Weekly seasonality heatmap
weekly_seasonality = df.groupby(['year', 'week'])['sentiment_score'].mean().reset_index()
weekly_pivot = weekly_seasonality.pivot(index='week', columns='year', values='sentiment_score')

fig.add_trace(
    go.Heatmap(z=weekly_pivot.values,
               x=weekly_pivot.columns.astype(str),
               y=[str(w) for w in weekly_pivot.index],
               colorscale='RdYlGn',
               name='Weekly HSI Heatmap'),
    row=2, col=2
)

fig.update_layout(
    height=800,
    title_text="Seasonal and Trend Analysis",
    showlegend=True,
    template='plotly_white'
)

fig.show()

**SAVE PROCESSED DATA FOR API**

In [20]:
# Prepare data for API
hsi_data = {
    'weekly': weekly_hsi[['week_date', 'hsi_mean', 'hsi_std', 'hsi_rolling', 'article_count']]
               .rename(columns={'week_date': 'date'})
               .to_dict('records'),
    'monthly': monthly_hsi[['month_date', 'hsi_mean', 'hsi_std', 'article_count', 'housing_articles']]
                .rename(columns={'month_date': 'date'})
                .to_dict('records'),
    'daily': df[['date', 'sentiment_score', 'sentiment', 'contains_housing', 'cleaned_text']]
              .rename(columns={'cleaned_text': 'text'})
              .head(1000)  # Limit for API
              .to_dict('records'),
    'keywords': dict(sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:20]),
    'summary': {
        'total_articles': len(df),
        'positive_count': (df['sentiment'] == 'positive').sum(),
        'negative_count': (df['sentiment'] == 'negative').sum(),
        'neutral_count': (df['sentiment'] == 'neutral').sum(),
        'housing_related_count': df['contains_housing'].sum(),
        'overall_sentiment': float(df['sentiment_score'].mean()),
        'housing_sentiment': float(df[df['contains_housing']]['sentiment_score'].mean()),
        'non_housing_sentiment': float(df[~df['contains_housing']]['sentiment_score'].mean()),
        'date_range': {
            'start': df['date'].min().strftime('%Y-%m-%d'),
            'end': df['date'].max().strftime('%Y-%m-%d')
        },
        'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
}

# Save to JSON file
with open('hsi_processed_data.json', 'w') as f:
    json.dump(hsi_data, f, indent=2, default=str)

print(f"‚úÖ Data saved to 'hsi_processed_data.json'")
print(f"\nüìä SUMMARY STATISTICS:")
for key, value in hsi_data['summary'].items():
    if isinstance(value, dict):
        print(f"   {key}:")
        for sub_key, sub_value in value.items():
            print(f"     {sub_key}: {sub_value}")
    else:
        print(f"   {key}: {value}")

# Save processed DataFrame for reference
df.to_csv('processed_housing_sentiment.csv', index=False)
print(f"\n‚úÖ Processed data saved to 'processed_housing_sentiment.csv'")

‚úÖ Data saved to 'hsi_processed_data.json'

üìä SUMMARY STATISTICS:
   total_articles: 4846
   positive_count: 1363
   negative_count: 604
   neutral_count: 2879
   housing_related_count: 1664
   overall_sentiment: 0.1566240198101527
   housing_sentiment: 0.18028846153846154
   non_housing_sentiment: 0.14424890006285354
   date_range:
     start: 2018-01-01
     end: 2020-12-31
   analysis_date: 2025-12-05 23:37:45

‚úÖ Processed data saved to 'processed_housing_sentiment.csv'


**REATE DASHBOARD HTML REPORT**

In [21]:
# Create a simple HTML dashboard
html_report = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Housing Sentiment Index Dashboard</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 40px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        .header {{
            text-align: center;
            margin-bottom: 40px;
            padding-bottom: 20px;
            border-bottom: 3px solid #2196F3;
        }}
        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 40px;
        }}
        .stat-card {{
            background: #f8f9fa;
            padding: 20px;
            border-radius: 8px;
            border-left: 4px solid #2196F3;
        }}
        .stat-card h3 {{
            margin-top: 0;
            color: #333;
        }}
        .stat-value {{
            font-size: 28px;
            font-weight: bold;
            color: #2196F3;
        }}
        .stat-label {{
            color: #666;
            font-size: 14px;
        }}
        .section {{
            margin-bottom: 40px;
        }}
        .section-title {{
            color: #333;
            border-bottom: 2px solid #eee;
            padding-bottom: 10px;
            margin-bottom: 20px;
        }}
        .insights {{
            background: #e3f2fd;
            padding: 20px;
            border-radius: 8px;
            margin-bottom: 30px;
        }}
        .insight-item {{
            margin-bottom: 10px;
            padding-left: 20px;
            position: relative;
        }}
        .insight-item:before {{
            content: "‚Ä¢";
            color: #2196F3;
            font-size: 20px;
            position: absolute;
            left: 0;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }}
        th, td {{
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }}
        th {{
            background-color: #2196F3;
            color: white;
        }}
        tr:hover {{
            background-color: #f5f5f5;
        }}
        .positive {{
            color: #4CAF50;
            font-weight: bold;
        }}
        .negative {{
            color: #F44336;
            font-weight: bold;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>üè† Housing Sentiment Index Dashboard</h1>
            <p>Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        </div>
        
        <div class="stats-grid">
            <div class="stat-card">
                <h3>Total Articles</h3>
                <div class="stat-value">{hsi_data['summary']['total_articles']}</div>
                <div class="stat-label">Analyzed</div>
            </div>
            <div class="stat-card">
                <h3>Overall Sentiment</h3>
                <div class="stat-value {'positive' if hsi_data['summary']['overall_sentiment'] > 0 else 'negative'}">
                    {hsi_data['summary']['overall_sentiment']:.3f}
                </div>
                <div class="stat-label">Average Score</div>
            </div>
            <div class="stat-card">
                <h3>Housing Articles</h3>
                <div class="stat-value">{hsi_data['summary']['housing_related_count']}</div>
                <div class="stat-label">{hsi_data['summary']['housing_related_count']/hsi_data['summary']['total_articles']:.1%} of total</div>
            </div>
            <div class="stat-card">
                <h3>Positive Ratio</h3>
                <div class="stat-value">{hsi_data['summary']['positive_count']/hsi_data['summary']['total_articles']:.1%}</div>
                <div class="stat-label">Positive articles</div>
            </div>
        </div>
        
        <div class="section">
            <h2 class="section-title">üìà Key Insights</h2>
            <div class="insights">
                <div class="insight-item">Housing articles show {'higher' if hsi_data['summary']['housing_sentiment'] > hsi_data['summary']['non_housing_sentiment'] else 'lower'} sentiment than non-housing articles</div>
                <div class="insight-item">Sentiment ranges from {df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}</div>
                <div class="insight-item">Weekly HSI calculated with {len(hsi_data['weekly'])} data points</div>
                <div class="insight-item">Most common housing keyword: {list(hsi_data['keywords'].items())[0][0]} ({list(hsi_data['keywords'].items())[0][1]} occurrences)</div>
            </div>
        </div>
        
        <div class="section">
            <h2 class="section-title">üîù Top Keywords</h2>
            <table>
                <thead>
                    <tr>
                        <th>Keyword</th>
                        <th>Count</th>
                        <th>Percentage</th>
                    </tr>
                </thead>
                <tbody>
                    {"".join([f'''
                    <tr>
                        <td>{keyword}</td>
                        <td>{count}</td>
                        <td>{count/hsi_data['summary']['total_articles']:.1%}</td>
                    </tr>
                    ''' for keyword, count in list(hsi_data['keywords'].items())[:10]])}
                </tbody>
            </table>
        </div>
        
        <div class="section">
            <h2 class="section-title">üìä Data Overview</h2>
            <p>Data available via API endpoints:</p>
            <ul>
                <li><code>/api/hsi/weekly</code> - Weekly HSI data</li>
                <li><code>/api/hsi/monthly</code> - Monthly HSI data</li>
                <li><code>/api/hsi/summary</code> - Summary statistics</li>
                <li><code>/api/hsi/keywords</code> - Keyword analysis</li>
            </ul>
            <p>Run the FastAPI app to access the full API.</p>
        </div>
        
        <div class="section">
            <h2 class="section-title">üìã Sample Data</h2>
            <table>
                <thead>
                    <tr>
                        <th>Date</th>
                        <th>Sentiment</th>
                        <th>Housing</th>
                        <th>Text Preview</th>
                    </tr>
                </thead>
                <tbody>
                    {"".join([f'''
                    <tr>
                        <td>{row["date"].split("T")[0] if "T" in str(row["date"]) else row["date"]}</td>
                        <td class="{row["sentiment"]}">{row["sentiment"]}</td>
                        <td>{"Yes" if row["contains_housing"] else "No"}</td>
                        <td>{row["text"][:80]}...</td>
                    </tr>
                    ''' for row in hsi_data['daily'][:5]])}
                </tbody>
            </table>
        </div>
        
        <div class="section">
            <p><em>Generated by Housing Sentiment Analysis Pipeline</em></p>
            <p><strong>Next Steps:</strong></p>
            <ol>
                <li>Run the FastAPI app: <code>python app.py</code></li>
                <li>Access the API at: <code>http://localhost:8000</code></li>
                <li>View interactive charts in Jupyter notebook</li>
                <li>Use the processed data for further analysis</li>
            </ol>
        </div>
    </div>
</body>
</html>
"""

# Save HTML report
with open('hsi_dashboard.html', 'w', encoding='utf-8') as f:
    f.write(html_report)

print(f"\n‚úÖ HTML dashboard saved to 'hsi_dashboard.html'")
print(f"‚úÖ Open 'hsi_dashboard.html' in your browser to view the report")


‚úÖ HTML dashboard saved to 'hsi_dashboard.html'
‚úÖ Open 'hsi_dashboard.html' in your browser to view the report
