### Works

In [42]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from scipy.stats import bootstrap
import warnings
import statsmodels.api as sm

warnings.filterwarnings('ignore')

class AdvancedRentalPricingAnalyzer:
    def __init__(self):
        self.df = None
        self.semi_furnished_data = pd.DataFrame()
        self.market_segment_data = None
        self.models = {}
        self.prototype_features = {} # To store average features for prediction
        self.society_clusters = {}
        self.price_ranges = {}
        self.MIN_SAMPLES_SOCIETY = 3 # Min samples for a high-confidence society-level estimate
        self.MIN_SAMPLES_SEGMENT = 5 # Min samples for a market segment-level estimate
        self.MIN_SAMPLES_MODEL = 10 # Min samples to train a predictive model


    def load_and_preprocess_data(self, data_path):
        """Load and clean the rental data, handling non-standard headers."""
        try:
            temp_df = pd.read_csv(data_path)
            if len(temp_df) < 2:
                print("Warning: CSV file is empty or contains no data rows.")
                self.df = pd.DataFrame()
                return self.df
             
            new_header = temp_df.iloc[0]
            df = temp_df[1:].copy()
            df.columns = new_header
             
            if df.columns[0] is None or 'unnamed' in str(df.columns[0]).lower():
                df = df.drop(df.columns[0], axis=1)
             
            self.df = df
        except Exception as e:
            print(f"An error occurred while loading or processing the data file: {e}")
            self.df = pd.DataFrame()
            return self.df

        self.df.columns = self.df.columns.str.strip()
         
        # Centralized data cleaning pipeline
        self._clean_and_convert_numeric_columns()
        self._clean_categorical_variables()
        self._handle_missing_values()
        self._create_derived_features()
         
        self.semi_furnished_data = self.df[
            (self.df['Furnishing'].str.lower().str.contains('semi', na=False)) &
            (self.df['# of rooms'].isin(['BHK2', 'BHK3']))
        ].copy()
         
        print(f"Total semi-furnished 2 & 3 BHK properties: {len(self.semi_furnished_data)}")
        return self.semi_furnished_data

    def _clean_and_convert_numeric_columns(self):
        """Clean and convert all relevant columns to numeric types."""
        price_cols = ['Price', 'Maintenance', 'Total price']
        for col in price_cols:
            if col in self.df.columns:
                self.df[col] = self.df[col].astype(str).str.replace(',', '').str.replace('"', '').str.strip()
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
         
        other_numeric_cols = ['Super Area', 'Floor', 'Total floors', '# of bathrooms', '# of balcony']
        for col in other_numeric_cols:
            if col in self.df.columns:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

    def _clean_categorical_variables(self):
        """Standardize categorical variables"""
        self.df['Society Name'] = self.df['Society Name'].str.title().str.strip()
        self.df['# of rooms'] = self.df['# of rooms'].str.upper().str.strip()
        self.df['Furnishing'] = self.df['Furnishing'].str.strip()
        if 'Facing' in self.df.columns:
            self.df['Facing'] = self.df['Facing'].fillna("Don't Know")

    def _handle_missing_values(self):
        """Handle missing values with domain-specific logic"""
        if '# of balcony' in self.df.columns:
            self.df['# of balcony'] = self.df['# of balcony'].fillna(
                self.df.groupby('# of rooms')['# of balcony'].transform('median')
            )
        if '# of bathrooms' in self.df.columns:
            self.df['# of bathrooms'] = self.df['# of bathrooms'].fillna(
                self.df.groupby('# of rooms')['# of bathrooms'].transform('median')
            )

    def _create_derived_features(self):
        """
        Create engineered features for analysis.
        ENHANCED: Uses one-hot encoding for 'Facing' for more robust modeling.
        """
        super_area_safe = self.df['Super Area'].replace(0, np.nan)
        total_floors_safe = self.df['Total floors'].replace(0, np.nan)
        price_safe = self.df['Price'].replace(0, np.nan)
        
        self.df['Price_per_sqft'] = self.df['Total price'] / super_area_safe
        self.df['Floor_Premium_Index'] = self.df['Floor'] / total_floors_safe
        self.df['Maintenance_Ratio'] = self.df['Maintenance'] / price_safe

        # --- REVISED FEATURE ENGINEERING FOR 'Facing' ---
        if 'Facing' in self.df.columns:
            # Use One-Hot Encoding instead of a manual numeric scale
            facing_dummies = pd.get_dummies(self.df['Facing'], prefix='Facing', drop_first=True)
            # drop_first=True avoids perfect multicollinearity by dropping one category (e.g., "Don't Know")
            # The effect of the dropped category is captured by the model's intercept.
            
            self.df = pd.concat([self.df, facing_dummies], axis=1)

            # NOTE: We no longer create 'Facing_Premium'.
            # The model will learn the premium for each direction from the new columns.

    def detect_and_handle_outliers(self):
        """Detect outliers on a per-group basis using the IQR method."""
        if 'Total price' not in self.semi_furnished_data.columns:
            print("'Total price' column not found. Skipping outlier detection.")
            return

        def iqr_outlier_detector(group):
            if len(group) < 3:
                return pd.Series(False, index=group.index)
            Q1 = group['Total price'].quantile(0.25)
            Q3 = group['Total price'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            return (group['Total price'] < lower_bound) | (group['Total price'] > upper_bound)
         
        outlier_flags = self.semi_furnished_data.groupby(['Society Name', '# of rooms'], group_keys=False).apply(iqr_outlier_detector)
        self.semi_furnished_data['Is_Outlier'] = outlier_flags
        outlier_count = self.semi_furnished_data['Is_Outlier'].sum()
        print(f"Identified {outlier_count} outliers within their respective groups ({outlier_count/len(self.semi_furnished_data)*100:.1f}%)")

    def perform_market_segmentation(self):
        """Perform K-means clustering and tag data with market segments."""
        data_for_clustering = self.semi_furnished_data[~self.semi_furnished_data['Is_Outlier'].fillna(False)]
         
        clustering_features = ['Super Area', 'Price_per_sqft']
        society_stats = data_for_clustering.groupby('Society Name').agg({
            'Super Area': 'mean', 'Price_per_sqft': 'mean', 'Total price': 'mean'
        }).dropna().reset_index()

        if len(society_stats) < 3:
            print("Not enough societies to perform clustering. Using a single 'Mid-Market' segment.")
            self.society_clusters = {name: 'Mid-Market' for name in society_stats['Society Name'].unique()}
        else:
            scaler = StandardScaler()
            features_scaled = scaler.fit_transform(society_stats[clustering_features])
            kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
            society_stats['Market_Segment'] = kmeans.fit_predict(features_scaled)
             
            cluster_prices = society_stats.groupby('Market_Segment')['Total price'].mean()
            cluster_labels = {
                cluster_prices.idxmin(): 'Budget',
                cluster_prices.idxmax(): 'Premium',
                cluster_prices.drop([cluster_prices.idxmin(), cluster_prices.idxmax()]).index[0]: 'Mid-Market'
            }
            society_stats['Market_Segment_Label'] = society_stats['Market_Segment'].map(cluster_labels)
            self.society_clusters = dict(zip(society_stats['Society Name'], society_stats['Market_Segment_Label']))

        # HIERARCHICAL MODELING: Tag original data with market segments for fallback estimation
        self.market_segment_data = self.semi_furnished_data.copy()
        self.market_segment_data['Market_Segment'] = self.market_segment_data['Society Name'].map(self.society_clusters)
        self.market_segment_data['Market_Segment'].fillna('Mid-Market', inplace=True) # Fallback for new societies
        print("Market segmentation complete and data tagged.")
        return society_stats
        
    def _train_predictive_models(self):
        """
        Trains a Linear Regression model for each market segment and BHK type,
        AND prints a statistical summary to validate feature coefficients.
        """
        print("Training predictive models for price estimation...")
        data_to_use = self.market_segment_data[~self.market_segment_data['Is_Outlier'].fillna(False)].copy()
        
        features = ['Super Area', '# of bathrooms', '# of balcony', 'Floor_Premium_Index']
        for feature in features:
            if feature not in data_to_use.columns:
                print(f"Warning: Feature '{feature}' not found, skipping for modeling.")
                features.remove(feature)
                continue
            data_to_use[feature] = data_to_use.groupby(['Market_Segment', '# of rooms'])[feature].transform(
                lambda x: x.fillna(x.median())
            )
        
        data_to_use.dropna(subset=features + ['Total price'], inplace=True)

        for segment in data_to_use['Market_Segment'].unique():
            for bhk_type in data_to_use['# of rooms'].unique():
                model_key = (segment, bhk_type)
                
                subset = data_to_use[
                    (data_to_use['Market_Segment'] == segment) & (data_to_use['# of rooms'] == bhk_type)
                ]
                
                if len(subset) >= self.MIN_SAMPLES_MODEL:
                    X = subset[features]
                    y = subset['Total price']
                    
                    # --- 1. Scikit-learn model for prediction (existing code) ---
                    model = LinearRegression()
                    model.fit(X, y)
                    self.models[model_key] = model
                    self.prototype_features[model_key] = X.mean().values.reshape(1, -1)
                    print(f"\n✅ Scikit-learn model trained for: {model_key} (based on {len(subset)} samples)")

                    # --- 2. Statsmodels for statistical analysis (NEW SNIPPET) ---
                    # statsmodels requires manually adding a constant for the intercept
                    X_sm = sm.add_constant(X)
                    
                    # Fit the Ordinary Least Squares (OLS) model
                    sm_model = sm.OLS(y, X_sm).fit()
                    
                    # Print the detailed summary
                    print(f"--- Statistical Summary for {model_key} ---")
                    print(sm_model.summary())
                    print("--------------------------------------------------")

    def _get_price_range_from_subset(self, prices):
        """Helper function to calculate a price range from a set of prices."""
        if len(prices) < 2:
            return None, None
             
        range_start = np.percentile(prices, 25)
        range_end = np.percentile(prices, 75)
         
        if len(prices) >= 5: # Use more stable bootstrap for larger samples
            try:
                bootstrap_result = bootstrap((prices,), np.mean, n_resamples=1000, confidence_level=0.80, random_state=42)
                ci_low, ci_high = bootstrap_result.confidence_interval
                range_start = min(range_start, ci_low)
                range_end = max(range_end, ci_high)
            except Exception:
                pass # Fallback to percentile if bootstrap fails
         
        return range_start, range_end

    def calculate_hierarchical_price_ranges(self, target_societies):
        """
        Calculates price ranges using a hierarchical model.
        1. Society-level statistics (if enough data)
        2. Predictive model (for unknown societies)
        3. Market-segment statistics (fallback)
        4. Global statistics (final fallback)
        """
        # Train models before starting calculations
        self._train_predictive_models()

        results = {}
        data_to_use = self.market_segment_data[~self.market_segment_data['Is_Outlier'].fillna(False)]
         
        for _, row in target_societies.iterrows():
            society, bhk_type = row['Society Name'], row['BHK']
             
            range_start, range_end, level, sample_size = None, None, 'N/A', 0
            is_known_society = society in self.society_clusters
     
            # --- Tier 1: Society Level ---
            if is_known_society:
                subset_society = data_to_use[
                    (data_to_use['Society Name'] == society) & (data_to_use['# of rooms'] == bhk_type)
                ]
                if len(subset_society) >= self.MIN_SAMPLES_SOCIETY:
                    prices = subset_society['Total price'].values
                    range_start, range_end = self._get_price_range_from_subset(prices)
                    level, sample_size = 'Society', len(prices)
     
            # --- Tier 2: Predictive Model (for Unknown Societies) ---
            if range_start is None and not is_known_society:
                market_segment = 'Mid-Market' # Assume unknown societies are Mid-Market
                model_key = (market_segment, bhk_type)
                
                if model_key in self.models:
                    model = self.models[model_key]
                    prototype = self.prototype_features[model_key]
                    predicted_price = model.predict(prototype)[0]
                    
                    # Create a plausible range around the prediction, e.g., +/- 15%
                    range_start = predicted_price * 0.85
                    range_end = predicted_price * 1.15
                    level, sample_size = 'Predictive Model (Imputed)', 0 # 0 samples as it's a prediction
     
            # --- Tier 3: Market Segment Level (Fallback) ---
            if range_start is None:
                market_segment = self.society_clusters.get(society, 'Mid-Market')
                level = 'Market Segment'
     
                subset_segment = data_to_use[
                    (data_to_use['Market_Segment'] == market_segment) & (data_to_use['# of rooms'] == bhk_type)
                ]
                if len(subset_segment) >= self.MIN_SAMPLES_SEGMENT:
                    prices = subset_segment['Total price'].values
                    range_start, range_end = self._get_price_range_from_subset(prices)
                    sample_size = len(prices)
     
            # --- Tier 4: Global Level (Final fallback) ---
            if range_start is None:
                subset_global = data_to_use[data_to_use['# of rooms'] == bhk_type]
                if not subset_global.empty:
                    prices = subset_global['Total price'].values
                    range_start, range_end = self._get_price_range_from_subset(prices)
                    level, sample_size = 'Global', len(prices)
             
            if range_start is None or range_end is None:
                continue
     
            # Post-processing and storing results
            market_segment = self.society_clusters.get(society, 'Mid-Market')
            volatility_multiplier = {'Budget': 0.95, 'Mid-Market': 1.0, 'Premium': 1.05}[market_segment]
            range_start *= volatility_multiplier
            range_end *= volatility_multiplier
     
            if range_end > 1.8 * range_start and range_start > 0:
                range_end = 1.7 * range_start
     
            results[(society, bhk_type)] = {
                'Range_Starting': int(np.floor(range_start / 100) * 100),
                'Range_Ending': int(np.ceil(range_end / 100) * 100),
                'Sample_Size': sample_size,
                'Market_Segment': market_segment,
                'Estimation_Level': level
            }
         
        self.price_ranges = results
        print("Hierarchical price range calculation complete for all target societies.")
        return results

    def create_final_output(self, target_societies):
            """Creates final output and imputes average size for unknown societies."""
            output_data = []
            for (society, bhk_type), ranges in self.price_ranges.items():
                output_data.append({
                    'Society Name': society,
                    'BHK': bhk_type,
                    'Range Starting': f"₹{ranges['Range_Starting']:,}",
                    'Range Ending': f"₹{ranges['Range_Ending']:,}",
                    'Sample Size': ranges['Sample_Size'],
                    'Market Segment': ranges['Market_Segment'],
                    'Estimation Level': ranges['Estimation_Level']
                })
             
            if not output_data:
                print("No price ranges were generated. Check the input data.")
                return pd.DataFrame()
     
            output_df = pd.DataFrame(output_data)
     
            # Calculate average sizes for KNOWN societies
            avg_sizes_known = self.semi_furnished_data.groupby(['Society Name', '# of rooms'])['Super Area'].mean().reset_index()
            avg_sizes_known.rename(columns={'# of rooms': 'BHK', 'Super Area': 'Size (sq.ft)'}, inplace=True)
             
            # Merge the target list with calculated ranges and known sizes
            final_df = pd.merge(target_societies, output_df, on=['Society Name', 'BHK'], how='left')
            final_df = pd.merge(final_df, avg_sizes_known, on=['Society Name', 'BHK'], how='left')
     
            # Impute size for societies that had no data (including those priced by the predictive model)
            if final_df['Size (sq.ft)'].isnull().any():
                avg_size_imputed = self.market_segment_data.groupby(['Market_Segment', '# of rooms'])['Super Area'].mean()
                mid_market_bhk2_size = avg_size_imputed.get(('Mid-Market', 'BHK2'), 1200) # Default if no mid-market data
                mid_market_bhk3_size = avg_size_imputed.get(('Mid-Market', 'BHK3'), 1600)
     
                # Fill missing sizes based on BHK type
                bhk2_missing_size = (final_df['Size (sq.ft)'].isnull()) & (final_df['BHK'] == 'BHK2')
                bhk3_missing_size = (final_df['Size (sq.ft)'].isnull()) & (final_df['BHK'] == 'BHK3')
                final_df.loc[bhk2_missing_size, 'Size (sq.ft)'] = mid_market_bhk2_size
                final_df.loc[bhk3_missing_size, 'Size (sq.ft)'] = mid_market_bhk3_size
     
            final_df['Size (sq.ft)'] = final_df['Size (sq.ft)'].round().astype(int)
             
            submission_cols = ['Society Name', 'BHK', 'Size (sq.ft)', 'Range Starting', 'Range Ending', 'Estimation Level', 'Market Segment']
            for col in submission_cols:
                if col not in final_df.columns:
                    final_df[col] = np.nan
                     
            return final_df[submission_cols].sort_values(['Society Name', 'BHK'])


# --- NEW SANITY CHECK FUNCTION DEFINITION ---
def sanity_check_output(final_output_df, reference_data_df):
    """
    Cross-check generated price ranges against observed prices.

    For every (society, BHK) row of the output, the generated range is
    compared with the minimum and maximum 'Total price' seen for that pair in
    the reference data, and the row is classified as:

    * ``NO_REFERENCE``  - the pair has no observed prices;
    * ``OVERLAP_OK``    - the generated and observed ranges intersect;
    * ``POSSIBLY_HIGH`` - the generated range starts above the observed max;
    * ``POSSIBLY_LOW``  - the generated range ends below the observed min;
    * ``CHECK_LOGIC``   - none of the above matched.

    Args:
        final_output_df (pd.DataFrame): Final output with '₹'-formatted
            'Range Starting' / 'Range Ending' columns.
        reference_data_df (pd.DataFrame): Preprocessed listing data
            (e.g. analyzer.market_segment_data).

    Returns:
        pd.DataFrame: The comparison report; an empty DataFrame when either
        input is empty; the unmodified ``final_output_df`` when the range
        columns cannot be converted to numbers.
    """
    if final_output_df.empty or reference_data_df.empty:
        print("Cannot perform sanity check: Input DataFrame is empty.")
        return pd.DataFrame()

    print("\n--- Sanity Check Report ---")
    print("Comparing generated ranges with actual prices from reference data...")

    # Observed min/max per (society, room type), renamed to match the output keys.
    observed = (
        reference_data_df
        .groupby(['Society Name', '# of rooms'])['Total price']
        .agg(['min', 'max'])
        .reset_index()
    )
    observed.rename(columns={
        '# of rooms': 'BHK',
        'min': 'Actual Min Price',
        'max': 'Actual Max Price'
    }, inplace=True)

    report = final_output_df.copy()

    # Strip the currency formatting so the bounds can be compared numerically.
    try:
        for label in ('Range Starting', 'Range Ending'):
            as_text = report[label].astype(str)
            digits = as_text.str.replace('₹', '').str.replace(',', '')
            report[f'{label} Num'] = digits.astype(float)
    except (ValueError, AttributeError) as e:
        print(f"Could not convert range columns to numeric. Error: {e}")
        return final_output_df  # Return original if conversion fails

    report = report.merge(observed, on=['Society Name', 'BHK'], how='left')

    generated_low = report['Range Starting Num']
    generated_high = report['Range Ending Num']
    observed_min = report['Actual Min Price']
    observed_max = report['Actual Max Price']

    # Ordered rules: the first matching condition decides the status.
    status_rules = [
        (observed_min.isnull(), 'NO_REFERENCE'),
        ((generated_low <= observed_max) & (observed_min <= generated_high), 'OVERLAP_OK'),
        (generated_low > observed_max, 'POSSIBLY_HIGH'),
        (generated_high < observed_min, 'POSSIBLY_LOW'),
    ]
    report['Sanity Check'] = np.select(
        [condition for condition, _ in status_rules],
        [status for _, status in status_rules],
        default='CHECK_LOGIC',
    )

    final_report = report[[
        'Society Name', 'BHK', 'Estimation Level',
        'Range Starting', 'Range Ending',
        'Actual Min Price', 'Actual Max Price', 'Sanity Check'
    ]].copy()

    def _as_rupees(value):
        # Missing observations are shown as 'N/A' rather than NaN.
        return f"₹{int(value):,}" if pd.notna(value) else 'N/A'

    for price_col in ('Actual Min Price', 'Actual Max Price'):
        final_report[price_col] = final_report[price_col].map(_as_rupees)

    return final_report


# --- UPDATED MAIN USAGE EXAMPLE ---
def main():
    """Run the full pricing pipeline for the fixed list of target properties.

    Steps: load and clean 'rental_data.csv', flag outliers, segment the
    market, compute hierarchical price ranges, write the recommendations to
    'rental_price_recommendations_hierarchical.csv', print a per-facing price
    check, and write the sanity-check report to 'sanity_check_report.csv'.
    Stops early, with a message, when no usable data or output is produced.
    """
    target_properties_list = [
        ('Brigade Cosmopolis', 'BHK2'), ('Brigade Cosmopolis', 'BHK3'),
        ('Brigade Woods', 'BHK2'), ('Brigade Woods', 'BHK3'),
        ('Casa Gopalan', 'BHK2'), ('Casa Gopalan', 'BHK3'),
        ('Citilights Rustique', 'BHK2'), ('Citilights Rustique', 'BHK3'),
        ('DSR Green Vista', 'BHK2'), ('DSR Green Vista', 'BHK3'),
        ('Godrej United', 'BHK2'), ('Godrej United', 'BHK3'),
        ('Gopalan Atlantis', 'BHK2'), ('Gopalan Atlantis', 'BHK3'),
        ('Mahaveer Tranquil', 'BHK2'), ('Mahaveer Tranquil', 'BHK3'),
        ('Prestige Boulevard', 'BHK2'), ('Prestige Boulevard', 'BHK3'),
        ('Prestige Palms', 'BHK2'), ('Prestige Palms', 'BHK3'),
        ('Prestige Waterford', 'BHK2'), ('Prestige Waterford', 'BHK3'),
        ('Sobha Dream Acres', 'BHK2'), ('Sobha Dream Acres', 'BHK3'),
        ('Sobha Habitech', 'BHK2'), ('Sobha Habitech', 'BHK3'),
        ('Sobha Rose', 'BHK2'), ('Sobha Rose', 'BHK3'),
        ('Sobha Windsor', 'BHK3'),
        ('Sumadhura Silver Ripples', 'BHK2'), ('Sumadhura Silver Ripples', 'BHK3'),
        ('Sumadhura Soham', 'BHK2'), ('Sumadhura Soham', 'BHK3'),
    ]
    target_societies_df = pd.DataFrame(target_properties_list, columns=['Society Name', 'BHK'])
    # Same normalization the analyzer applies to society names, so joins match.
    target_societies_df['Society Name'] = target_societies_df['Society Name'].str.title().str.strip()

    data_file_path = 'rental_data.csv'
    analyzer = AdvancedRentalPricingAnalyzer()

    try:
        analyzer.load_and_preprocess_data(data_file_path)
    except FileNotFoundError:
        # Defensive only: the loader reports read errors itself and returns an
        # empty frame, which the emptiness check below then catches.
        print(f"ERROR: The data file was not found at '{data_file_path}'")
        return

    if analyzer.semi_furnished_data.empty:
        print("No valid semi-furnished 2 or 3 BHK data found to analyze.")
        return

    analyzer.detect_and_handle_outliers()
    analyzer.perform_market_segmentation()
    analyzer.calculate_hierarchical_price_ranges(target_societies_df)
    final_output = analyzer.create_final_output(target_societies_df)

    if final_output is None or final_output.empty:
        print("Processing finished, but no output was generated.")
        return

    output_filename = 'rental_price_recommendations_hierarchical.csv'
    final_output.to_csv(output_filename, index=False)

    print("\n------------------------------------------------------------")
    print(f"Analysis complete! Results saved to '{output_filename}'")
    print("------------------------------------------------------------\n")
    print("Sample of the final output:\n")
    print(final_output.head(10))

    # --- Temporary diagnostic: price per sq.ft by BHK type and facing ---
    print("\n--- Controlled Sanity Check for Facing Premium ---")

    segment_data = analyzer.market_segment_data
    required_cols = {'# of rooms', 'Facing', 'Price_per_sqft'}
    if required_cols.issubset(segment_data.columns):
        # Control for both BHK type and normalize for size
        controlled_check = segment_data.groupby(['# of rooms', 'Facing'])['Price_per_sqft'].agg(['mean', 'count'])
        print(controlled_check)
    else:
        # 'Facing' is optional in the data; its absence must not abort the
        # run before the sanity report is produced.
        print("Skipping Facing premium check: required columns are missing.")

    # --- End of temporary code ---

    # Perform and display the sanity check
    sanity_report = sanity_check_output(final_output, analyzer.market_segment_data)
    if not sanity_report.empty:
        print("\n")
        print(sanity_report)
        print("---------------------------")
        # You can also save this report to a file
        sanity_report.to_csv('sanity_check_report.csv', index=False)
        print("Sanity check report saved to 'sanity_check_report.csv'")


if __name__ == "__main__":
    main()

Total semi-furnished 2 & 3 BHK properties: 262
Identified 23 outliers within their respective groups (8.8%)
Market segmentation complete and data tagged.
Training predictive models for price estimation...

✅ Scikit-learn model trained for: ('Premium', 'BHK3') (based on 40 samples)
--- Statistical Summary for ('Premium', 'BHK3') ---
                            OLS Regression Results                            
Dep. Variable:            Total price   R-squared:                       0.634
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     15.17
Date:                Tue, 19 Aug 2025   Prob (F-statistic):           2.76e-07
Time:                        20:16:06   Log-Likelihood:                -415.77
No. Observations:                  40   AIC:                             841.5
Df Residuals:                      35   BIC:                             850.0
Df Model:                         

In [43]:
# Reload the sanity-check report written by main() so the notebook displays it.
pd.read_csv('sanity_check_report.csv')

Unnamed: 0,Society Name,BHK,Estimation Level,Range Starting,Range Ending,Actual Min Price,Actual Max Price,Sanity Check
0,Brigade Cosmopolis,BHK2,Society,"₹35,400","₹56,100","₹34,000","₹71,000",OVERLAP_OK
1,Brigade Cosmopolis,BHK3,Society,"₹73,200","₹74,900","₹68,000","₹90,000",OVERLAP_OK
2,Brigade Woods,BHK2,Society,"₹61,500","₹68,300","₹55,500","₹70,800",OVERLAP_OK
3,Brigade Woods,BHK3,Society,"₹72,500","₹82,200","₹69,000","₹82,500",OVERLAP_OK
4,Casa Gopalan,BHK2,Society,"₹45,600","₹48,500","₹45,000","₹54,000",OVERLAP_OK
5,Casa Gopalan,BHK3,Market Segment,"₹54,100","₹69,900","₹50,000","₹50,000",POSSIBLY_HIGH
6,Citilights Rustique,BHK2,Society,"₹45,000","₹48,000","₹23,000","₹49,000",OVERLAP_OK
7,Citilights Rustique,BHK3,Society,"₹60,000","₹62,900","₹50,000","₹65,000",OVERLAP_OK
8,Dsr Green Vista,BHK2,Society,"₹41,100","₹43,700","₹20,000","₹48,500",OVERLAP_OK
9,Dsr Green Vista,BHK3,Society,"₹51,600","₹51,800","₹51,200","₹54,500",OVERLAP_OK


### Visualisation

In [58]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Professional Visualization Suite for Bengaluru House Pricing Analysis.

This script generates four high-impact visuals designed for a business audience,
transforming raw rental data into strategic market insights.
- Visual 1: Raincloud plot of price distribution by property type.
- Visual 2: Focused correlation heatmap of key price drivers.
- Visual 3: Strategic market segmentation map using K-Means clustering.
- Visual 4: Data quality and model confidence dashboard.

All functions are self-contained and process data from the source CSVs.
"""

# ==============================================================================
# 1. IMPORTS & PROFESSIONAL THEME CONFIGURATION
# ==============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import sys

# --- Define a cohesive, professional color palette ---
PALETTE = {
    'primary_blue': '#003f5c',
    'secondary_teal': '#4d908e',
    'accent_green': '#58b28a',
    'light_grey': '#f0f0f0',
    'medium_grey': '#a9a9a9',
    'dark_grey': '#333333'
}

# --- Refined Professional Plot Style ---
# Apply the seaborn theme FIRST: set_theme() rewrites many matplotlib rcParams
# (fonts, label sizes, spines, grid), so calling it after the custom update
# would discard most of the settings below.
sns.set_theme(style="whitegrid")
try:
    plt.rcParams.update({
        'figure.figsize': (16, 9),
        # Use the generic family so matplotlib walks the 'font.sans-serif'
        # list below; with a concrete family name that list is never
        # consulted when the named font is missing.
        'font.family': 'sans-serif',
        'font.sans-serif': ['Avenir', 'Gill Sans MT', 'Calibri', 'sans-serif'],
        'axes.titleweight': 'bold',
        'axes.labelweight': 'bold',
        'axes.titlepad': 25,
        'axes.labelpad': 15,
        'axes.titlesize': 24,
        'axes.labelsize': 18,
        'xtick.labelsize': 14,
        'ytick.labelsize': 14,
        'legend.fontsize': 14,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.edgecolor': PALETTE['medium_grey'],
        'grid.color': PALETTE['light_grey'],
        'grid.linestyle': '-',
        'grid.linewidth': 1,
        'text.color': PALETTE['dark_grey'],
        'axes.titlecolor': PALETTE['primary_blue'],
        'axes.labelcolor': PALETTE['dark_grey']
    })
except (KeyError, ValueError) as style_error:
    # rcParams.update rejects unknown keys or invalid values (for example on
    # an older matplotlib); it does not check whether a font is installed.
    print(f"Custom plot style could not be applied ({style_error}); using the seaborn whitegrid defaults.")
    sns.set_theme(style="whitegrid")


# ==============================================================================
# 2. ROBUST DATA LOADING & PREPROCESSING
# ==============================================================================

def load_and_clean_rental_data(data_path):
    """
    Read the rental CSV and coerce its numeric columns.

    The header is taken from the second row of the file and the first column
    becomes the index. Column names are stripped of surrounding whitespace.
    Price columns have '₹' and ',' removed before conversion; any value that
    still cannot be parsed as a number becomes NaN.

    Returns the cleaned DataFrame, or None (after printing a message) when the
    file cannot be read.
    """
    try:
        # header=1 -> column names live on the second row of the file.
        # index_col=0 -> the leading column is used as the index.
        frame = pd.read_csv(data_path, header=1, index_col=0)
    except FileNotFoundError:
        print(f"FATAL ERROR: The file '{data_path}' was not found. Please ensure it is in the same directory.")
        return None
    except Exception as e:
        print(f"Error reading {data_path}: {e}")
        return None

    frame.columns = frame.columns.str.strip()

    currency_columns = ('Price', 'Maintenance', 'Total price')
    plain_numeric_columns = ('Super Area', 'Floor', 'Total floors', '# of bathrooms', '# of balcony')

    # Currency columns: drop the rupee sign and thousands separators first.
    for name in [c for c in currency_columns if c in frame.columns]:
        as_text = frame[name].astype(str)
        without_symbols = as_text.str.replace(r'[₹,]', '', regex=True)
        frame[name] = pd.to_numeric(without_symbols, errors='coerce')

    # Remaining numeric columns are converted as-is.
    for name in [c for c in plain_numeric_columns if c in frame.columns]:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

    return frame


# ==============================================================================
# 3. ENHANCED VISUALIZATION FUNCTIONS
# ==============================================================================

# --- Visual 1: Price Distribution ---
def plot_price_distribution_raincloud(data_path, save_path="1_price_distribution_raincloud.png"):
    """
    Draw a raincloud-style figure (strip points, median line, violin) of rent.

    Only semi-furnished BHK2/BHK3 listings priced strictly below the 99th
    percentile of 'Total price' are plotted. The figure is written to
    `save_path`. Nothing is returned; the function exits early, after printing
    a message, when the data cannot be loaded or prepared.
    """
    print("Generating Visual 1: Price Distribution Raincloud Plot...")
    listings = load_and_clean_rental_data(data_path)
    if listings is None:
        return

    room_order = ['BHK2', 'BHK3']
    try:
        is_semi = listings['Furnishing'].str.lower().str.contains('semi', na=False)
        listings = listings[is_semi].copy()
        listings['# of rooms'] = listings['# of rooms'].str.upper().str.strip()
        listings = listings.dropna(subset=['Total price', '# of rooms'])

        # Keep rows strictly below the 99th-percentile price.
        ceiling = listings['Total price'].quantile(0.99)
        below_ceiling = listings['Total price'] < ceiling
        wanted_rooms = listings['# of rooms'].isin(room_order)
        viz_data = listings[below_ceiling & wanted_rooms]
    except KeyError as e:
        print(f"  -> Error processing data for raincloud plot. Missing column: {e}")
        print(f"  -> Available columns: {list(listings.columns)}")
        return
    except Exception as e:
        print(f"  -> An unexpected error occurred: {e}")
        return

    fig, ax = plt.subplots(figsize=(16, 10))
    segment_colors = [PALETTE['secondary_teal'], PALETTE['accent_green']]
    shared = dict(x='# of rooms', y='Total price', data=viz_data, order=room_order, ax=ax)

    # Three layers on the same axes; zorder controls how they stack.
    sns.stripplot(jitter=0.3, alpha=0.15, palette=segment_colors, size=4, zorder=1, **shared)
    sns.boxplot(
        showfliers=False, showbox=False, showcaps=False, whis=False,
        medianprops={'color': PALETTE['dark_grey'], 'linewidth': 3},
        zorder=3, **shared
    )
    sns.violinplot(
        inner=None, cut=0, bw_adjust=.5, palette=segment_colors, saturation=0.8,
        linewidth=2, edgecolor=PALETTE['dark_grey'], zorder=2, **shared
    )

    ax.set_title("BHK3 Rentals Command a Significant, but Overlapping, Price Premium", loc='left')
    fig.suptitle(
        "INSIGHT: BENGALURU RENTAL MARKET PRICING STRUCTURE",
        fontsize=16, color=PALETTE['medium_grey'], ha='left', x=0.125, y=0.96
    )
    ax.set_xlabel("Property Type")
    ax.set_ylabel("Total Monthly Rent")

    def _rupees_in_thousands(value, _tick_position):
        # Tick label such as "₹25k".
        return f'₹{int(value/1000):,}k'

    ax.yaxis.set_major_formatter(mticker.FuncFormatter(_rupees_in_thousands))

    # Annotate each category with its median, skipping categories without one.
    median_by_room = viz_data.groupby('# of rooms')['Total price'].median()
    for position, room in enumerate(room_order):
        room_median = median_by_room.get(room)
        if not room_median:
            continue
        ax.text(
            position, room_median, f'Median: ₹{int(room_median):,}',
            ha='center', va='bottom', fontsize=14,
            color=PALETTE['dark_grey'], fontweight='bold',
            bbox=dict(facecolor='white', alpha=0.5, edgecolor='none', boxstyle='round,pad=0.2')
        )

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  -> Saved to {save_path}")


# --- Visual 2: Correlation Heatmap ---
def plot_correlation_heatmap_focused(data_path, save_path="2_correlation_heatmap_focused.png"):
    """
    Generates a professional, focused correlation heatmap with a diverging colormap.

    Semi-furnished listings are kept, a few ratio features are derived, and the
    lower triangle of their correlation matrix is drawn. Positive correlations
    are blue and negative ones red, on a scale fixed to [-1, 1], which is what
    the on-figure caption describes.

    Args:
        data_path: Path of the rental CSV passed to load_and_clean_rental_data.
        save_path: Output image path.

    Returns:
        None. Prints a message and returns early when the data cannot be
        loaded or prepared, or when fewer than two numeric features exist.
    """
    print("Generating Visual 2: Focused Feature Correlation Heatmap...")
    data_df = load_and_clean_rental_data(data_path)
    if data_df is None:
        return

    try:
        data_df = data_df[data_df['Furnishing'].str.lower().str.contains('semi', na=False)].copy()
        # Zero denominators become NaN so the ratios never turn into +/-inf.
        data_df['Price_per_sqft'] = data_df['Total price'] / data_df['Super Area'].replace(0, np.nan)
        data_df['Floor_Premium_Index'] = data_df['Floor'] / data_df['Total floors'].replace(0, np.nan)
        data_df['Maintenance_Ratio'] = data_df['Maintenance'] / data_df['Price'].replace(0, np.nan)
        facing_premium = {'East': 1.1, 'North-East': 1.1, 'North': 1.05, 'South-East': 1.0,'South': 0.95, 'West': 0.9, 'North-West': 0.95, 'South-West': 0.9, "Don't Know": 1.0}
        if 'Facing' in data_df.columns:
            data_df['Facing_Premium'] = data_df['Facing'].map(facing_premium).fillna(1.0)
    except KeyError as e:
        print(f"  -> Error processing data for heatmap. Missing column: {e}")
        print(f"  -> Available columns: {list(data_df.columns)}")
        return
    except Exception as e:
        # e.g. a non-text 'Furnishing' column makes the .str accessor fail.
        print(f"  -> An unexpected error occurred: {e}")
        return

    features = ['Total price', 'Super Area', '# of bathrooms', '# of balcony', 'Floor_Premium_Index', 'Facing_Premium', 'Maintenance_Ratio']
    valid_features = [f for f in features if f in data_df.columns and pd.api.types.is_numeric_dtype(data_df[f])]
    if len(valid_features) < 2:
        # The diagonal is masked, so a single feature would leave nothing to draw.
        print("  -> Skipping heatmap: fewer than two numeric features are available.")
        return

    corr_matrix = data_df[valid_features].corr()
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    # diverging_palette(h_neg, h_pos): hue 20 (red) for the negative end and
    # hue 220 (blue) for the positive end, so the colours agree with the caption.
    cmap = sns.diverging_palette(20, 220, as_cmap=True)

    # The figure is created only after validation so an early return cannot
    # leave an unused figure open.
    fig, ax = plt.subplots(figsize=(14, 12))

    # vmin/vmax pin the scale to the full correlation range, which places zero
    # at the neutral midpoint of the diverging colormap.
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap=cmap, vmin=-1, vmax=1, linewidths=2, linecolor='white', cbar_kws={"shrink": .8}, annot_kws={"size": 14}, ax=ax)

    ax.set_title("Property Size and Bathrooms are Key Price Drivers", loc='left')
    fig.suptitle("INSIGHT: DECONSTRUCTING RENTAL VALUATION", fontsize=16, color=PALETTE['medium_grey'], ha='left', x=0.125, y=0.96)
    plt.xticks(rotation=30, ha='right')
    plt.yticks(rotation=0)

    ax.text(0.95, 0.05, 'Strong positive correlation (dark blue) indicates a direct relationship with Total Price.\nWeak or negative correlations (dark red) show less impact.', transform=fig.transFigure, ha="right", fontsize=14, style='italic', color=PALETTE['dark_grey'])

    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  -> Saved to {save_path}")


# --- Visual 3: Market Segmentation ---
def confidence_ellipse(x, y, ax, n_std=2.0, facecolor='none', **kwargs):
    """
    Add a covariance ellipse for the point cloud (x, y) to `ax`.

    The ellipse is centred on the means of x and y and scaled to `n_std`
    standard deviations along each axis. Extra keyword arguments are passed
    on to the Ellipse patch. Returns the patch added to the axes.
    """
    covariance = np.cov(x, y)
    var_x, var_y = covariance[0, 0], covariance[1, 1]
    pearson = covariance[0, 1] / np.sqrt(var_x * var_y)

    # Unit ellipse at the origin whose axis lengths depend only on the
    # correlation; the transform below rotates, scales and moves it into place.
    ellipse = Ellipse(
        (0, 0),
        width=np.sqrt(1 + pearson) * 2,
        height=np.sqrt(1 - pearson) * 2,
        facecolor=facecolor,
        **kwargs
    )

    placement = (
        transforms.Affine2D()
        .rotate_deg(45)
        .scale(np.sqrt(var_x) * n_std, np.sqrt(var_y) * n_std)
        .translate(np.mean(x), np.mean(y))
    )
    ellipse.set_transform(placement + ax.transData)
    return ax.add_patch(ellipse)

def plot_market_segmentation_map(data_path, save_path="3_market_segmentation_map.png"):
    """
    Generates a strategic Market Segmentation Map by creating society stats on-the-fly.

    Listings are averaged per 'Society Name'. Societies are clustered into
    three segments with K-Means on the standardized (Super Area,
    Price_per_sqft) pair, and the segments are named Budget / Mid-Market /
    Premium by their mean price per sq. ft. Each segment is drawn with a
    2-standard-deviation covariance ellipse.

    Args:
        data_path: Path of the rental CSV passed to load_and_clean_rental_data.
        save_path: Output image path.

    Returns:
        None. Prints a message and returns early when the data cannot be
        loaded, a required column is missing, or fewer than 11 societies have
        complete data.
    """
    print("Generating Visual 3: Strategic Market Segmentation Map...")
    data_df = load_and_clean_rental_data(data_path)
    if data_df is None:
        return

    try:
        society_stats_df = data_df.groupby('Society Name').agg(
            Super_Area_Avg=('Super Area', 'mean'),
            Total_Price_Avg=('Total price', 'mean')
        ).dropna()

        # A zero average area would give an infinite price per sq. ft., which
        # StandardScaler rejects. Mapping it to NaN lets dropna() below remove
        # that society instead of crashing.
        society_stats_df['Price_per_sqft'] = (
            society_stats_df['Total_Price_Avg'] / society_stats_df['Super_Area_Avg'].replace(0, np.nan)
        )
        society_stats_df = society_stats_df.rename(columns={'Super_Area_Avg': 'Super Area'})
        features = society_stats_df[['Super Area', 'Price_per_sqft']].dropna()

        if len(features) < 11:
            print("  -> Skipping segmentation plot: Not enough distinct societies with full data.")
            return

    except KeyError as e:
        print(f"  -> Error creating society stats. Missing column: {e}")
        print(f"  -> Available columns: {list(data_df.columns)}")
        return

    features_scaled = StandardScaler().fit_transform(features)

    fig, ax = plt.subplots(figsize=(18, 10))

    kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto')
    features['Market_Segment'] = kmeans.fit_predict(features_scaled)

    # Name the clusters by mean price: cheapest, dearest, and the one in between.
    prices = features.groupby('Market_Segment')['Price_per_sqft'].mean()
    labels = {prices.idxmin(): 'Budget', prices.idxmax(): 'Premium', prices.drop([prices.idxmin(), prices.idxmax()]).index[0]: 'Mid-Market'}
    features['Market_Segment_Label'] = features['Market_Segment'].map(labels)

    segment_palette = {'Budget': PALETTE['accent_green'], 'Mid-Market': PALETTE['secondary_teal'], 'Premium': PALETTE['primary_blue']}

    sns.scatterplot(x='Super Area', y='Price_per_sqft', hue='Market_Segment_Label', data=features, palette=segment_palette, s=100, ax=ax, alpha=0.7, style='Market_Segment_Label', markers={'Budget': 'o', 'Mid-Market': 's', 'Premium': '^'}, legend=False)

    for name, group in features.groupby('Market_Segment_Label'):
        color = segment_palette[name]
        # A covariance (and therefore an ellipse) needs at least two societies.
        if len(group) > 1:
            confidence_ellipse(group['Super Area'], group['Price_per_sqft'], ax, n_std=2.0, edgecolor=color, facecolor=color, alpha=0.1, linewidth=3, linestyle='--')
        cx, cy = group['Super Area'].mean(), group['Price_per_sqft'].mean()
        ax.text(cx, cy, name, fontsize=20, fontweight='bold', color='white', ha='center', va='center', bbox=dict(boxstyle='round,pad=0.5', fc=color, ec='none'))

    ax.set_title("Bengaluru's Rental Market is Segmented into Three Tiers", loc='left')
    fig.suptitle("INSIGHT: DATA-DRIVEN COMPETITIVE LANDSCAPE", fontsize=16, color=PALETTE['medium_grey'], ha='left', x=0.125, y=0.96)
    ax.set_xlabel("Average Super Area (sq. ft.) per Society")
    ax.set_ylabel("Average Price per sq. ft. (₹) per Society")
    # The ellipses are drawn at n_std=2.0. In two dimensions that covers about
    # 86% of a normal cloud, not 95%, so the caption states what is drawn.
    ax.text(0.98, 0.02, 'Segments derived via K-Means Clustering on society-level averages.\nEllipses show the 2-standard-deviation spread of each segment.', transform=ax.transAxes, ha='right', fontsize=12, style='italic', color=PALETTE['medium_grey'])

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  -> Saved to {save_path}")


# --- Visual 4: Sanity Check Dashboard ---
def plot_sanity_check_dashboard(data_path, save_path="4_sanity_check_dashboard.png"):
    """
    Generates a clear, dashboard-style bar chart for sanity check results.

    Reads the sanity-check report CSV (header on the first row), counts the
    values of its 'Sanity Check' column and draws one horizontal bar per
    recognised status, annotated with its count and share of all rows.

    Args:
        data_path: Path of the sanity-check report CSV.
        save_path: Output image path.

    Returns:
        None. Prints a message and returns early when the file is missing or
        empty, the 'Sanity Check' column is absent, or no recognised status
        is present.
    """
    print("Generating Visual 4: Data Quality & Confidence Report...")
    try:
        sanity_report_df = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f"  -> Skipping sanity check plot: File '{data_path}' not found.")
        return
    except pd.errors.EmptyDataError:
        # read_csv raises for a file with no content instead of returning an
        # empty frame, so the .empty check below would never be reached.
        print("  -> Skipping: Sanity report is empty.")
        return

    if sanity_report_df.empty:
        print("  -> Skipping: Sanity report is empty.")
        return

    if 'Sanity Check' not in sanity_report_df.columns:
        print("  -> Skipping sanity check plot: column 'Sanity Check' not found.")
        print(f"  -> Available columns: {list(sanity_report_df.columns)}")
        return

    counts = sanity_report_df['Sanity Check'].value_counts()

    status_palette = {'OVERLAP_OK': '#2ca02c', 'NO_REFERENCE': '#ff7f0e', 'POSSIBLY_LOW': '#d62728', 'POSSIBLY_HIGH': '#d62728', 'CHECK_LOGIC': '#8c564b'}
    status_order = ['OVERLAP_OK', 'NO_REFERENCE', 'POSSIBLY_LOW', 'POSSIBLY_HIGH', 'CHECK_LOGIC']
    filtered_order = [s for s in status_order if s in counts.index]
    if not filtered_order:
        print("  -> Skipping: No recognised 'Sanity Check' statuses found in the report.")
        return

    # Created only after validation so an early return leaves no open figure.
    fig, ax = plt.subplots(figsize=(14, 8))

    sns.barplot(y=counts.index, x=counts.values, palette=[status_palette.get(x, PALETTE['medium_grey']) for x in filtered_order], order=filtered_order, ax=ax)

    ok_count = counts.get('OVERLAP_OK', 0)
    total_count = len(sanity_report_df)
    ok_percentage = (ok_count / total_count * 100) if total_count > 0 else 0

    ax.set_title(f"~{ok_percentage:.0f}% of Price Estimations Pass Validation", loc='left')
    fig.suptitle("INSIGHT: MODEL CONFIDENCE & DATA QUALITY", fontsize=16, color=PALETTE['medium_grey'], ha='left', x=0.125, y=0.96)

    ax.set_xlabel("Number of Properties")
    ax.set_ylabel("Validation Status")
    ax.grid(axis='y')
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_color(PALETTE['medium_grey'])

    # Label every bar with its count and its share of all report rows.
    for p in ax.patches:
        width = p.get_width()
        percentage = f'{width/total_count:.1%}'
        ax.text(width + total_count*0.01, p.get_y() + p.get_height()/2., f'{int(width)} ({percentage})', ha='left', va='center', fontsize=14, fontweight='bold', color=PALETTE['dark_grey'])

    # Extra room on the right so the bar labels are not clipped.
    ax.set_xlim(right=ax.get_xlim()[1] * 1.15)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  -> Saved to {save_path}")


# ==============================================================================
# 4. MAIN EXECUTION SCRIPT
# ==============================================================================
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Beginning Professional Visualization Generation")
    print(banner + "\n")

    # Source CSV files, looked up relative to the working directory.
    original_data_path = 'rental_data.csv'
    sanity_report_path = 'sanity_check_report.csv'

    # Each visual paired with the CSV it reads, in the order they are generated.
    render_jobs = (
        (plot_price_distribution_raincloud, original_data_path),
        (plot_correlation_heatmap_focused, original_data_path),
        (plot_market_segmentation_map, original_data_path),
        (plot_sanity_check_dashboard, sanity_report_path),
    )
    for render, csv_path in render_jobs:
        render(csv_path)

    print("\n" + banner)
    print("All professional visuals have been generated and saved.")
    print(banner)

Beginning Professional Visualization Generation

Generating Visual 1: Price Distribution Raincloud Plot...
  -> Saved to 1_price_distribution_raincloud.png
Generating Visual 2: Focused Feature Correlation Heatmap...
  -> Saved to 2_correlation_heatmap_focused.png
Generating Visual 3: Strategic Market Segmentation Map...
  -> Saved to 3_market_segmentation_map.png
Generating Visual 4: Data Quality & Confidence Report...
  -> Saved to 4_sanity_check_dashboard.png

All professional visuals have been generated and saved.


In [61]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Advanced Visualization Suite for a Pricing Quant.

This script generates five advanced visuals that build a strategic narrative
for a business audience, moving from market overview to actionable opportunities.
"""

# ==============================================================================
# 0. INSTALL NECESSARY LIBRARIES
# ==============================================================================
# Before running, please ensure you have installed the required libraries.
# You can do this by running the following command in your terminal or command prompt:
# pip install pandas scikit-learn seaborn matplotlib plotly geopandas folium

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import folium

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

print("All libraries imported successfully.")

# ==============================================================================
# 1. THEME & DATA LOADING CONFIGURATION (FROM PREVIOUS SCRIPT)
# ==============================================================================

# --- Define a cohesive, professional color palette ---
PALETTE = {
    'primary_blue': '#003f5c',
    'secondary_teal': '#4d908e',
    'accent_green': '#58b28a',
    'light_grey': '#f0f0f0',
    'medium_grey': '#a9a9a9',
    'dark_grey': '#333333'
}

# --- Refined Professional Plot Style ---
# rcParams overrides that are layered on top of seaborn's "whitegrid" theme.
_THEME_RC = {
    'figure.figsize': (16, 9),
    # Use the generic family so matplotlib walks the preference list below and
    # settles on the first installed font instead of warning on every text
    # draw when Avenir is not available.
    'font.family': 'sans-serif',
    'font.sans-serif': ['Avenir', 'Gill Sans MT', 'Calibri', 'Arial', 'DejaVu Sans'],
    'axes.titleweight': 'bold', 'axes.labelweight': 'bold',
    'axes.titlepad': 25, 'axes.labelpad': 15,
    'axes.titlesize': 24, 'axes.labelsize': 18,
    'xtick.labelsize': 14, 'ytick.labelsize': 14,
    'legend.fontsize': 14, 'axes.spines.top': False,
    'axes.spines.right': False, 'axes.edgecolor': PALETTE['medium_grey'],
    'grid.color': PALETTE['light_grey'], 'grid.linestyle': '-',
    'grid.linewidth': 1, 'text.color': PALETTE['dark_grey'],
    'axes.titlecolor': PALETTE['primary_blue'], 'axes.labelcolor': PALETTE['dark_grey']
}

try:
    # sns.set_theme() itself rewrites font, font-size, spine, edge and grid
    # rcParams. Updating plt.rcParams first and calling set_theme() afterwards
    # silently discards most of the overrides, so they are handed to
    # set_theme() through `rc=`, which it applies last.
    sns.set_theme(style="whitegrid", rc=_THEME_RC)
except (KeyError, ValueError) as exc:
    # Raised for an rcParam name or value the installed matplotlib rejects.
    print(f"Custom plot style could not be applied ({exc}); using seaborn defaults.")
    sns.set_theme(style="whitegrid")


def load_and_clean_rental_data(data_path):
    """
    Loads the rental_data.csv, handles parsing issues, and cleans numeric columns.

    The header is read from the second row and the first column is used as the
    index. Price columns have '₹' and ',' stripped before numeric conversion;
    unparseable values become NaN. Rows missing 'Total price' or 'Super Area'
    are dropped, using whichever of those two columns the file contains.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame, or None (after printing a message) when the
        file cannot be read.
    """
    try:
        df = pd.read_csv(data_path, header=1, index_col=0)
    except FileNotFoundError:
        print(f"FATAL ERROR: The file '{data_path}' was not found.")
        return None
    except Exception as e:
        print(f"Error reading {data_path}: {e}. Ensure header is on the second row.")
        return None

    df.columns = df.columns.str.strip()

    price_cols = ['Price', 'Maintenance', 'Total price']
    for col in price_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col].astype(str).str.replace(r'[₹,]', '', regex=True), errors='coerce')

    other_numeric_cols = ['Super Area', 'Floor', 'Total floors', '# of bathrooms', '# of balcony']
    for col in other_numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # dropna(subset=...) raises KeyError for a label that is not a column.
    # Restricting the subset to columns that exist keeps the loader from
    # crashing here, outside the callers' own missing-column handling.
    required_present = [col for col in ('Total price', 'Super Area') if col in df.columns]
    if required_present:
        df = df.dropna(subset=required_present)
    return df

# ==============================================================================
# 2. ADVANCED VISUALIZATION FUNCTIONS
# ==============================================================================

def plot_market_composition_treemap(data_path, save_path="5_market_composition_treemap.html"):
    """
    Visual 5: Write an interactive treemap of the market composition.

    Tiles are nested by furnishing group and then by '# of rooms', sized by
    'Super Area' and coloured by total price expressed in lakhs. The chart is
    saved as HTML to `save_path`. Errors are printed rather than raised.
    """
    print("Generating Visual 5: Market Composition Treemap...")
    listings = load_and_clean_rental_data(data_path)
    if listings is None:
        return

    def _furnishing_bucket(value):
        # Anything other than the three named furnishing values is "Other".
        return value if value in ['Semi', 'Full', 'None'] else 'Other'

    try:
        listings['Furnishing_Group'] = listings['Furnishing'].apply(_furnishing_bucket)
        listings['Price_in_Lakhs'] = listings['Total price'] / 100000

        hierarchy = [px.Constant("All Properties"), 'Furnishing_Group', '# of rooms']
        treemap = px.treemap(
            listings,
            path=hierarchy,
            values='Super Area',
            color='Price_in_Lakhs',
            color_continuous_scale='YlGnBu',
            title="Market Composition: Inventory by Furnishing and Property Type",
            hover_data={'Price_in_Lakhs':':.2f'}
        )
        treemap.update_layout(
            title_font_size=24,
            font_family="Avenir, Gill Sans MT, Calibri, sans-serif",
            margin=dict(t=50, l=25, r=25, b=25)
        )
        treemap.data[0].textinfo = 'label+value+percent root'
        treemap.write_html(save_path)
        print(f"  -> Saved interactive treemap to {save_path}")
    except KeyError as e:
        print(f"  -> ERROR: Missing required column {e}. Available: {listings.columns.tolist()}")
    except Exception as e:
        print(f"  -> An unexpected error occurred: {e}")

def plot_geo_heatmap(data_path, save_path="6_geo_heatmap.html"):
    """
    Visual 6: Generates a geographic heatmap of price per square foot.
    *** CRITICAL DATA REQUIREMENT: Your CSV must have a 'Locality' column. ***

    Listings are averaged per locality and joined to the Bengaluru ward
    polygons by name; the result is saved as an interactive folium map.

    Args:
        data_path: Path of the rental CSV passed to load_and_clean_rental_data.
        save_path: Output HTML path.

    Returns:
        None. Prints a message and returns early when the data cannot be
        loaded or has no 'Locality' column; any failure while building the
        map is printed rather than raised.
    """
    print("Generating Visual 6: Geographic Rental Heatmap...")
    df = load_and_clean_rental_data(data_path)
    if df is None:
        return

    if 'Locality' not in df.columns:
        print("  -> FATAL ERROR for Visual 6: Your 'rental_data.csv' file MUST contain a column named 'Locality'.")
        print("  -> Please add locality information (e.g., 'Koramangala', 'Whitefield') and try again.")
        return

    try:
        # Zero area becomes NaN so one bad row cannot turn a locality mean into inf.
        df['Price_per_sqft'] = df['Total price'] / df['Super Area'].replace(0, np.nan)
        # The ward names below are stripped and title-cased. The join key on
        # this side needs the same normalisation, otherwise differences in
        # case or surrounding whitespace prevent rows from matching.
        df['Locality'] = df['Locality'].str.strip().str.title()
        locality_stats = df.groupby('Locality').agg(
            Avg_Price_SqFt=('Price_per_sqft', 'mean'),
            Listing_Count=('Locality', 'size')
        ).reset_index()

        # Download Bengaluru wards GeoJSON data
        geojson_url = "https://raw.githubusercontent.com/datameet/Municipal_Spatial_Data/master/Bengaluru/BBMP_Wards.json"
        print("  -> Downloading Bengaluru geospatial data...")
        geo_df = gpd.read_file(geojson_url)
        geo_df['WARD_NAME'] = geo_df['WARD_NAME'].str.strip().str.title() # Standardize naming

        # Merge rental data with geo data
        # This is an imperfect match (Ward Name vs Locality) but demonstrates the technique.
        # A better match would use a more granular GeoJSON or pre-cleaned locality names.
        merged_df = geo_df.merge(locality_stats, left_on='WARD_NAME', right_on='Locality', how='left')

        # Create the map
        m = folium.Map(location=[12.9716, 77.5946], zoom_start=11, tiles='CartoDB positron')

        folium.Choropleth(
            geo_data=merged_df,
            name='choropleth',
            data=merged_df,
            columns=['WARD_NAME', 'Avg_Price_SqFt'],
            key_on='feature.properties.WARD_NAME',
            fill_color='YlGnBu',
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name='Average Price per Sq. Ft. (₹)',
            nan_fill_color='white'
        ).add_to(m)

        m.save(save_path)
        print(f"  -> Saved interactive map to {save_path}")
    except Exception as e:
        print(f"  -> An error occurred during map generation: {e}")
        print("  -> This may be due to network issues or missing libraries (geopandas, folium).")

def plot_marginal_price_impact(data_path, save_path="7_marginal_price_impact.png"):
    """
    Visual 7: Builds a model to quantify the price impact of key features.

    A Ridge regression is fitted on scaled numeric features plus one-hot
    encoded 'Facing'. The impact of a feature change is the difference between
    the model's prediction for the first training row and its prediction for
    that row with the feature increased (+1 bathroom, +100 sq. ft.).

    Args:
        data_path: Path of the rental CSV passed to load_and_clean_rental_data.
        save_path: Output image path.

    Returns:
        None. Prints a message and returns early when the data cannot be
        loaded or no complete rows remain; any failure during modelling or
        plotting is printed rather than raised.
    """
    print("Generating Visual 7: Marginal Price Impact Analysis...")
    df = load_and_clean_rental_data(data_path)
    if df is None:
        return

    try:
        # 1. Feature Engineering & Selection
        # 'Total floors' == 0 would make the ratio +/-inf, which fillna() does
        # not catch and StandardScaler rejects. Zeros are mapped to NaN first
        # so those rows fall back to the neutral 0.5 as well.
        df['Floor_Ratio'] = (df['Floor'] / df['Total floors'].replace(0, np.nan)).fillna(0.5)
        features = df[['Super Area', '# of bathrooms', 'Floor_Ratio', 'Facing', 'Total price']].copy()
        features.dropna(inplace=True)
        if features.empty:
            print("  -> Skipping: No complete rows available to train the model.")
            return

        X = features.drop('Total price', axis=1)
        y = features['Total price']

        # 2. Preprocessing Pipeline
        numeric_features = ['Super Area', '# of bathrooms', 'Floor_Ratio']
        categorical_features = ['Facing']

        preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

        # 3. Model Pipeline
        model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', Ridge(alpha=1.0))])
        model.fit(X, y)

        # 4. Estimate Feature Impacts (simplified for presentation)
        # The first row serves as the reference property for every comparison.
        base_price = model.predict(X.head(1))[0]
        impacts = {}

        for col in ['# of bathrooms']:
            X_temp = X.head(1).copy()
            X_temp[col] += 1
            impacts[f"+1 {col.replace('_', ' ').title()}"] = model.predict(X_temp)[0] - base_price

        X_temp = X.head(1).copy()
        X_temp['Super Area'] += 100
        impacts["+100 Super Area (sq.ft)"] = model.predict(X_temp)[0] - base_price

        impact_df = pd.DataFrame(impacts.items(), columns=['Feature', 'Impact (₹)']).sort_values('Impact (₹)')

        # 5. Plotting
        fig, ax = plt.subplots(figsize=(14, 8))
        # Positive impacts in the theme blue, non-positive ones in red.
        colors = [PALETTE['primary_blue'] if x > 0 else '#d62728' for x in impact_df['Impact (₹)']]
        ax.barh(impact_df['Feature'], impact_df['Impact (₹)'], color=colors)

        ax.set_title("What's a Feature Worth? Quantifying the Price Impact", loc='left')
        ax.set_xlabel("Estimated Impact on Monthly Rent (₹)")
        ax.set_ylabel("Feature Change")
        ax.axvline(0, color=PALETTE['medium_grey'], linestyle='--', linewidth=1)

        # Add labels to bars
        for i, v in enumerate(impact_df['Impact (₹)']):
            ax.text(v + (50 if v > 0 else -250), i, f"₹{v:,.0f}", color=PALETTE['dark_grey'], va='center', fontweight='bold')

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  -> Saved to {save_path}")
    except Exception as e:
        print(f"  -> An error occurred during model training or plotting: {e}")

def plot_competitive_positioning_matrix(data_path, save_path="8_competitive_positioning.png"):
    """
    Visual 8: Scatter societies on a 2x2 competitive positioning matrix.

    Each society with more than two listings is placed by listing count (x)
    and average price per sq. ft. (y), with marker size following average
    super area. Dashed lines mark the medians of both axes and the four
    corners carry quadrant names. Errors are printed rather than raised.
    """
    print("Generating Visual 8: Competitive Positioning Matrix...")
    listings = load_and_clean_rental_data(data_path)
    if listings is None:
        return

    try:
        listings['Price_per_sqft'] = listings['Total price'] / listings['Super Area']
        per_society = listings.groupby('Society Name').agg(
            Avg_Price_SqFt=('Price_per_sqft', 'mean'),
            Listing_Count=('Society Name', 'size'),
            Avg_Super_Area=('Super Area', 'mean')
        ).reset_index()

        # Societies with only one or two listings are left out.
        has_enough_listings = per_society['Listing_Count'] > 2
        per_society = per_society[has_enough_listings]
        if per_society.empty:
            print("  -> Skipping: Not enough societies with 3+ listings to analyze.")
            return

        fig, ax = plt.subplots(figsize=(18, 12))
        sns.scatterplot(
            data=per_society,
            x='Listing_Count',
            y='Avg_Price_SqFt',
            size='Avg_Super_Area',
            sizes=(50, 1500),
            hue='Avg_Price_SqFt',
            palette='viridis',
            alpha=0.7,
            ax=ax,
            legend='brief'
        )

        # Median guides split the plane into four quadrants.
        guide_style = dict(color=PALETTE['medium_grey'], linestyle='--', lw=1.5)
        ax.axhline(per_society['Avg_Price_SqFt'].median(), **guide_style)
        ax.axvline(per_society['Listing_Count'].median(), **guide_style)

        # Quadrant names, positioned in axes-fraction coordinates.
        corner_labels = (
            (0.97, 0.97, "Established Premium", 'right', 'top'),
            (0.03, 0.97, "Niche Luxury", 'left', 'top'),
            (0.97, 0.03, "Mass Market", 'right', 'bottom'),
            (0.03, 0.03, "Emerging Value", 'left', 'bottom'),
        )
        for x_frac, y_frac, caption, h_align, v_align in corner_labels:
            plt.text(
                x_frac, y_frac, caption, transform=ax.transAxes,
                ha=h_align, va=v_align, fontsize=18, fontweight='bold',
                color=PALETTE['dark_grey']
            )

        ax.set_title("Competitive Landscape: Society Positioning Matrix", loc='left')
        ax.set_xlabel("Market Presence (Number of Listings)")
        ax.set_ylabel("Price Point (Average Price per Sq.Ft.)")
        ax.legend(title="Avg. Super Area")

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  -> Saved to {save_path}")
    except Exception as e:
        print(f"  -> An error occurred during plotting: {e}")

def plot_deal_finder_analysis(data_path, save_path="9_deal_finder_analysis.png"):
    """
    Visual 9: Compares actual rent vs. model-predicted rent to find deals.

    A Ridge regression is fitted on super area, bathroom/balcony counts,
    relative floor position and facing, and is then used to predict the rent
    of the very same listings (in-sample). The 10 listings priced furthest
    below and the 10 priced furthest above their prediction are drawn as a
    horizontal bar chart of percentage deviation.

    Args:
        data_path: Path handed to ``load_and_clean_rental_data``.
        save_path: File the chart is written to.

    Returns:
        None. Progress and errors are reported via ``print``; any exception
        raised while modelling or plotting is caught and printed.
    """
    print("Generating Visual 9: Deal-Finder Analysis...")
    df = load_and_clean_rental_data(data_path)
    if df is None:
        return

    fig = None
    try:
        # Intended to mirror the model used for Visual 7 (that code is not
        # visible here -- verify the two stay in sync).
        # A zero 'Total floors' would yield +/-inf, which fillna() does not
        # catch and which makes the scaler reject the whole dataset; mask it
        # so such rows fall back to the neutral 0.5 ratio instead.
        total_floors = df['Total floors'].where(df['Total floors'] != 0)
        df['Floor_Ratio'] = (df['Floor'] / total_floors).fillna(0.5)

        numeric_features = ['Super Area', '# of bathrooms', '# of balcony', 'Floor_Ratio']
        categorical_features = ['Facing']
        # Kept alongside the model inputs only to label the bars afterwards.
        label_columns = ['Society Name', '# of rooms']
        target = 'Total price'

        # reset_index guarantees a unique index, which the overlap filter
        # further down relies on.
        features = (
            df[numeric_features + categorical_features + [target] + label_columns]
            .dropna()
            .reset_index(drop=True)
        )
        if features.empty:
            print("  -> Skipping: No listings with complete data to model.")
            return

        model_inputs = features[numeric_features + categorical_features]
        y = features[target]

        preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ], remainder='drop')

        model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', Ridge(alpha=1.0))])
        model.fit(model_inputs, y)

        features['Predicted_Price'] = model.predict(model_inputs)
        features['Price_Difference'] = features[target] - features['Predicted_Price']
        features['Price_Difference_Percent'] = (features['Price_Difference'] / features['Predicted_Price']) * 100

        # Select top 10 undervalued and top 10 overvalued properties
        undervalued = features.sort_values('Price_Difference_Percent', ascending=True).head(10)
        overvalued = features.sort_values('Price_Difference_Percent', ascending=False).head(10)
        # With fewer than 20 usable rows the two selections overlap; keep
        # each listing only once so it is not drawn twice.
        overvalued = overvalued.loc[~overvalued.index.isin(undervalued.index)]
        deal_df = pd.concat([undervalued, overvalued]).sort_values('Price_Difference_Percent')
        # astype(str) keeps the label working when '# of rooms' is numeric.
        deal_df['Listing_Name'] = (
            deal_df['Society Name'].astype(str) + " (" + deal_df['# of rooms'].astype(str) + ")"
        )

        # Plotting
        fig, ax = plt.subplots(figsize=(16, 10))
        colors = [PALETTE['accent_green'] if x < 0 else '#d62728' for x in deal_df['Price_Difference_Percent']]

        # Bars are placed by position rather than by label: identical labels
        # (same society and room count) would otherwise be mapped onto one
        # categorical slot and drawn on top of each other.
        y_positions = list(range(len(deal_df)))
        ax.barh(y_positions, deal_df['Price_Difference_Percent'], color=colors)
        ax.set_yticks(y_positions)
        ax.set_yticklabels(deal_df['Listing_Name'])
        ax.set_title("Opportunity Radar: Undervalued vs. Overvalued Listings", loc='left')
        ax.set_xlabel("Price Difference from Predicted Fair Value (%)")
        ax.set_ylabel("Property Listing")
        ax.axvline(0, color=PALETTE['medium_grey'], linestyle='--', linewidth=1)

        # Add annotation labels
        ax.text(0.25, 1.02, 'Potential Deals (Undervalued)', transform=ax.transAxes, ha='center', color=PALETTE['accent_green'], fontsize=14, fontweight='bold')
        ax.text(0.75, 1.02, 'Premium Priced (Overvalued)', transform=ax.transAxes, ha='center', color='#d62728', fontsize=14, fontweight='bold')

        formatter = mticker.FuncFormatter(lambda x, p: f'{x:.0f}%')
        ax.xaxis.set_major_formatter(formatter)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"  -> Saved to {save_path}")
    except Exception as e:
        print(f"  -> An error occurred during deal-finder analysis: {e}")
    finally:
        # Release the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)


# ==============================================================================
# 3. MAIN EXECUTION SCRIPT
# ==============================================================================
if __name__ == "__main__":
    # Horizontal rule shared by the opening and closing banners.
    _rule = "=" * 80

    print("\n" + _rule)
    print("      STARTING ADVANCED VISUALIZATION GENERATION FOR PRICING QUANT      ")
    print(_rule + "\n")

    # Source CSV that every visual reads from.
    original_data_path = 'rental_data.csv'

    # Each function takes the CSV path and writes its own output file;
    # they are executed one after another in this order.
    for _render in (
        plot_market_composition_treemap,
        plot_geo_heatmap,
        plot_marginal_price_impact,
        plot_competitive_positioning_matrix,
        plot_deal_finder_analysis,
    ):
        _render(original_data_path)

    print("\n" + _rule)
    print("   All advanced visuals have been generated and saved. Review the HTML and PNG files.  ")
    print(_rule)

All libraries imported successfully.

      STARTING ADVANCED VISUALIZATION GENERATION FOR PRICING QUANT      

Generating Visual 5: Market Composition Treemap...
  -> Saved interactive treemap to 5_market_composition_treemap.html
Generating Visual 6: Geographic Rental Heatmap...
  -> FATAL ERROR for Visual 6: Your 'rental_data.csv' file MUST contain a column named 'Locality'.
  -> Please add locality information (e.g., 'Koramangala', 'Whitefield') and try again.
Generating Visual 7: Marginal Price Impact Analysis...
  -> Saved to 7_marginal_price_impact.png
Generating Visual 8: Competitive Positioning Matrix...
  -> Saved to 8_competitive_positioning.png
Generating Visual 9: Deal-Finder Analysis...
  -> Saved to 9_deal_finder_analysis.png

   All advanced visuals have been generated and saved. Review the HTML and PNG files.  
