In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import re
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [6]:


class RealEstateAnalyzer:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.sale_properties = pd.DataFrame()
        self.rental_properties = pd.DataFrame()
        self.analysis_results = pd.DataFrame()
    
    def get_sale_properties(self, postcode):
        """Scrape property sale listings for a given postcode"""
        properties = []
        page = 1
        
        print(f"Scraping sale properties for postcode {postcode}...")
        
        while True:
            url = f"https://www.realestate.com.au/buy/in-{postcode}/list-{page}"
            try:
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Check if we've reached the end of results
                no_results = soup.find('div', class_='noneMessage')
                if no_results or "no exact matches" in soup.text.lower():
                    if page == 1:
                        print(f"No sale properties found for postcode {postcode}")
                    break
                
                # Extract property data
                property_cards = soup.find_all('div', class_=lambda c: c and 'residential-card' in c)
                
                if not property_cards:
                    break
                
                for card in property_cards:
                    try:
                        # Extract address
                        address_elem = card.find('a', class_=lambda c: c and 'address' in c)
                        address = address_elem.text.strip() if address_elem else "N/A"
                        
                        # Extract price
                        price_elem = card.find('span', class_=lambda c: c and 'price' in c)
                        price_text = price_elem.text.strip() if price_elem else "N/A"
                        
                        # Extract property details
                        details_elem = card.find('div', class_=lambda c: c and 'property-features' in c)
                        beds = baths = cars = "N/A"
                        land_size = "N/A"
                        
                        if details_elem:
                            feature_elements = details_elem.find_all('span', class_=lambda c: c and 'feature' in c)
                            for feature in feature_elements:
                                feature_text = feature.text.strip()
                                if 'bed' in feature_text.lower():
                                    beds = re.sub(r'[^\d.]', '', feature_text)
                                elif 'bath' in feature_text.lower():
                                    baths = re.sub(r'[^\d.]', '', feature_text)
                                elif 'car' in feature_text.lower() or 'park' in feature_text.lower():
                                    cars = re.sub(r'[^\d.]', '', feature_text)
                                elif 'm²' in feature_text.lower():
                                    land_size = re.sub(r'[^\d.]', '', feature_text)
                        
                        # Extract property URL
                        property_url = "https://www.realestate.com.au" + address_elem['href'] if address_elem and 'href' in address_elem.attrs else "N/A"
                        
                        # Extract property type
                        property_type = "N/A"
                        type_elem = card.find('span', class_=lambda c: c and 'property-type' in c)
                        if type_elem:
                            property_type = type_elem.text.strip()
                        
                        # Process price
                        clean_price = None
                        if price_text != "N/A":
                            # Extract numeric price if available
                            if any(x in price_text.lower() for x in ['$', 'aud', 'k', 'm']):
                                clean_price_text = re.sub(r'[^\d.]', '', price_text)
                                if 'k' in price_text.lower():
                                    clean_price = float(clean_price_text) * 1000
                                elif 'm' in price_text.lower():
                                    clean_price = float(clean_price_text) * 1000000
                                else:
                                    clean_price = float(clean_price_text) if clean_price_text else None
                        
                        properties.append({
                            'address': address,
                            'postcode': postcode,
                            'price_text': price_text,
                            'price': clean_price,
                            'beds': beds,
                            'baths': baths,
                            'cars': cars,
                            'land_size': land_size,
                            'property_type': property_type,
                            'url': property_url
                        })
                    except Exception as e:
                        print(f"Error extracting property data: {e}")
                
                page += 1
                
                # Random delay to avoid getting blocked
                time.sleep(random.uniform(1, 3))
                
            except Exception as e:
                print(f"Error fetching page {page} for postcode {postcode}: {e}")
                break
        
        return pd.DataFrame(properties)
    
    def get_rental_properties(self, postcode):
        """Scrape property rental listings for a given postcode"""
        properties = []
        page = 1
        
        print(f"Scraping rental properties for postcode {postcode}...")
        
        while True:
            url = f"https://www.realestate.com.au/rent/in-{postcode}/list-{page}"
            try:
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Check if we've reached the end of results
                no_results = soup.find('div', class_='noneMessage')
                if no_results or "no exact matches" in soup.text.lower():
                    if page == 1:
                        print(f"No rental properties found for postcode {postcode}")
                    break
                
                # Extract property data
                property_cards = soup.find_all('div', class_=lambda c: c and 'residential-card' in c)
                
                if not property_cards:
                    break
                
                for card in property_cards:
                    try:
                        # Extract address
                        address_elem = card.find('a', class_=lambda c: c and 'address' in c)
                        address = address_elem.text.strip() if address_elem else "N/A"
                        
                        # Extract price
                        price_elem = card.find('span', class_=lambda c: c and 'price' in c)
                        price_text = price_elem.text.strip() if price_elem else "N/A"
                        
                        # Extract property details
                        details_elem = card.find('div', class_=lambda c: c and 'property-features' in c)
                        beds = baths = cars = "N/A"
                        
                        if details_elem:
                            feature_elements = details_elem.find_all('span', class_=lambda c: c and 'feature' in c)
                            for feature in feature_elements:
                                feature_text = feature.text.strip()
                                if 'bed' in feature_text.lower():
                                    beds = re.sub(r'[^\d.]', '', feature_text)
                                elif 'bath' in feature_text.lower():
                                    baths = re.sub(r'[^\d.]', '', feature_text)
                                elif 'car' in feature_text.lower() or 'park' in feature_text.lower():
                                    cars = re.sub(r'[^\d.]', '', feature_text)
                        
                        # Extract property URL
                        property_url = "https://www.realestate.com.au" + address_elem['href'] if address_elem and 'href' in address_elem.attrs else "N/A"
                        
                        # Extract property type
                        property_type = "N/A"
                        type_elem = card.find('span', class_=lambda c: c and 'property-type' in c)
                        if type_elem:
                            property_type = type_elem.text.strip()
                        
                        # Process price - convert to weekly if showing as monthly
                        clean_price = None
                        weekly_price = None
                        
                        if price_text != "N/A":
                            if 'pw' in price_text.lower() or '/wk' in price_text.lower() or 'week' in price_text.lower():
                                clean_price_text = re.sub(r'[^\d.]', '', price_text)
                                weekly_price = float(clean_price_text) if clean_price_text else None
                            elif 'pcm' in price_text.lower() or '/month' in price_text.lower() or 'month' in price_text.lower():
                                clean_price_text = re.sub(r'[^\d.]', '', price_text)
                                monthly_price = float(clean_price_text) if clean_price_text else None
                                weekly_price = monthly_price * 12 / 52 if monthly_price else None
                            else:
                                # Default to assuming weekly if not specified
                                clean_price_text = re.sub(r'[^\d.]', '', price_text)
                                weekly_price = float(clean_price_text) if clean_price_text else None
                        
                        properties.append({
                            'address': address,
                            'postcode': postcode,
                            'price_text': price_text,
                            'weekly_rent': weekly_price,
                            'annual_rent': weekly_price * 52 if weekly_price else None,
                            'beds': beds,
                            'baths': baths,
                            'cars': cars,
                            'property_type': property_type,
                            'url': property_url
                        })
                    except Exception as e:
                        print(f"Error extracting rental property data: {e}")
                
                page += 1
                
                # Random delay to avoid getting blocked
                time.sleep(random.uniform(1, 3))
                
            except Exception as e:
                print(f"Error fetching rental page {page} for postcode {postcode}: {e}")
                break
        
        return pd.DataFrame(properties)
    
    def preprocess_data(self):
        """Clean and prepare the data for analysis"""
        # Convert beds, baths, cars to numeric
        for df in [self.sale_properties, self.rental_properties]:
            for col in ['beds', 'baths', 'cars']:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        
        if 'land_size' in self.sale_properties.columns:
            self.sale_properties['land_size'] = pd.to_numeric(self.sale_properties['land_size'], errors='coerce')
        
        # Filter out properties with missing or invalid price data
        self.sale_properties = self.sale_properties.dropna(subset=['price'])
        self.rental_properties = self.rental_properties.dropna(subset=['weekly_rent'])
        
        # Create standardized address format for matching
        for df in [self.sale_properties, self.rental_properties]:
            df['address_matching'] = df['address'].str.lower().str.replace(r'\s+', ' ', regex=True)
    
    def match_properties(self):
        """Match rental and sale properties based on address and characteristics"""
        matched_properties = []
        
        # Group properties by number of bedrooms and property type
        sale_groups = self.sale_properties.groupby(['beds', 'property_type'])
        rental_groups = self.rental_properties.groupby(['beds', 'property_type'])
        
        common_keys = set(sale_groups.groups.keys()).intersection(set(rental_groups.groups.keys()))
        
        for key in common_keys:
            sale_properties_group = sale_groups.get_group(key)
            rental_properties_group = rental_groups.get_group(key)
            
            # Use median rental price for this property type/beds combo
            median_weekly_rent = rental_properties_group['weekly_rent'].median()
            median_annual_rent = median_weekly_rent * 52
            
            # Calculate yield for each sale property
            for _, sale_property in sale_properties_group.iterrows():
                # Calculate rental yield based on median rent for similar properties
                rental_yield = (median_annual_rent / sale_property['price']) * 100 if sale_property['price'] > 0 else np.nan
                
                matched_properties.append({
                    'address': sale_property['address'],
                    'postcode': sale_property['postcode'],
                    'beds': sale_property['beds'],
                    'baths': sale_property['baths'],
                    'property_type': sale_property['property_type'],
                    'sale_price': sale_property['price'],
                    'estimated_weekly_rent': median_weekly_rent,
                    'estimated_annual_rent': median_annual_rent,
                    'rental_yield': rental_yield,
                    'sale_url': sale_property['url'],
                })
        
        return pd.DataFrame(matched_properties)
    
    def analyze_undervalued_properties(self):
        """Identify potentially undervalued properties based on rental yield"""
        if self.analysis_results.empty:
            print("No matched properties found for analysis")
            return pd.DataFrame()
        
        # Calculate statistics by property type and bedrooms
        stats = self.analysis_results.groupby(['property_type', 'beds']).agg({
            'rental_yield': ['mean', 'median', 'std', 'count'],
            'sale_price': ['mean', 'median', 'min', 'max'],
            'estimated_weekly_rent': ['mean', 'median']
        }).reset_index()
        
        # Identify properties with above-average rental yield
        undervalued = pd.DataFrame()
        
        for prop_type in self.analysis_results['property_type'].unique():
            for bed_count in self.analysis_results[self.analysis_results['property_type'] == prop_type]['beds'].unique():
                # Get properties of this type and bedroom count
                group = self.analysis_results[(self.analysis_results['property_type'] == prop_type) & 
                                             (self.analysis_results['beds'] == bed_count)]
                
                if len(group) > 1:  # Need at least a few properties to make a comparison
                    median_yield = group['rental_yield'].median()
                    std_yield = group['rental_yield'].std()
                    
                    # Properties with yield > median + 0.5*std are considered potentially undervalued
                    threshold = median_yield + 0.5 * std_yield
                    
                    high_yield_properties = group[group['rental_yield'] > threshold].copy()
                    high_yield_properties['median_yield'] = median_yield
                    high_yield_properties['yield_threshold'] = threshold
                    high_yield_properties['yield_percentile'] = high_yield_properties['rental_yield'].rank(pct=True) * 100
                    
                    undervalued = pd.concat([undervalued, high_yield_properties])
        
        # Sort by rental yield in descending order
        undervalued = undervalued.sort_values(by='rental_yield', ascending=False).reset_index(drop=True)
        
        return undervalued
    
    def process_postcode(self, postcode):
        """Process a single postcode - scrape data and perform analysis"""
        # Get sale and rental properties
        self.sale_properties = self.get_sale_properties(postcode)
        self.rental_properties = self.get_rental_properties(postcode)
        
        # Check if we have enough data
        if len(self.sale_properties) == 0 or len(self.rental_properties) == 0:
            print(f"Insufficient data for postcode {postcode}. Sale properties: {len(self.sale_properties)}, Rental properties: {len(self.rental_properties)}")
            return
        
        # Preprocess the data
        self.preprocess_data()
        
        # Match properties and calculate yields
        self.analysis_results = self.match_properties()
        
        # Find undervalued properties
        self.undervalued_properties = self.analyze_undervalued_properties()
        
        return {
            'sale_properties': self.sale_properties,
            'rental_properties': self.rental_properties,
            'analysis_results': self.analysis_results,
            'undervalued_properties': self.undervalued_properties
        }
    
    def generate_reports(self):
        """Generate summary reports and visualizations"""
        if self.analysis_results.empty:
            print("No data available for reports")
            return
        
        # Summary statistics by property type
        summary_by_type = self.analysis_results.groupby('property_type').agg({
            'sale_price': ['median', 'mean', 'count'],
            'estimated_weekly_rent': ['median', 'mean'],
            'rental_yield': ['median', 'mean', 'min', 'max']
        })
        
        print("\n--- Summary by Property Type ---")
        print(summary_by_type)
        
        # Summary statistics by bedroom count
        summary_by_beds = self.analysis_results.groupby('beds').agg({
            'sale_price': ['median', 'mean', 'count'],
            'estimated_weekly_rent': ['median', 'mean'],
            'rental_yield': ['median', 'mean', 'min', 'max']
        })
        
        print("\n--- Summary by Bedroom Count ---")
        print(summary_by_beds)
        
        # Print top undervalued properties
        if not self.undervalued_properties.empty:
            print("\n--- Top 10 Potentially Undervalued Properties ---")
            top_properties = self.undervalued_properties.head(10)
            for i, (_, prop) in enumerate(top_properties.iterrows(), 1):
                print(f"{i}. {prop['address']} - {prop['property_type']} - {prop['beds']} bed")
                print(f"   Sale Price: ${prop['sale_price']:,.0f}")
                print(f"   Est. Weekly Rent: ${prop['estimated_weekly_rent']:.0f}")
                print(f"   Rental Yield: {prop['rental_yield']:.2f}% (vs median {prop['median_yield']:.2f}%)")
                print(f"   URL: {prop['sale_url']}")
                print()
        
        # Visualizations
        try:
            # Set plot style
            plt.style.use('seaborn-v0_8-whitegrid')
            
            # Plot 1: Distribution of Rental Yields
            plt.figure(figsize=(10, 6))
            sns.histplot(self.analysis_results['rental_yield'].dropna(), kde=True, bins=15)
            plt.title('Distribution of Rental Yields')
            plt.xlabel('Rental Yield (%)')
            plt.ylabel('Frequency')
            plt.axvline(self.analysis_results['rental_yield'].median(), color='r', linestyle='--', 
                       label=f'Median Yield: {self.analysis_results["rental_yield"].median():.2f}%')
            plt.legend()
            plt.tight_layout()
            plt.savefig('rental_yield_distribution.png')
            print("Saved rental yield distribution chart to 'rental_yield_distribution.png'")
            
            # Plot 2: Rental Yield by Property Type and Bedrooms
            plt.figure(figsize=(12, 7))
            plot_data = self.analysis_results[['property_type', 'beds', 'rental_yield']].dropna()
            sns.boxplot(x='property_type', y='rental_yield', hue='beds', data=plot_data)
            plt.title('Rental Yield by Property Type and Bedrooms')
            plt.xlabel('Property Type')
            plt.ylabel('Rental Yield (%)')
            plt.xticks(rotation=45)
            plt.legend(title='Bedrooms')
            plt.tight_layout()
            plt.savefig('rental_yield_by_type_and_beds.png')
            print("Saved rental yield by property type chart to 'rental_yield_by_type_and_beds.png'")
            
            # Plot 3: Price vs. Estimated Rent Scatterplot
            plt.figure(figsize=(10, 6))
            sns.scatterplot(data=self.analysis_results, x='sale_price', y='estimated_annual_rent', 
                           hue='property_type', size='beds', sizes=(50, 200), alpha=0.7)
            plt.title('Property Price vs. Estimated Annual Rent')
            plt.xlabel('Sale Price ($)')
            plt.ylabel('Estimated Annual Rent ($)')
            plt.tight_layout()
            plt.savefig('price_vs_rent_scatter.png')
            print("Saved price vs. rent scatterplot to 'price_vs_rent_scatter.png'")
            
        except Exception as e:
            print(f"Error generating visualizations: {e}")
    
    def export_data(self, postcode):
        """Export the analysis results to CSV files"""
        try:
            # Export sale properties
            if not self.sale_properties.empty:
                self.sale_properties.to_csv(f'sale_properties_{postcode}.csv', index=False)
                print(f"Exported sale properties data to 'sale_properties_{postcode}.csv'")
            
            # Export rental properties
            if not self.rental_properties.empty:
                self.rental_properties.to_csv(f'rental_properties_{postcode}.csv', index=False)
                print(f"Exported rental properties data to 'rental_properties_{postcode}.csv'")
            
            # Export analysis results
            if not self.analysis_results.empty:
                self.analysis_results.to_csv(f'property_analysis_{postcode}.csv', index=False)
                print(f"Exported property analysis data to 'property_analysis_{postcode}.csv'")
            
            # Export undervalued properties
            if not self.undervalued_properties.empty:
                self.undervalued_properties.to_csv(f'undervalued_properties_{postcode}.csv', index=False)
                print(f"Exported undervalued properties data to 'undervalued_properties_{postcode}.csv'")
                
        except Exception as e:
            print(f"Error exporting data: {e}")


def main():
    analyzer = RealEstateAnalyzer()
    
    print("===== Real Estate Property Analyzer =====")
    print("This tool extracts property data from realestate.com.au, analyzes rental yields,")
    print("and identifies potentially undervalued properties for investment.")
    print("\nNOTE: This is for educational purposes only. Using web scraping for commercial")
    print("purposes may violate realestate.com.au's terms of service.")
    print("======================================\n")
    
    while True:
        postcode = input("Enter a postcode to analyze (or 'quit' to exit): ")
        
        if postcode.lower() in ['quit', 'exit', 'q']:
            break
        
        if not postcode.isdigit() or len(postcode) != 4:
            print("Please enter a valid 4-digit Australian postcode.")
            continue
        
        try:
            print(f"\nProcessing postcode {postcode}...")
            results = analyzer.process_postcode(postcode)
            
            if results:
                # Generate and display reports
                analyzer.generate_reports()
                
                # Export data to CSV
                export_option = input("\nDo you want to export the data to CSV files? (y/n): ")
                if export_option.lower() == 'y':
                    analyzer.export_data(postcode)
            
        except Exception as e:
            print(f"An error occurred: {e}")
        
        print("\n-------------------------------------------\n")


if __name__ == "__main__":
    main()

===== Real Estate Property Analyzer =====
This tool extracts property data from realestate.com.au, analyzes rental yields,
and identifies potentially undervalued properties for investment.

NOTE: This is for educational purposes only. Using web scraping for commercial
purposes may violate realestate.com.au's terms of service.


Processing postcode 2137...
Scraping sale properties for postcode 2137...
Error fetching page 1 for postcode 2137: 429 Client Error: Too Many Requests for url: https://www.realestate.com.au/buy/in-2137/list-1
Scraping rental properties for postcode 2137...
Error fetching rental page 1 for postcode 2137: 429 Client Error: Too Many Requests for url: https://www.realestate.com.au/rent/in-2137/list-1
Insufficient data for postcode 2137. Sale properties: 0, Rental properties: 0

-------------------------------------------


Processing postcode 2137...
Scraping sale properties for postcode 2137...
Error fetching page 1 for postcode 2137: 429 Client Error: Too Many Req

In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

def calculate_gross_yield(annual_rent, property_value):
    """Calculate gross rental yield using formula: (annual_rent / property_value) * 100"""
    return (annual_rent / property_value) * 100 if property_value != 0 else 0

def get_property_yields(postcode):
    """Retrieve property data for a given postcode with improved error handling"""
    try:
        url = f"https://www.realestate.com.au/rent/in-{postcode}+nsw/list-1"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        properties = []
        for listing in soup.find_all('div', class_=re.compile(r'listingInfo')):
            try:
                # Extract price using regex pattern
                price_text = listing.find('p', {'data-testid': 'price'}).get_text()
                price = float(re.search(r'\$([\d,]+)', price_text).group(1).replace(',', ''))
                
                # Extract weekly rent
                rent_text = listing.find('p', string=re.compile(r'\$[\d,]+ per week')).get_text()
                rent = float(re.search(r'\$([\d,]+)', rent_text).group(1).replace(',', ''))
                
                properties.append({
                    'price': price,
                    'weekly_rent': rent,
                    'annual_rent': rent * 52,
                    'gross_yield': calculate_gross_yield(rent * 52, price)
                })
            except (AttributeError, ValueError, TypeError) as e:
                continue
        
        df = pd.DataFrame(properties)
        
        # Handle empty results
        if df.empty:
            print(f"No properties found for postcode {postcode}")
            return df
            
        return df[['price', 'weekly_rent', 'gross_yield']]  # Select specific columns after DataFrame creation
    
    except Exception as e:
        print(f"Error retrieving data: {str(e)}")
        return pd.DataFrame()

# Example usage with error handling
postcode_data = get_property_yields("2137")

if not postcode_data.empty:
    print(postcode_data.head())
else:
    print("No data available for the specified postcode")


Error retrieving data: 429 Client Error: Too Many Requests for url: https://www.realestate.com.au/rent/in-2137+nsw/list-1
No data available for the specified postcode


In [14]:
import requests
import random
import time

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    # Add more if you like
]

def get_with_retry(url, max_retries=5):
    retries = 0
    while retries < max_retries:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        response = requests.get(url, headers=headers)
        if response.status_code == 429:
            wait = 2 ** retries
            print(f"429 received. Waiting {wait} seconds before retry...")
            time.sleep(wait)
            retries += 1
        else:
            response.raise_for_status()
            return response
    raise Exception("Max retries reached. Still getting 429 errors.")

# Example usage:
url = "https://www.realestate.com.au/rent/in-2137+nsw/list-1"
try:
    response = get_with_retry(url)
    print("Success!")
    # Continue with your BeautifulSoup parsing...
except Exception as e:
    print(f"Failed to retrieve data: {e}")


429 received. Waiting 1 seconds before retry...
429 received. Waiting 2 seconds before retry...
429 received. Waiting 4 seconds before retry...
429 received. Waiting 8 seconds before retry...
429 received. Waiting 16 seconds before retry...
Failed to retrieve data: Max retries reached. Still getting 429 errors.


NameError: name 'response' is not defined