In [None]:
pip install apscheduler cloudscraper pandas beautifulsoup4



In [None]:
import cloudscraper
import time
import csv
from bs4 import BeautifulSoup

# Listing URL template; "{}" is replaced with the page number being fetched.
base_url = "https://www.realtor.com/realestateagents/columbus_oh?page={}"

# Pagination bounds: begin at page 1 and never request more than 200 pages.
page, max_pages = 1, 200

# Collects one dict per scraped agent.
agents_data = []

# CloudScraper session reused for every page request.
scraper = cloudscraper.create_scraper()

# Walk the paginated agent listing, appending one dict per agent card to
# `agents_data`. The loop ends at `max_pages`, on a failed request, on a
# non-200 response, or on the first page that contains no agent cards.

# Upper bound (seconds) for a single page fetch so a stalled connection cannot
# hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _text_or_na(tag):
    """Return the stripped text of a BeautifulSoup tag, or "N/A" if it is missing."""
    return tag.text.strip() if tag else "N/A"


while page <= max_pages:
    url = base_url.format(page)
    print(f"Scraping page {page}: {url}")  # Debugging: Show current page

    try:
        response = scraper.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except Exception as exc:
        # Stop instead of raising so the pages scraped so far are still
        # available to the code that runs after this loop.
        print(f"Error: Request failed ({exc}), stopping.")
        break

    if response.status_code != 200:
        print("Error: Failed to retrieve page, stopping.")
        break  # Stop if there's an error (e.g., no more pages)

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all agent cards on the page.
    # NOTE(review): the selectors below include a `jsx-...` class token that
    # looks build-generated -- confirm it is stable across site deployments.
    realtor_cards = soup.find_all("div", class_="jsx-3873707352 agent-list-card clearfix")

    if not realtor_cards:  # Stop when no more agent cards are found
        print("No more agents found, stopping.")
        break

    for realtor_card in realtor_cards:
        # GCI (activity range) is looked up inside the first detail item of the card.
        gci_container = realtor_card.find("div", class_="jsx-3873707352 agent-detail-item")
        gci_tag = gci_container.find("span", class_="jsx-3873707352 bold-text") if gci_container else None

        # Any field whose tag is missing is recorded as "N/A".
        agents_data.append({
            "Name": _text_or_na(realtor_card.find("span", class_="text-bold")),
            "Agency": _text_or_na(realtor_card.find("div", class_="agent-group")),
            "Experience": _text_or_na(realtor_card.find("span", class_="bold-text")),
            "GCI (Activity Range)": _text_or_na(gci_tag),
        })

    page += 1  # Move to the next page
    time.sleep(2)  # Delay to prevent blocking

# Write every collected agent to a CSV file, header row first.
csv_filename = "real_estate_agents.csv"
csv_columns = ["Name", "Agency", "Experience", "GCI (Activity Range)"]

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    for agent_row in agents_data:
        writer.writerow(agent_row)

print(f"Scraping completed! Data saved to {csv_filename}")


Scraping page 1: https://www.realtor.com/realestateagents/columbus_oh?page=1
Scraping page 2: https://www.realtor.com/realestateagents/columbus_oh?page=2
Scraping page 3: https://www.realtor.com/realestateagents/columbus_oh?page=3
Scraping page 4: https://www.realtor.com/realestateagents/columbus_oh?page=4
Scraping page 5: https://www.realtor.com/realestateagents/columbus_oh?page=5
Scraping page 6: https://www.realtor.com/realestateagents/columbus_oh?page=6
Scraping page 7: https://www.realtor.com/realestateagents/columbus_oh?page=7
Scraping page 8: https://www.realtor.com/realestateagents/columbus_oh?page=8
Scraping page 9: https://www.realtor.com/realestateagents/columbus_oh?page=9
Scraping page 10: https://www.realtor.com/realestateagents/columbus_oh?page=10
Scraping page 11: https://www.realtor.com/realestateagents/columbus_oh?page=11
Scraping page 12: https://www.realtor.com/realestateagents/columbus_oh?page=12
Scraping page 13: https://www.realtor.com/realestateagents/columbus_oh

# **Preprocessing**

### Handle missing and duplicate values
### Add a rating based on GCI



In [None]:
import pandas as pd
import numpy as np

# Load the scraped agents. The relative path is the same file name the
# scraping cell writes, so the notebook does not depend on a Colab-specific
# absolute directory.
csv_filename = "real_estate_agents.csv"
df = pd.read_csv(csv_filename)

# Display initial data
print("Initial Data:")
print(df.head())

# Strip stray whitespace from the headers first; otherwise a header such as
# "GCI (Activity Range) " would not match the rename below and the later
# df["GCI"] lookups would fail.
df.columns = df.columns.str.strip()

# Shorten the GCI column name for the processing steps that follow.
df = df.rename(columns={"GCI (Activity Range)": "GCI"})

# Convert GCI to numerical values
# Multipliers for the magnitude suffixes used in the scraped amounts.
_GCI_SUFFIX_MULTIPLIERS = {"K": 1000, "M": 1000000}


def _parse_gci_amount(token):
    """Convert a single amount such as "$74.5K" into a float number of dollars.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    cleaned = token.replace("$", "").replace(",", "").strip()
    multiplier = 1
    if cleaned and cleaned[-1].upper() in _GCI_SUFFIX_MULTIPLIERS:
        multiplier = _GCI_SUFFIX_MULTIPLIERS[cleaned[-1].upper()]
        cleaned = cleaned[:-1]
    # Scale numerically. Substituting "000" for the suffix in the text would
    # turn "74.5K" into "74.5000" (= 74.5) instead of 74500.
    return float(cleaned) * multiplier


def extract_gci(gci):
    """Convert a GCI range string into the average of its bounds, in dollars.

    Args:
        gci: Raw cell value, e.g. "$240K - $359K" or a single amount "$1.5M".

    Returns:
        The mean of the "-"-separated amounts as a float, or NaN when `gci`
        is not a string, contains no "$", or has a part that cannot be parsed.
    """
    if not isinstance(gci, str):
        return np.nan

    gci = gci.strip()  # Remove extra spaces
    if "$" not in gci:
        return np.nan

    try:
        amounts = [_parse_gci_amount(part) for part in gci.split("-")]
    except ValueError:
        return np.nan  # At least one part was not a number
    return sum(amounts) / len(amounts)  # Average of min-max range

# Parse each GCI string into a number and force the column to a numeric dtype.
df["GCI"] = pd.to_numeric(df["GCI"].apply(extract_gci), errors="coerce")

# Discard rows whose GCI is NaN, then keep only the first occurrence of each
# (Name, Agency, Experience, GCI) combination -- done before ratings are assigned.
df = df.dropna(subset=["GCI"]).drop_duplicates(
    subset=["Name", "Agency", "Experience", "GCI"],
    keep="first",
)

# Create a synthetic "Rating" column based on GCI
# (minimum GCI, lowest rating, highest rating), checked from the top tier down.
_RATING_TIERS = (
    (1000000, 4.5, 5.0),  # Top agents get 4.5 - 5.0 rating
    (500000, 4.0, 4.5),
    (100000, 3.5, 4.0),
    (50000, 3.0, 3.5),
)
# Rating range for any GCI below the lowest tier.
_LOWEST_RATING_RANGE = (2.0, 3.0)


def assign_rating(gci, rng=None):
    """Draw a synthetic rating from the range associated with the GCI tier.

    Args:
        gci: Gross commission income; anything `float()` accepts.
        rng: Optional object with a `uniform(low, high)` method, e.g. a
            `numpy.random.Generator`. Defaults to NumPy's global `np.random`,
            so existing one-argument calls behave as before.

    Returns:
        A float drawn uniformly from the tier's rating range, or NaN when
        `gci` is NaN.
    """
    gci = float(gci)  # Ensure GCI is a number
    if gci != gci:
        # NaN fails every ">=" comparison and would otherwise land in the
        # lowest tier; a missing GCI gets no rating instead.
        return np.nan

    source = np.random if rng is None else rng
    for minimum_gci, low, high in _RATING_TIERS:
        if gci >= minimum_gci:
            return source.uniform(low, high)
    return source.uniform(*_LOWEST_RATING_RANGE)  # Low GCI agents get 2.0 - 3.0

# assign_rating draws from NumPy's global random generator; seed it so the
# ratings -- and therefore the saved CSV -- are identical on every run.
RATING_SEED = 42
np.random.seed(RATING_SEED)

# One synthetic rating per agent, rounded to 1 decimal place.
df["Rating"] = df["GCI"].apply(assign_rating).round(1)

# Full-row de-duplication. Adding a Rating column cannot create duplicate
# rows, so this only removes rows that were already identical in every column.
df = df.drop_duplicates()

# Filter out irrelevant leads (Keep only those with GCI > 50K)
df = df[df["GCI"] > 50000]  # Fixed threshold

# Highest GCI first; ties are ordered by highest Rating.
df = df.sort_values(by=["GCI", "Rating"], ascending=[False, False])

# Save cleaned & filtered data to a new CSV file
filtered_csv_filename = "filtered_real_estate_agents.csv"
df.to_csv(filtered_csv_filename, index=False)

print(f"Data processing completed! Cleaned data saved to {filtered_csv_filename}")

# Show final processed data
print("\nProcessed Data:")
print(df.head())


Initial Data:
                    Name                          Agency         Experience  \
0     Julian M Mcclurkin                    Real of Ohio  4 years 11 months   
1   Phillip Warren Stern                       Realistar                 27   
2        Alexander Homer             RE/MAX LEADING EDGE           5 months   
3  Christopher Todd Boyd             RE MAX Leading Edge   9 years 5 months   
4       CAROLINE SHROYER  Keller Williams Consultants Re  29 years 5 months   

  GCI (Activity Range)  
0        $240K - $359K  
1         $10K - $409K  
2       $74.5K - $480K  
3       $59.9K - $480K  
4        $280K - $400K  
Data processing completed! Cleaned data saved to filtered_real_estate_agents.csv

Processed Data:
                     Name                                         Agency  \
19            Jason Gould                                   Red 1 Realty   
18           Amanda Smith  Howard Hanna Real Estate Services - Pataskala   
15  MARY MCMICHAEL-LISTON          