Source data: listings scraped from realtor.com, downloaded from Kaggle:
https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset/data

In [None]:
#import libraries
import pandas as pd
import numpy as np

In [None]:
# Load the raw listings CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv('realtor-data.csv')
df.head()

In [None]:
# Summary statistics for the full dataset, before any filtering.
df.describe()

In [None]:
# Restrict the dataset to Texas listings; every later cell works on this subset.
df = df.loc[df['state'].eq("Texas")]
df.head()

In [None]:
# Summary statistics again, now for the Texas-only subset.
df.describe()

In [None]:
# List the 150 brokers with the most Texas listings
# (number of rows per `brokered_by` value, largest first).
df['brokered_by'].value_counts().nlargest(150)

In [None]:
#df[df['brokered_by'] == 53016]

In [None]:
# Drop listings that have no price; rows with a missing `price` are removed.
df = df.dropna(subset=['price'])

In [None]:
# create random dates
def random_dates(start, end, n, unit='D', seed=None):
    """Return ``n`` random timestamps between ``start`` and ``end`` (inclusive).

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window.
    n : int
        Number of timestamps to generate.
    unit : str, default 'D'
        Unit handed to ``pd.to_timedelta`` for the random offsets. The number
        of possible offsets is computed in whole days, so the result only
        spans exactly ``[start, end]`` when ``unit='D'``.
    seed : int, optional
        Seed for NumPy's global random generator. When omitted, 0 is used so
        that repeated runs produce the same dates.

    Returns
    -------
    The result of ``start + pd.to_timedelta(offsets, unit=unit)``, holding
    ``n`` values.
    """
    # Compare against None so that an explicit seed (including 0) is honoured.
    # The previous truthiness check skipped seeding whenever a seed was given,
    # which made seeded calls non-reproducible.
    np.random.seed(0 if seed is None else seed)

    ndays = (end - start).days + 1  # +1 makes `end` itself reachable
    offsets = np.random.randint(0, ndays, n)
    return start + pd.to_timedelta(offsets, unit=unit)

In [None]:
# Window for the synthetic publishing dates: 1 January to 30 June 2024 (UTC).
# The date strings are written day-first (DD-MM-YYYY), hence dayfirst=True.
start = pd.to_datetime('01-01-2024', dayfirst=True, utc=True)
end = pd.to_datetime('30-06-2024', dayfirst=True, utc=True)
rows = len(df)  # one random date per listing currently in df
dates = random_dates(start, end, n=rows)

In [None]:
# Attach the random dates to the dataset as the new column `date_published`,
# which acts as the (synthetic) publishing date of each listing.
df = df.assign(date_published =  dates)

In [None]:
# Keep only listings whose house_size is at most 20000 sqft.
# NOTE(review): rows with a missing house_size fail this comparison and are
# dropped as well - confirm that this is intended.
df = df.loc[df['house_size'].le(20000)]

In [None]:
# assign a random sale date to data
def compute_date_sold(row):
    """Return a synthetic sale date for a single listing.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['status']`` and ``row['date_published']``.

    Returns
    -------
    ``pd.NaT`` when the status is ``'for_sale'``; otherwise the publishing
    date plus a random number of days between 0 and 60 (inclusive).
    """
    if row['status'] == 'for_sale':
        # A listing that is still for sale has no sale date. NaT is the
        # missing-value marker for datetimes; the former integer 0 was not a
        # date and only turned into NaT through later error coercion.
        return pd.NaT

    random_days = np.random.randint(0, 61)  # upper bound is exclusive: 0..60
    return pd.to_datetime(row['date_published'] + pd.Timedelta(days=random_days))

# Build the date_sold column row by row and normalise it to datetimes;
# any value that cannot be interpreted as a date is coerced to NaT.
df['date_sold'] = pd.to_datetime(
    df.apply(compute_date_sold, axis=1),
    format='%Y-%m-%d',
    errors='coerce',
)

In [None]:
# Fixed pool of example listing URLs that get attached to the listings.
links_list = [
    'https://www.zillow.com/homedetails/2902-Meridian-Bay-Ln-Dickinson-TX-77539/71472548_zpid/',
    'https://www.zillow.com/homedetails/3015-Misty-Isle-Ct-Dickinson-TX-77539/59823326_zpid/',
    'https://www.zillow.com/homedetails/203-Armand-Bay-Dr-Dickinson-TX-77539/50447427_zpid/',
    'https://www.zillow.com/homedetails/201-Creekside-Dr-League-City-TX-77573/27647595_zpid/',
    'https://www.zillow.com/homedetails/4508-Brookstone-Ln-League-City-TX-77573/50445179_zpid/',
]

In [None]:
# Assign every listing one of the example links from links_list, picked at random.

df['links'] = np.random.choice(links_list, size=len(df))
def make_clickable(val):
    """Wrap ``val`` in an HTML anchor that uses it as both target and label.

    The anchor carries ``target="_blank"`` so the link opens in a new window.
    """
    return f'<a target="_blank" href="{val}">{val}</a>'

# Display the DataFrame with the `links` column rendered through make_clickable.
# The Styler is not stored, so `df` itself is left unchanged.
df.style.format({'links': make_clickable})

In [None]:
# Export the complete prepared Texas dataset to CSV in the working directory.
df.to_csv('real_estate_broker_data_texas.csv')

In [None]:
# Preview the first rows of the prepared dataset.
df.head()

In [None]:
# Check the dtype of the date_sold column after the datetime conversion.
df['date_sold'].dtype

In [None]:
# Check the dtype of the price column.
df['price'].dtype

In [None]:
# Draw a reproducible random sample containing 50% of the listings
# (random_state fixes the selection across runs).
df_sample = df.sample(frac=0.5, random_state=1)
# Export the sample itself. The previous code wrote the full `df` to this
# file, so the "sample" CSV was identical to the complete export.
df_sample.to_csv('sample_real_estate_broker_data_texas.csv')