In [2]:
pip install python-geohash

Collecting python-geohash
  Downloading python-geohash-0.8.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-geohash
  Building wheel for python-geohash (setup.py) ... [?25l[?25hdone
  Created wheel for python-geohash: filename=python_geohash-0.8.5-cp310-cp310-linux_x86_64.whl size=41537 sha256=ed658e8691826ca6f7cd61c4a6b72a252d9b5c96958c430aa7dc5a91db771777
  Stored in directory: /root/.cache/pip/wheels/19/e8/74/3f800ffdbb57c27a3fee3a695c7009769356448837c1f4f899
Successfully built python-geohash
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5


In [42]:
import json
import pandas as pd
import numpy as np
import math
from geohash import encode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler


# Assuming you have the JSON data in a file named 'data.json' in your Google Drive
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Specify the path to your JSON file
file_path = '/content/drive/MyDrive/Colab Notebooks/3253/ontario_housing_cleaned.json'

# Load the JSON data from the file line by line
records = []
with open(file_path, 'r') as f:
    for line in f:
        try:
            data = json.loads(line) # Load each line as a separate JSON object
            record = data.get('data') # Use .get() to handle potential missing 'data' key
            if record is not None: # Check if 'data' exists and is not None
                features = {}
                for feature_group in record.get('features', []): # Handle cases where 'features' might be missing
                    for feature_category in feature_group.get('value', []):
                        for feature in feature_category.get('value', []):
                            features[f"{feature_group['name']}_{feature_category['name']}_{feature['name']}"] = feature['value']

                record.update(features)  # Add the extracted features to the record
                record.pop('features', None)  # Remove the original nested features structure if it exists
                records.append(record)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {line}") # Log any lines that fail to parse

# Create the DataFrame
df = pd.DataFrame(records)

# Save the DataFrame to a CSV file
#df.to_csv('/content/drive/MyDrive/Colab Notebooks/3253/ontario_housing_cleaned.csv', index=False)


# Display the DataFrame
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                         address subPremise  \
0  2480 Prince Michael Dr S #210       #210   
1                 1176 Grange Rd              
2                 3511 Post Road              
3            95 Dundas St W #513       #513   
4          3137 William Rose Way              

                                         fullAddress       division      city  \
0  2480 Prince Michael Dr S #210 Oakville, ON L6H...  Halton Region  Oakville   
1                1176 Grange Rd Oakville, ON L6H 1P6  Halton Region  Oakville   
2                3511 Post Road Oakville, ON L6H 7W5                 Oakville   
3           95 Dundas St W #513 Oakville, ON L6M 5N4  Halton Region  Oakville   
4         3137 William Rose Way Oakville, ON L6H 0T1  Halton Region  Oakville   

   cityCode closePrice closeDate daysOnMovoto  daysOnMovotoRaw  ... is3DTour  \
0    2506.0       Non

In [43]:
# Drop the specified columns
# These features are either irrelevant for predicting price, redundant, or contain mostly null values.
columns_to_drop = ['tnImgPath', 'imagesURL', 'photos', 'id', 'listingCoAgent', 'listingAgent',
                   'listingAgentLicense', 'listingOfficePhone',
                   'mlsDbNumber', 'mlsSysid', 'mls', 'mlsNumber', 'openHouses',
                   'officeColistName', 'officeListName', 'officeListPhone', 'photoCount', 'price', 'priceSeo', 'propertyTypeValue', 'propertyTypeDisplayName'
                   'state', 'status', 'pageUrlWithoutDomain', 'houseRealStatus', 'listingOfficeDescription', 'hoafee', 'hoafeeRaw', 'dppInactiveOnActive',
                   'dppInactive', 'priceChanged', 'priceChangedDate', 'updatedTime', 'hiddenByComplianceRule', 'dateHidden', 'propertyId', 'visibility',
                   'permitAvm', 'modificationTimestamp', 'createdAt', 'propertyDateHidden', 'imageDownloaderStatus', 'onMarketDateTime', 'priceChangeAmount', 'thumbnail',
                   'photoCount1', 'virtualTourLink', 'fsa', 'dppurl', 'listingByMovoto', 'labelDisplayName', 'listingPriceFormat', 'comparableHomes',
                   'listDateLLFormat', 'listDateLLFormat', 'listDateFormat', 'listDateUTC', 'pricePerSqft', 'pricePerSqftRaw', 'pricePerSqftIntRaw', 'isFavorite',
                   'petiteImagePath', 'propertyTypeNameUrl', 'closePrice', 'closeDate', 'daysOnMovoto', 'cityCode', 'listDate',
                   'state', 'soldDate', 'isHotHome', 'isSold', 'isPriceReduced', 'label', 'labelclass', 'Amenities  Utilities_Other_Pets Allowed (YN)',
                   'Amenities  Utilities_Utility_Utilities', 'Exterior_Building_# Total Stories', 'Exterior_Building_Building Amenities',
                   'Exterior_Building_Building Amenities', 'Exterior_Other_Exterior Features', 'Exterior_Other_Fencing', 'Exterior_Building_Foundation',
                   'Exterior_Parking_# Garage Spaces', 'Exterior_Parking_Drive', 'Exterior_Parking_Garage Features', 'Exterior_Parking_Has Basement (YN)',
                   'Exterior_Parking_Has Garage (YN)', 'Exterior_Parking_Parking Desc', 'Exterior_Parking_Parking Spot #',
                   'Interior_Bathrooms_# Full Baths', 'Interior_Bathrooms_# Half Baths', 'Interior_Bathrooms_# Three-Quarter', 'Interior_Bathrooms_# Total Bathrooms',
                   'Interior_Bedrooms_Family Room Available', 'Interior_Flooring_Flooring', 'Interior_Interior_Appliances', 'Interior_Interior_Has Fireplace (YN)',
                   'Interior_Interior_Laundry Information', 'Interior_Other_Interior Features', 'Location_Community_Community', 'Location_Community_Community Features',
                   'Location_Community_County', 'Location_Location features_Area', 'Location_Location features_Subdivision', 'Location_Location features_View',
                   'Location_Location features_Water Body Name', 'Location_Location features_Water Body Type', 'Location_Location features_Water Source',
                   'Location_Location features_Zoning Description', 'Location_Other_Directions', 'Location_Schools_Elementary School', 'Location_Schools_High School',
                   'Location_Schools_Middle School', 'Location_Schools_School District', 'Lot Land Details_Lot Information_Exposure',
                   'Lot Land Details_Lot Information_FarmAgriculture', 'Lot Land Details_Lot Information_Lot Desc', 'Lot Land Details_Lot Information_Lot Size Units',
                   'Lot Land Details_Lot Information_Water Features', 'Lot Land Details_Lot Information_Water Frontage', 'Overview_Lot_Approx Lot Size (Range)',
                   'Overview_Other_Approx Age', 'Overview_Other_Is Gated Community (YN)', 'Overview_Other_Is Horse Property (YN)', 'Overview_Other_New Construction (YN)',
                   'Overview_Other_Year Built', 'Overview_Other_HOA', 'Overview_Property_Approx Square Feet (Range)', 'Overview_Property_MLS #',
                   'Overview_Property_Status', 'Overview_Property_Storage Unit (Locker)', 'Overview_Property_Virtual Tour', 'Overview_Taxes_Tax Year', 'Overview_Taxes_Taxes',
                   'Rooms_Rooms Information_Movotorooms', 'SOA_HOUSEKEEPING_ATTRS_LISTING_SOURCE_URL_Listing Source URL',
                   'SOA_HOUSEKEEPING_ATTRS_LISTING_TYPE_Listing Type Identifier', 'virtualLink', 'is3DTour', 'isPriceUp', 'priceChange', 'priceChangeFriendlyPrice',
                   'lastListPriceRaw', 'lastListPrice', 'pricePerAcre', 'pricePerAcreRaw', 'pricePerAcreIntRaw', 'subPremise', 'fullAddress', 'division', 'daysOnMovotoRaw',
                   'description', 'lotSizeRaw', 'sqftTotalRaw', 'neighborhoodN', 'numBathroomsRaw', 'numBedroomsRaw', 'priceRaw', 'propertyTypeName','propertyTypeDisplayName',
                   'yearBuiltRaw', 'totalMonthlyFee', 'neighborhoodNGeoId', 'isVOWListing', 'addressRaw', 'address2', 'lotSizeUnit', 'sqftTotalUnit',
                   'Amenities  Utilities_Utility_Sewer Septic', 'Amenities  Utilities_Heating  Cooling_Heat Type', 'Amenities  Utilities_Other_Has Pool (YN)',
                   'Amenities  Utilities_Utility_Utility_Sewer Septic', 'Exterior_Building_Construction Materials', 'Exterior_Building_Roof',
                   'Exterior_Other_Other Structures', 'Exterior_Parking_# Parking Spaces', 'Interior_Bathrooms_# Three-Quarter Baths',
                   'Interior_Bedrooms_# of Above Grade Bedrooms', 'Interior_Bedrooms_# of Below Grade Bedrooms', 'Interior_Bedrooms_# of Rooms',
                   'Interior_More rooms_# of Kitchens', 'Lot Land Details_Lot Information_Lot Dimensions', 'Overview_Lot_Lot Size (Acres)', 'Overview_Other_Maintenance Fee',
                   'Overview_Property_Basement Information', 'Overview_Property_Building Size ', 'Overview_Property_Property Sub Type',
                   'city', 'neighborhoodName', 'priceChangeRaw', 'address', 'zipCode', 'sqftTotal']
# Use errors='ignore' to avoid errors if a column doesn't exist, drop in-place
df.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')

# Save the DataFrame to a CSV file
df.to_csv('/content/drive/MyDrive/Colab Notebooks/3253/ontario_housing_cleaned2.csv', index=False)




In [44]:
## Distance to lake calculations and adding them into the dataframe
def distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d
shorepoints = [(43.337573, -79.769493), (43.325047, -79.792023), (43.346755, -79.758282), (43.352226, -79.751185),
              (43.362459, -79.737418), (43.366887, -79.729100), (43.385790, -79.712277), (43.399670, -79.700950),
              (43.419369, -79.683863), (43.451628, -79.654371), (43.467697, -79.640033), (43.486686, -79.617320),
              (43.517525, -79.601415), (43.538370, -79.594686), (43.562755, -79.564711), (43.576052, -79.543607),
              (43.594663, -79.503232), (43.625224, -79.478151), (43.630317, -79.433801), (43.632973, -79.407191),
              (43.272289, -79.919314), (43.276725, -79.860298), (43.271462, -79.833200), (43.251221, -79.757855),
          ]
distances = []
for id, row in df.iterrows():
    mindist =  distance((row["latitude"], row["longitude"]), shorepoints[0])
    for point in shorepoints:
        dist = distance((row["latitude"], row["longitude"]), point)
        if dist < mindist:
            mindist = dist
    distances.append(mindist)
df.insert(len(df.iloc[0]), "distance_to_lake", distances)


In [16]:
# Display the DataFrame
print(df.head(30).to_string())

     latitude  listPrice  longitude lotSize numBathrooms numBedrooms  parking   propertyType yearBuilt  garage Amenities  Utilities_Heating  Cooling_Cooling Amenities  Utilities_Heating  Cooling_Heat Source  distance_to_lake
0   43.497160     995000 -79.707940                    2           2      2.0  Single Family       ...     NaN                      Central air conditioning                                       Natural gas          6.383626
1   43.479693    1190000 -79.682036                    2           3      3.0  Single Family       ...     NaN                      Central air conditioning                                                            3.642386
2   43.494279    1450000 -79.742757                    5           5      2.0      TOWNHOUSE       ...     1.0                                   Central Air                                               Gas          8.565083
3   43.476904     579900 -79.732563                    1           1      1.0          Condo      20

In [None]:
## THIS CODE IS ONLY PRESENT TO SHOW HOW DATA WAS COLLECTED AND DOES NOT HAVE THE REQUIRED API KEY TO RUN
## Completing the dataset by adding missing entries of square footage using the description which includes square footage (usually) and using the AzureAI chatbot to analyze text and return the square footage or None.

# Connection to openai chatbot
deployment_name = "REDACTED"
client = AzureOpenAI(
    api_key= "REDACTED", ## I am not allowed to share this API key for legal reasons but it was used to message the Azure AI chatbot
    api_version="2024-02-01",
    azure_endpoint = "REDACTED"
    )

def message_chatbot(message, description):

    # Uses the connecetion made to make a request to the Azure openai chatbot
    response = client.chat.completions.create(
        model=deployment_name,
        temperature=0.7,
        max_tokens=400,

        # Both messages to be submitted
        messages=[
            {"role": "system", "content": message},
            {"role": "user", "content": description}
        ]
    )
    generated_text = response.choices[0].message.content

    return (generated_text)

message = "The following will be a description of a house, Please find the total square footage of the house and return it as a single number without any commas or measurements. Do not return any other text other than the number itself, without any math equations or explanations, just a number. If there is no square footage found return 'None'"
sqfootage = []
for id, row in df_cleaned.iterrows():
    if row["sqftTotalRaw"] == 0 or row["sqftTotalRaw"] == None:
        sqfootage.append(message_chatbot(message, row["description"]))
    else:
        sqfootage.append(row["sqftTotalRaw"])


In [45]:
## Because the above code does not work without the API (and I am not allowed to put it here), this is what it wouldve generated, sorry for the hardcoding :( to be fair this isn't part of the assignment and was extra work
## HARDCODING PRESENT DUE TO NOT HAVING THE API KEY
sqfootage = [
'1200',
'None', 'None', '600', '3993', '2816', 'None', '1350', '3180', '1075', '1824', '2980', 'None', '3262', '1000', '1191', '4487', 'None', 'None', '1108', 'None',
'2900', '5186', 'None', '1479', '4000', '3100', '950', 'None', '2500', '5246', 'None', 'None', '3214', '1400', 'None', '2000', '3537', 'None', '1800', '3284',
'1762', '4303', '4100', '1936', '1017', '2617', '9900', '1000', '79954', '613', '1950', 'None', '2500', 'None', '1800', '1088', '1911', '2159', '1700', '4367',
'5293', '1200', '1918', '986', '717', '2000', '1146', '6000', 'None', 'None', 'None', 'None', '3860', '6984', 'None', '3800', '715', 'None', '1990', '7476',
'None', '610', 'None', 'None', '711', '1351', '1617', 'None', 'None', 'None', 'None', '2000', 'None', 'None', 'None', 'None', '1161', 'None', '5000', '2595',
'None', '2702', 'None', 'None', 'None', '2018', '660', 'None', '6568', 'None', 'None', 'None', '1700', 'None', '3300', '3108', '2720', 'None', 'None', 'None',
'None', 'None', 'None', 'None', '1075', '2805', '4600', '3614', '2500', 'None', '1710', '1285', '4655', 'None', '2944', '4038', 'None', 'None', '3285', 'None',
'3800', '2150', 'None', '1900', 'None', 'None', '2000', 'None', '1535', '3124', 'None', '845', 'None', '3234', 'None', '1400', 'None', 'None', 'None', '995',
'3100', 'None', '3873', 'None', '3886', '3400', '1783', 'None', 'None', '1900', 'None', '1557', '1783', '2918', '1300', 'None', '946', '592', 'None', 'None',
'None', '715', 'None', '3680', '3610', 'None', 'None', '2857', '1850', '3262', 'None', 'None', '2440', '2077', 'None', '4000', '6127', '479', 'None', '2734',
'2791', 'None', 'None', '1462', 'None', 'None', 'None', '840', 'None', 'None', '6772', 'None', 'None', '4401', '3550', 'None', '756', 'None', '4423', '2250',
'None', 'None', 'None', '3718', 'None', '9907', 'None', 'None', 'None', '1250', 'None', 'None', 'None', 'None', 'None', '3081', '2953', 'None', '962', '1535',
'None', 'None', '1700', '6400', 'None', '1250', '7494', 'None', '500', 'None', '2159', 'None', 'None', 'None', 'None', '528', '2150', '2406', 'None', 'None',
'None', '3143', 'None', '2856', '1900', '2789', 'None', '2767', '3000', '4364', 'None', '3921', '4000', '4680', 'None', '6416', '3430', '8750', '1000', '7500',
'922', '1790', '2981', '1351', 'None', 'None', 'None', 'None', '3500', 'None', '4200', '3500', '2326', 'None', '3000', 'None', '1288', 'None', '1800', '1774',
'None', '2000', '2150', '1170', '1408', '1', 'None', '5000', 'None', '1050', 'None', '2700', '2720', '1000', '3200', 'None', 'None', '2820', 'None', 'None',
'None', '3200', '630', '3550', '1548', 'None', '1084', '8000', '1455', 'None', '6568', 'None', '5470', '2500', 'None', '3368', '820', 'None', 'None', '2041',
'735', '1762', '4303', '854', 'None', '3400', 'None', '855', 'None', 'None', 'None', 'None', 'None', 'None', '2946', '1079', '4400', '1408', 'None', '6424',
'None', 'None', 'None', '1990', 'None', 'None', '2210', '2593', '735', 'None', 'None', '4600', '730', '5922', '2000', 'None', '5883', 'None', '3000', '7400',
'None', '1943', '997', '1783', '1780', '2247', 'None', '3716', '2243', '6470', '854', '1413', 'None', 'None', 'None', '6220', 'None', '1411', '1428', '1258',
'1780', 'None', '1954', 'None', '700', '1400', '1200', '655', '2200', 'None', 'None', 'None', '2744', '3700', '1880', 'None', '6829', '1050', '1615', 'None',
'None', 'None', 'None', '774', '1577', 'None', 'None', 'None', 'None', '700', 'None', '5000', '714', '2300', '6100', 'None', 'None', '1264', '1119', 'None',
'5745', '4279', 'None', '3558', 'None', '4728', 'None', '1321', '4364', 'None', 'None', 'None', 'None', '2691', '1830', '4000', 'None', '972', '980', '3019',
'None', 'None', 'None', '7715', '586', '4482', '900', 'None', '3800', '4000', '3600', 'None', '4867', '1113', '943', '620', '1250', 'None', 'None', '2971',
'2816', '1200', '1630', 'None', 'None', 'None', '2900', 'None', '625', 'None', 'None', '3173', 'None', 'None', '3815', 'None', '2056', '1000', 'None', '3815',
'None', '3316', '5000', '4000', '972', 'None', 'None', 'None', '1948', '4200', 'None', 'None', 'None', '2065', 'None', 'None', '3653', '1188', '2321', 'None',
'1201', 'None', '5700', 'None', '4054', '2728', '2807', '2704', 'None', '1866', 'None', '1700', '1501', 'None', '3325', '3800', '2459', '933', '2026', '629',
'None', 'None', '2790', '4200', '3180', '650', '2479', '613', '3700', 'None', '1830', 'None', '3999', 'None', 'None', 'None', 'None', 'None', '3605', 'None',
'None', '4750', '1700', '1450', '3000', '916', 'None', '4597', 'None', 'None', 'None', 'None', 'None', 'None', '2000', 'None', 'None', 'None', '13400', '1700',
'None', '469', 'None', '9413', 'None', '3741', '830', '1290', 'None', 'None', 'None', '2677', 'None', 'None', 'None', 'None', 'None', '13400', 'None', 'None',
'5400', '2500', 'None', 'None', 'None', 'None', 'None', '2064', '3447', 'None', '830', 'None', '4000', 'None', '839', 'None', 'None', '3880', '1586', 'None',
'1650', 'None', 'None', 'None', 'None', '1800', 'None', 'None', 'None', '1100', '1300', 'None', 'None', 'None', 'None', 'None', '2768', 'None', 'None', 'None',
'1500', '1600', 'None', 'None', 'None', '827', 'None', 'None', 'None', 'None', 'None', 'None', '1480', 'None', 'None', 'None', 'None', '1186', '7700', '1400',
'None', 'None', '640', '988', 'None', 'None', 'None', '576', 'None', 'None', 'None', 'None', '1000', 'None', '1655', 'None', 'None', 'None', 'None', 'None',
'4000', 'None', 'None', '600', '3000', '9334', 'None', '1000', '1300', 'None', 'None', '2676', '1300', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', '1800', 'None', '784', '590', 'None', '3111', '608', 'None', 'None', 'None', 'None', 'None', '1405', '841', '1087', 'None', 'None', 'None', '1136',
'None', '1100', '1870', 'None', 'None', '2777', 'None', 'None', 'None', 'None', 'None', 'None', '400', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', '3000', 'None', 'None', 'None', '756', '821', '1347', 'None', 'None', '1150', 'None', 'None', '5500', 'None', 'None', '910', '1300', 'None', 'None',
'1655', 'None', 'None', '1350', '1575', '566', 'None', 'None', '800', 'None', '1100', '948', '1100', '1203', '1186', 'None', 'None', 'None', '1032', 'None',
'None', '9334', '4410', '1179', '965', 'None', 'None', 'None', 'None', 'None', '965', 'None', 'None', 'None', 'None', '2651', 'None', 'None', 'None', 'None',
'1275', 'None', 'None', 'None', 'None', 'None', 'None', '560', '691', 'None', '3500', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', '4300', 'None', 'None', '600', 'None', '1500', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '3000', 'None', 'None', 'None', 'None', 'None',
'None', 'None', '2924', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2500', '723', 'None', '4126', '630', 'None', 'None', 'None', 'None', '800',
'None', 'None', 'None', '821', 'None', 'None', '562', 'None', 'None', 'None', '1685', '846', '3000', 'None', '1933', 'None', 'None', 'None', 'None', '5000',
'4400', 'None', 'None', 'None', 'None', '1100', '4760', '2986', '836', 'None', 'None', 'None', 'None', 'None', 'None', '560', '3638', '1136', 'None', 'None',
'None', 'None', '4000', 'None', '910', 'None', '982', '3282', 'None', 'None', '700', 'None', 'None', 'None', 'None', 'None', '3063', '3631', '2300', 'None',
'None', 'None', '723', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1050', '576', 'None', 'None', '940', 'None', 'None',
'None', 'None', 'None', 'None', '1370', 'None', '5500', '2636', 'None', '1138', 'None', '800', 'None', 'None', '1138', 'None', 'None', 'None', '716', 'None',
'None', 'None', 'None', '3705', '1360', '3296', 'None', 'None', 'None', 'None', 'None', 'None', '540', 'None', '1700', 'None', 'None', 'None', 'None', '562',
'953', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1500', 'None', 'None', '703', '4000', 'None', '2360', 'None', 'None',
'2020', '1347', 'None', 'None', 'None', '1084', 'None', '1100', 'None', 'None', 'None', 'None', '580', '1174', 'None', 'None', '982', 'None', 'None', '728',
'None', '810', '795', 'None', 'None', '810', 'None', 'None', '1690', 'None', 'None', '1625', 'None', '6500', '2500', 'None', 'None', '784', 'None', '3073',
'None', 'None', 'None', 'None', '756', '1032', '2000', 'None', 'None', 'None', 'None', 'None', '2300', 'None', '700', 'None', '2215', '1179', 'None', 'None',
'None', '630', '2300', '1290', '1000', '3000', '799', 'None', 'None', 'None', '2064', 'None', 'None', '4400', 'None', 'None', '1343', 'None', 'None', 'None',
'None', 'None', 'None', '1150', '562', '566', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1200', '875', 'None', 'None', '948', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', '1041', '1655', 'None', 'None', 'None', 'None', 'None', '1300', 'None', 'None', 'None', 'None', 'None',
'2070', '562', 'None', 'None', 'None', '5000', 'None', 'None', '515', 'None', '2050', '2100', 'None', 'None', 'None', 'None', '1626', 'None', 'None', 'None',
'None', 'None', '846', 'None', 'None', 'None', '5000', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'3500', '4075', 'None', 'None', 'None', '1025', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '800', 'None', 'None', 'None', '563', '2000', '1483',
'None', 'None', 'None', 'None', '1405', 'None', 'None', 'None', '1100', 'None', '2197', 'None', 'None', '2677', 'None', '1338', 'None', '1897', 'None', 'None',
'1405', '2200', 'None', '830', 'None', '1483', 'None', 'None', 'None', '1157', 'None', 'None', '1369', 'None', 'None', '2622', '1000', '1010', 'None', '2194',
'None', 'None', '4000', 'None', 'None', '1549', 'None', '3000', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', '2100', 'None', 'None', 'None', 'None', 'None', '993', 'None', '785', '2309', 'None', '1132', '1447', 'None', 'None', 'None',
'None', 'None', 'None', '1416', 'None', 'None', 'None', '2600', 'None', 'None', 'None', '984', 'None', 'None', 'None', 'None', 'None', '1383', 'None', 'None',
'None', 'None', '1141', 'None', '0000', '3297', '1358', 'None', '3000', '3092', 'None', 'None', 'None', 'None', 'None', 'None', '1860', '1158', '1772', 'None',
'None', 'None', 'None', 'None', 'None', '1500', '1483', 'None', 'None', 'None', 'None', 'None', 'None', '1900', '1313', 'None', 'None', 'None', 'None', 'None',
'1300', 'None', '1441', '3092', '1000', '2500', 'None', 'None', '2350', '2651', 'None', 'None', 'None', 'None', '1589', 'None', '1736', 'None', 'None', 'None',
'2812', '1324', 'None', 'None', '4409', '1072', 'None', 'None', 'None', 'None', '3400', '2700', '2158', 'None', '1185', '2252', 'None', 'None', 'None', '1615',
'None', '879', '2615', 'None', '1978', 'None', '1000', '998', 'None', 'None', 'None', '1897', 'None', '1608', 'None', '1500', 'None', '700', 'None', 'None',
'None', 'None', '846', '1975', 'None', '2900', 'None', 'None', 'None', 'None', '2200', 'None', 'None', 'None', '2000', 'None', '2986', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1050', 'None', 'None', 'None', 'None', 'None', '1493', '2069', 'None', 'None', 'None', 'None',
'2821', 'None', 'None', 'None', '2000', '1657', 'None', 'None', 'None', 'None', 'None', 'None', '3619', '2472', '2525', 'None', 'None', 'None', '1500', 'None',
'1550', 'None', 'None', 'None', '2068', 'None', 'None', 'None', '633', 'None', '1758', 'None', '1897', 'None', '2000', 'None', 'None', 'None', '780', 'None',
'None', 'None', '1500', 'None', 'None', 'None', '2546', 'None', 'None', 'None', 'None', 'None', '1975', 'None', 'None', 'None', 'None', '4275', 'None', '2000',
'2615', 'None', 'None', 'None', 'None', '670', '1385', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '5000', 'None', '780', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', '605', 'None', 'None', '565', 'None', 'None', 'None', 'None', '1660', 'None', '1369', '1400', 'None', 'None', 'None',
'700', '2000', 'None', '3368', 'None', '3500', 'None', '1634', '1141', '2843', 'None', '1041', 'None', '2070', 'None', 'None', 'None', 'None', 'None', '2036',
'None', '5000', 'None', 'None', 'None', '2100', 'None', 'None', '3740', 'None', '1319', '846', 'None', 'None', 'None', '3000', 'None', 'None', 'None', 'None',
'None', '2500', 'None', '1900', '2252', 'None', 'None', 'None', 'None', 'None', '1582', '2240', '1897', '1235', 'None', '563', 'None', '1200', '3000', 'None',
'None', 'None', 'None', '1290', 'None', 'None', 'None', 'None', '650', 'None', '2500', '2300', '2800', 'None', 'None', 'None', 'None', 'None', '3000', 'None',
'None', 'None', '3000', '1453', 'None', '1800', 'None', 'None', '1100', '2400', '645', 'None', 'None', 'None', '2000', 'None', 'None', 'None', 'None', 'None',
'900', '1002', 'None', 'None', 'None', '1390', 'None', 'None', '1497', 'None', 'None', '1000', 'None', '1240', 'None', '789', 'None', 'None', 'None', '3000',
'None', 'None', 'None', 'None', '2688', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1259', 'None', 'None',
'3000', '885', '1100', 'None', 'None', 'None', 'None', 'None', 'None', '815', 'None', 'None', '756', 'None', '1150', 'None', 'None', '600', 'None', '1655',
'None', 'None', '3000', 'None', 'None', '3000', 'None', 'None', 'None', 'None', '800', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '9334',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '641', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '641', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1967',
'None', 'None', 'None', 'None', 'None', 'None', '5000', 'None', 'None', 'None', '2064', 'None', '1522', '800', 'None', 'None', 'None', 'None', 'None', '1259',
'None', 'None', 'None', 'None', 'None', '1951', 'None', 'None', '1100', 'None', 'None', '4750', '1002', 'None', 'None', 'None', 'None', 'None', 'None', '3700',
'4000', 'None', 'None', 'None', 'None', 'None', 'None', '1014', '885', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '765', 'None', 'None',
'None', 'None', 'None', '1600', 'None', 'None', 'None', 'None', 'None', '2500', 'None', 'None', 'None', 'None', '860', 'None', 'None', 'None', 'None', 'None',
'None', '5000', 'None', '800', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '562', '953', 'None', 'None', 'None', '1200', 'None', 'None', 'None',
'None', '980', 'None', 'None', 'None', 'None', 'None', '1347', 'None', 'None', 'None', 'None', '1100', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'443', 'None', '695', 'None', 'None', 'None', 'None', 'None', 'None', '1690', '6500', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '695', 'None',
'None', 'None', '443', '13577', 'None', '2300', '1290', 'None', 'None', 'None', '2400', 'None', '8000', 'None', '1400', 'None', '1240', 'None', 'None', 'None',
'1063', 'None', 'None', 'None', 'None', 'None', 'None', '562', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', '600', '4000', 'None', 'None', 'None', 'None', 'None', '2064', '4200', '3400', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', '1330', '1478', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1200', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', '2581', '720', '1950', 'None', 'None', '1214', 'None', 'None', 'None', '4280', '3500', 'None', '4000', 'None', 'None',
'4750', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2790', 'None', '1060', 'None', 'None', 'None', 'None',
'None', '2167', '3000', 'None', 'None', 'None', 'None', 'None', '1950', 'None', 'None', '1100', 'None', 'None', 'None', 'None', '1045', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', '700', 'None', 'None', 'None', 'None', '859', 'None', '4000', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'1060', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2525', 'None', 'None', 'None', 'None', 'None', 'None', '1804', 'None',
'2600', 'None', 'None', 'None', 'None', '3000', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1290', 'None',
'None', 'None', '5000', '2921', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '4541', '5000', 'None', '672', '1599', '3000', '4000',
'None', 'None', 'None', '1082', 'None', 'None', 'None', '800', 'None', 'None', 'None', 'None', 'None', '4000', 'None', 'None', 'None', '7166', 'None', 'None',
'None', 'None', 'None', 'None', 'None', '1043', 'None', '2000', 'None', 'None', 'None', '1585', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', '6000', 'None', 'None', 'None', '1682', 'None', 'None', 'None', '7000', 'None', '1214', 'None', 'None', 'None', '1600', '4000', 'None', 'None', '1060',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1100', 'None', 'None', 'None', '5400', 'None', 'None',
'2600', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1430', '5000', 'None', 'None', 'None', '2000', 'None', 'None', 'None', 'None', 'None',
'None', 'None', '4570', '854', '1024', 'None', '1599', 'None', 'None', 'None', '4500', 'None', '2500', 'None', '1800', 'None', 'None', 'None', 'None', '14863',
'None', 'None', 'None', 'None', 'None', 'None', '2600', 'None', 'None', '1261', 'None', '4000', 'None', '3074', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '4000', '966', '620', '1000', 'None', 'None', '2800', '3333',
'None', 'None', 'None', 'None', 'None', 'None', 'None', '765', 'None', 'None', 'None', '2644', 'None', 'None', 'None', '1024', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '3448', '1325', 'None', 'None', 'None', 'None', '1851', 'None', 'None', '1127',
'None', 'None', '1095', '827', '5000', 'None', '558', 'None', 'None', 'None', '1339', '4000', 'None', 'None', '1300', 'None', '1300', 'None', '1097', 'None',
'None', 'None', 'None', 'None', 'None', 'None', '827', 'None', 'None', '995', 'None', '4000', '641', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'720', 'None', '1800', 'None', 'None', 'None', 'None', 'None', 'None', '3800', 'None', '664', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',
'943', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '966', 'None', '1049', '1049', '1075', 'None', 'None', 'None', 'None', 'None', 'None', '713',
'1100', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '600', 'None', 'None', 'None', 'None', '1095', 'None', '760', 'None', 'None', '1043', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2650', 'None', '672', 'None', '2288', 'None', 'None', 'None', 'None', 'None',
'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '4000', 'None', '8400', '2000', 'None',
'2184', 'None', 'None', '2950', '1127', 'None', 'None', 'None', 'None', '1100', '1900', 'None', '1733', '1600', 'None', 'None', 'None', 'None', '713', 'None',
'None', 'None', '995', 'None', '1045', '760', 'None', 'None', 'None', 'None', '799', 'None', 'None', 'None', 'None', 'None', 'None', '600', 'None', 'None',
'700', 'None', 'None', 'None', 'None', 'None', '1200', '5500', '615', '1990', 'None', '2595', '700', '3000', '3101', '1028', 'None', 'None', '2199', 'None',
'2800', 'None', '2144', '984', '3330', '2038', '1846', 'None', '646', 'None', 'None', '1650', 'None', '1078', '2792', '4109', 'None', 'None', '1840', '1752',
'1116', '1850', 'None', 'None', '1114', 'None', '2208', 'None', 'None', '2913', 'None', '1154', '2169', 'None', 'None', 'None', '1029', '3787', '1800', 'None',
'3300', 'None', '1374', '2100', 'None', 'None', '4118', 'None', 'None', 'None', 'None', 'None', 'None', '3258', '1237', 'None', '2195', '3350', 'None', '1450',
'2265', '913', '1270', '1000', '784', '1300', '2471', '1064', 'None', '1617', '3496', '650', '1550', 'None', 'None', '1161', 'None', '1520', 'None', 'None',
'4268', 'None', '2285', '1107', '1630', '808', 'None', '1460', '1154', '750', '1154', '1300', '1880', '29952', 'None', '3242', '861', 'None', 'None', 'None',
'None', '1100', '1946', '1078', 'None', '961', '2700', '3261', 'None', 'None', 'None', 'None', '2901', '1600', '2948', 'None', 'None', '2147', '660', '1285',
'None', 'None', '2352', '1626', '2463', '500', 'None', 'None', '545', '1154', 'None', '1508', '1555', '3217', '1370', 'None', '1161', 'None', 'None', '1161',
'None', 'None', '9000', 'None', '549', 'None', 'None', '1217', 'None', '5957', '1372', '6000', '1749', 'None', 'None', '762', 'None', '545', '2666', '2253',
'682', '907', '2152', '1639', 'None', 'None', '1930', '1116', '1000', '1300', '3350', '966', '2351', '1848', '1070', 'None', 'None', 'None', '1260', '1229',
'856', '1445', 'None', '1740', 'None', 'None', '1956', '1116', 'None', 'None', 'None', '1994', '1614', '1154', 'None', 'None', '4909', 'None', '1886', '1776',
'None', 'None', '861', '2364', '1446', 'None', 'None', 'None', 'None', '500', '2223', '500', '1687', '762', '450', '1154', 'None', '2076', '1013', '1237',
'1161', '1734', '1146', '2000', '1080', '2703', '1542', '1876', '1803', 'None', '2756', 'None', '3878', 'None', 'None', '964', 'None', '2733', '2200', '500',
'None', '2575', 'None', '2024', '1779', '1665', '1665', 'None', '1278', 'None', '1015', 'None', 'None', '1950', '1506', '1402', 'None', '1018', '1055', '5957',
'None', 'None', '1838', '1886', '1714', '808', '1994', '2300', 'None', '3000', '2368', 'None', '2316', 'None', '1116', '3780', '1000', 'None', 'None', '500',
'1154', '2097', 'None', 'None', 'None', '2153', 'None', '2165', 'None', 'None', '689', 'None', 'None', '3900', 'None', '3100', '2296', '1154', '1500', '3516',
'1423', '3000', 'None', '961', 'None', 'None', '3744', '1510', '1770', '1900', '1800', 'None', '1158', '3340', '1044', '853', '1709', 'None', '1641', '1475',
'1116', '2374', '3345', '1094', '1154', 'None', 'None', '1380', 'None', '1288', '1260', '1549', 'None', '1273', 'None', '1925', '2500', '950', 'None', 'None',
'1975', '1005', '700', '920', '2733', '2162', 'None', '3395', 'None', 'None', '3300', '1799', '2845', '1466', '1660', 'None', 'None', '1736', '545', '2925',
'None', 'None', 'None', '2121', 'None', '1506', 'None', 'None', 'None', '1666', 'None', '8222', 'None', 'None', '1264', '1250', '1384', 'None', 'None', '2942',
'2200', 'None', '700', '4633', 'None', '3078', 'None', '2036', '1450', '2366', '3896', 'None', '1609', 'None', '900', '545', '1650', '2123', '2230', '2193',
'None', '1413', '2664', 'None', '5771', '1632', '1154', 'None', '1189', 'None', '527', '1519', '920', 'None', '1583', 'None', 'None', 'None', '2000', '2560',
'976', '913', 'None', 'None', '1436', 'None', '488', '1071', '1398', '545', '1501', '545', '2309', 'None', '1500', 'None', 'None', '1639', '2288', '2193',
'None', '6394', 'None', 'None', '3696', '545', 'None', 'None', '1300', 'None', '749', '1887', '1772', '1766', 'None', '1284', '3430', '1894', '2824', '2418',
'1922', 'None', '2133', '785', '1411', 'None', 'None', '1154', '2063', 'None', 'None', '1154', 'None', '2400', 'None', '1687', '1947', '1294', 'None', '1311',
'None', 'None', '2487', '2629', '650', '1146', 'None', 'None', '1926', 'None', '1387', 'None', 'None', 'None', '1750', '4687', '1654', '1730', '1786', 'None',
'None', 'None', 'None', '2708', '4220', '6000', 'None', 'None', 'None', '1200', 'None', '1500', '2440', '1402', 'None', '579', '1494', '1938', '941', '2250',
'2038', '1786', 'None', 'None', '1765', '1260', '1535', '1383', 'None', 'None', '2597', 'None', '629', '6912', 'None', '2643', 'None', 'None', 'None', 'None',
'2230', 'None', '897', '6000', 'None', 'None', '1230', '540', '1806', '1996', 'None', '500', 'None', '1562', '753', 'None', '798', 'None', '961', 'None',
'2400', '3018', '1325', '1149', 'None', 'None', '1154', '1154', '1000', '1550', 'None', 'None', '1505', '3800', '2577', 'None', 'None', 'None', '1700', '2100',
'2160', '1520', '1229', 'None', '625', '2470', 'None', '500', 'None', '10129', '2217', '2107', '1928', 'None', 'None', 'None', '2590', 'None', '1400', 'None',
'1959', '1294', '4000', '3892', 'None', '2040', '3600', '1220', 'None', '1400', '627', 'None', '1764', '487', '2132', 'None', '1314', '500', '500', '2947',
'2190', '3265', '1100', 'None', 'None', '990', 'None', '1649', 'None', '737', 'None', 'None', '1388', 'None', 'None', '1900', '1088', '1890', '917', '1166',
'1154', '2174', '1851', '1708', '2888', '1622', '2914', '2159', '765', '2181', 'None', '2147', '1154', 'None', 'None', '2094'
]
to_pop = [845, 1504, 1554, 1882, 1891, 1926, 1943] ## This had to be added due to inconsistencies in the datasets when I transfered over all the data. I'm not sure why it happened but 7 entries were unable to be processed using the code in this ipynb file to open the JSON file.
for pop in to_pop:
    sqfootage.pop(pop)
df.insert(len(df.iloc[0]), "square_footage", sqfootage)

In [46]:
df2 = df.copy()

#print(df2.head(30).to_string())


## Create Geohash Feature
#df2['geohash'] = df2.apply(lambda row: encode(row['latitude'], row['longitude'], precision=6), axis=1)
#df2.drop(columns=['latitude', 'longitude'], inplace=True, errors='ignore')

## Drop Rows with Missing 'listPrice', and show how many were dropped
print(f"Number of rows with missing 'listPrice': {df2['listPrice'].isna().sum()}")
df2.dropna(subset=['listPrice'], inplace=True)

# Drop outliers that will add noise to the model, not going to predict for outliers
# Drop propertyType in 'FARM', 'Land', 'Other', 'Multi Family'
df2 = df2[~df2['propertyType'].isin(['FARM', 'Land', 'Other', 'Multi_Family'])]
# Drop garage greater than 4
###df2 = df2[df2['garage'] <= 4]

# Fix column names
df2.columns = df2.columns.str.replace(r' +', '_', regex=True)

# Categorical variables levels cleanup
df2['Amenities_Utilities_Heating_Cooling_Cooling'] = df2['Amenities_Utilities_Heating_Cooling_Cooling'].str.replace(r'Ductless.*|.*Central .*', 'Y', regex=True)
df2['Amenities_Utilities_Heating_Cooling_Cooling'] = df2['Amenities_Utilities_Heating_Cooling_Cooling'].str.replace(r'^(?!.*Y).*', 'N', regex=True)
df2['Amenities_Utilities_Heating_Cooling_Heat_Source'] = df2['Amenities_Utilities_Heating_Cooling_Heat_Source'].str.replace(r'Electric.*', 'Electric', regex=True)
df2['Amenities_Utilities_Heating_Cooling_Heat_Source'] = df2['Amenities_Utilities_Heating_Cooling_Heat_Source'].str.replace(r'Natural gas.*', 'Gas', regex=True)
df2['Amenities_Utilities_Heating_Cooling_Heat_Source'] = df2['Amenities_Utilities_Heating_Cooling_Heat_Source'].str.replace(r'^(?!Electric|Gas).*', 'Other', regex=True)


## 1.6. One-Hot Encode Categorical Features
# One-hot encode categorical features: 'city', 'neighborhoodName', 'propertyType', 'parking', 'garage', 'Amenities  Utilities_Heating  Cooling_Cooling', 'Amenities  Utilities_Heating  Cooling_Heat Type'
categorical_features = ['propertyType', 'Amenities_Utilities_Heating_Cooling_Cooling', 'Amenities_Utilities_Heating_Cooling_Heat_Source']
df2 = pd.get_dummies(df2, columns=categorical_features, drop_first=True)*1

# Fix column names (because of values and their one-hot encoding)
df2.columns = df2.columns.str.replace(r' +', '_', regex=True)




## Impute Missing Values

# Impute missing year built with the median year built for the property type
#df2['yearBuilt'] = df2.groupby('propertyType')['yearBuilt'].transform(lambda x: x.fillna(x.median()))
# replace all non-numeric values in column df2[['yearBuilt']] with Nan
df2['yearBuilt'] = pd.to_numeric(df2['yearBuilt'], errors='coerce')
df2['numBedrooms'] = pd.to_numeric(df2['numBedrooms'], errors='coerce')
df2['numBathrooms'] = pd.to_numeric(df2['numBathrooms'], errors='coerce')
df2['lotSize'] = pd.to_numeric(df2['lotSize'], errors='coerce')
df2['square_footage'] = pd.to_numeric(df2['square_footage'], errors='coerce')

scaler = MinMaxScaler()
numerical_features = ['yearBuilt', 'latitude', 'longitude', 'numBedrooms', 'numBathrooms', 'lotSize', 'parking', 'square_footage', 'garage']
df2[numerical_features] = scaler.fit_transform(df2[numerical_features])

# Impute yearBuilt using KNNImputer
imputer = KNNImputer(n_neighbors=5)
# Scale yearBuilt for KNNImputer
df2['yearBuilt'] = imputer.fit_transform(df2[['yearBuilt', 'propertyType_Single_Family', 'propertyType_TOWNHOUSE', 'latitude', 'longitude']])

# Impute missing lot size based on median lot size for similar properties (same property type, city, and number of bedrooms)
#df2['lotSize'] = df2.groupby(['propertyType', 'numBedrooms', 'numBathrooms'])['lotSize'].transform(lambda x: x.fillna(x.median()))
# Impute lotSize based on numBedrooms and numBathrooms using KNNImputer

#imputer = KNNImputer(n_neighbors=5)
df2['lotSize'] = imputer.fit_transform(df2[['lotSize', 'numBedrooms', 'numBathrooms', 'propertyType_Single_Family', 'propertyType_TOWNHOUSE', 'parking']])

# Similar imputation strategy as 'lotSize'
#df2['square_footage'] = df2.groupby(['propertyType', 'numBedrooms', 'numBathrooms'])['square_footage'].transform(lambda x: x.fillna(x.median()))

#imputer = KNNImputer(n_neighbors=5)
df2['square_footage'] = imputer.fit_transform(df2[['square_footage', 'numBedrooms', 'numBathrooms', 'propertyType_Single_Family', 'propertyType_TOWNHOUSE', 'parking']])

#imputer = KNNImputer(n_neighbors=5)
df2['garage'] = imputer.fit_transform(df2[['garage', 'numBedrooms', 'numBathrooms', 'propertyType_Single_Family', 'propertyType_TOWNHOUSE', 'parking']])

#imputer = KNNImputer(n_neighbors=5)
df2['distance_to_lake'] = imputer.fit_transform(df2[['square_footage', 'numBedrooms', 'numBathrooms', 'propertyType_Single_Family', 'propertyType_TOWNHOUSE', 'parking']])

# drop all rows that have NaN in any column
columns_to_check = ['garage', 'parking', 'numBedrooms', 'numBathrooms', 'square_footage']
df2.dropna(subset=columns_to_check, inplace=True)

#print(df2[[]].isna().sum())
print(df2.head(30).to_string())
#print(df2.to_string())




Number of rows with missing 'listPrice': 0
    latitude  listPrice  longitude   lotSize  numBathrooms  numBedrooms   parking  yearBuilt    garage  distance_to_lake  square_footage  propertyType_Multi_Family  propertyType_Single_Family  propertyType_TOWNHOUSE  Amenities_Utilities_Heating_Cooling_Cooling_Y  Amenities_Utilities_Heating_Cooling_Heat_Source_Gas  Amenities_Utilities_Heating_Cooling_Heat_Source_Other
0   0.984358     995000   0.010590  0.007382      0.028571     0.022222  0.045455   0.928000  0.026316          0.040064        0.040064                          0                           1                       0                                              1                                                    1                                                      0
1   0.983963    1190000   0.010911  0.007382      0.028571     0.044444  0.068182   0.869333  0.026316          0.046995        0.046995                          0                           1                       0

In [47]:

# ## 1.7. Handle Outliers
# # Identify outliers in 'listPrice' based on IQR and remove them
# Q1 = df2['listPrice'].quantile(0.25)
# Q3 = df2['listPrice'].quantile(0.75)
# IQR = Q3 - Q1
# upper_bound = Q3 + 1.5 * IQR
# lower_bound = Q1 - 1.5 * IQR
# df2 = df2[(df2['listPrice'] >= lower_bound) & (df2['listPrice'] <= upper_bound)]

# 2. Feature Engineering

## 2.1. Combine Bathrooms and Bedrooms
# Create a new feature representing the total number of rooms (bedrooms + bathrooms)
#df2['totalRooms'] = df2['numBedrooms'] + df2['numBathrooms']

# 3. Split Data into Training and Testing Sets
X = df2.drop('listPrice', axis=1)
y = df2['listPrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Model Selection and Training
# Try different models: Linear Regression, Ridge, Lasso, ElasticNet, Decision Tree, Random Forest, Gradient Boosting
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# 5. Hyperparameter Tuning
# Use GridSearchCV or RandomizedSearchCV to find the best hyperparameters for each model
param_grids = {
    'Ridge': {'alpha': np.logspace(-3, 3, 7)},
    'Lasso': {'alpha': np.logspace(-3, 3, 7)},
    'ElasticNet': {'alpha': np.logspace(-3, 3, 7), 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]},
    'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Random Forest': {'n_estimators': [100, 200, 500], 'max_depth': [None, 5, 10, 30]},
    'Gradient Boosting': {'n_estimators': [100, 200, 500], 'learning_rate': [0.01, 0.1, 1]}
}

best_models = {}
for name, model in models.items():
    if name in param_grids:
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_models[name] = model

# 6. Model Evaluation
# Evaluate the best models using metrics like MSE, R-squared, MAE, RMSE
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f'Model: {name}')
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'R-squared: {r2_score(y_test, y_pred)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print('---')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Model: Linear Regression
MSE: 360984210831.97687
R-squared: 0.5382041449232218
MAE: 318846.9139129704
RMSE: 600819.6158848152
---
Model: Ridge
MSE: 361117559007.5886
R-squared: 0.5380335567563941
MAE: 318462.3690486643
RMSE: 600930.5775275449
---
Model: Lasso
MSE: 361007036596.21954
R-squared: 0.5381749446341235
MAE: 318795.06755325524
RMSE: 600838.6111063599
---
Model: ElasticNet
MSE: 361007036596.21954
R-squared: 0.5381749446341235
MAE: 318795.06755325524
RMSE: 600838.6111063599
---
Model: Decision Tree
MSE: 321833265109.8047
R-squared: 0.5882887301053987
MAE: 264060.7491090316
RMSE: 567303.5035232946
---
Model: Random Forest
MSE: 163867143514.01767
R-squared: 0.7903698683007838
MAE: 195897.30471856656
RMSE: 404805.06853795407
---
Model: Gradient Boosting
MSE: 140674210959.14993
R-squared: 0.8200398643824087
MAE: 205034.9004252538
RMSE: 375065.6088728343
---


In [48]:
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f'Model: {name}')
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'R-squared: {r2_score(y_test, y_pred)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'%Error: { abs(sum(y_test) - sum(y_pred))/sum(y_test) * 100 }%')
    print('---')

Model: Linear Regression
MSE: 360984210831.97687
R-squared: 0.5382041449232218
MAE: 318846.9139129704
RMSE: 600819.6158848152
%Error: 2.083551431030492%
---
Model: Ridge
MSE: 361117559007.5886
R-squared: 0.5380335567563941
MAE: 318462.3690486643
RMSE: 600930.5775275449
%Error: 2.085230854572631%
---
Model: Lasso
MSE: 361007036596.21954
R-squared: 0.5381749446341235
MAE: 318795.06755325524
RMSE: 600838.6111063599
%Error: 2.0837174558447416%
---
Model: ElasticNet
MSE: 361007036596.21954
R-squared: 0.5381749446341235
MAE: 318795.06755325524
RMSE: 600838.6111063599
%Error: 2.0837174558447416%
---
Model: Decision Tree
MSE: 321833265109.8047
R-squared: 0.5882887301053987
MAE: 264060.7491090316
RMSE: 567303.5035232946
%Error: 3.3720597146635645%
---
Model: Random Forest
MSE: 163867143514.01767
R-squared: 0.7903698683007838
MAE: 195897.30471856656
RMSE: 404805.06853795407
%Error: 1.9385095302746285%
---
Model: Gradient Boosting
MSE: 140674210959.14993
R-squared: 0.8200398643824087
MAE: 205034.