This notebook should include preliminary and baseline modeling.
- Try as many different models as possible.
- Don't worry about hyperparameter tuning or cross validation here.
- Ideas include:
    - linear regression
    - support vector machines
    - random forest
    - xgboost

In [167]:
# import models and fit
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

#Loading data
test_data = pd.read_csv('/Users/elizaclapasmac/Desktop/Repositories/data_project_midterm/notebooks/test_data.csv') 
train_data = pd.read_csv('/Users/elizaclapasmac/Desktop/Repositories/data_project_midterm/notebooks/train_data.csv')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print('Test data shape', test_data.shape)
print('Train data shape', train_data.shape)

test_data.head()

Test data shape (20677, 306)
Train data shape (48244, 306)


Unnamed: 0,last_update_date,status,list_price,property_id,community,listing_id,matterport,source.agents,source.type,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,description.type,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,products.brand_name,other_listings.rdc,location.address.postal_code,location.address.coordinate.lat,location.address.state_code,location.address.line,location.county.fips_code,location.county.name,primary_photo,source,location.address.coordinate,other_listings,branding.name,branding.type,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin
0,2023-12-18T14:08:48Z,sold,525000.0,5989887000.0,False,2960242000.0,False,"[{'office_name': None}, {'office_name': None}]",mls,1986.0,2023-12-18,518444.0,2.0,8159.0,1280.0,2.0,2.0,1.0,2.0,condos,True,False,False,essentials,"[{'listing_id': '2960242026', 'listing_key': N...",21403.0,38.960498,MD,7004 Channel Village Ct Apt 101,24003.0,Anne Arundel,True,True,True,True,Long & Foster Bethesda All Points,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2023-10-11T13:05:30Z,sold,235000.0,9052323000.0,False,2960437000.0,False,"[{'office_name': None}, {'office_name': None}]",mls,2018.0,2023-10-10,227500.0,2.0,5227.0,1342.0,2.0,1.0,1.0,2.0,single_family,True,False,False,basic_opt_in,"[{'listing_id': '2960437288', 'listing_key': N...",29045.0,34.110015,SC,1415 Ardennes Dr,45079.0,Richland,True,True,True,True,Non-MLS,Office,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,2023-12-01T14:28:33Z,sold,345000.0,2276326000.0,False,2961095000.0,False,"[{'office_name': None}, {'office_name': None}]",mls,1936.0,2023-11-30,345000.0,1.0,1363.0,850.0,1.0,2.0,1.0,2.0,condos,True,False,False,basic_opt_in,"[{'listing_id': '2960289858', 'listing_key': N...",80207.0,39.760841,CO,3035 Ash St,8031.0,Denver,True,True,True,True,Modern Design Realty Group LLC,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2024-01-14T15:48:55Z,sold,299000.0,5834621000.0,False,2962272000.0,False,"[{'office_name': None}, {'office_name': None}]",mls,2006.0,2024-01-12,299000.0,2.0,6970.0,1706.0,2.0,2.0,1.0,3.0,single_family,True,False,False,essentials,"[{'listing_id': '2962271828', 'listing_key': N...",32311.0,32.348511,FL,5560 Hampton Hill Cir,12073.0,Leon,True,True,True,True,The Brokerage,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2024-01-15T13:43:42Z,sold,360000.0,3489076000.0,False,2960891000.0,False,"[{'office_name': None}, {'office_name': None}]",mls,1984.0,2024-01-11,325000.0,1.0,8159.0,1445.0,2.0,1.0,2.0,2.0,condos,True,False,False,essentials,"[{'listing_id': '2960891228', 'listing_key': N...",2908.0,41.875847,RI,25 Venturi Grn Unit B,44007.0,Providence,True,True,True,True,Homesmart Professionals Re,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [168]:
# Preparing Train Data to input in models by converting columns to appropriate data types

# >>>Last Update Date<<<
# Convert 'last_update_date' column to datetime
train_data['last_update_date'] = pd.to_datetime(train_data['last_update_date'], errors='coerce')

# Convert all values to a consistent format
train_data['last_update_date'] = train_data['last_update_date'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')

# Convert 'last_update_date' column to datetime again to handle the new format
train_data['last_update_date'] = pd.to_datetime(train_data['last_update_date'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

# Convert 'last_update_date' column to Unix epoch time (seconds since 1970-01-01)
train_data['last_update_date'] = (train_data['last_update_date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# >>>Sold Date<<<
# Convert 'description.sold_date' column to datetime
train_data['description.sold_date'] = pd.to_datetime(train_data['description.sold_date'])

# Convert to Unix epoch time
train_data['description.sold_date'] = (train_data['description.sold_date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# >>>Year Built, Sold Price, Baths Full, Lot SqFt, SqFt, Garage, Stories, Beds, and County Fips Code to numeric<<<
columns_to_convert = ['description.year_built', 'description.sold_price', 'description.baths_full', 'description.sqft', 'description.garage', 'description.stories', 'description.beds', 'location.county.fips_code']
train_data[columns_to_convert] = train_data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

print('Train Data Shape with NaN', train_data.shape)
train_data = train_data.dropna()
print('Train Data Shape with no NaN', train_data.shape)

Train Data Shape with NaN (48244, 306)
Train Data Shape with no NaN (48055, 306)


In [169]:
# Preparing Test Data to input in models by converting columns to appropriate data types

# >>>Last Update Date<<<
test_data['last_update_date'] = pd.to_datetime(test_data['last_update_date'], errors='coerce')

# Convert all values to a consistent format
test_data['last_update_date'] = test_data['last_update_date'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')

# Convert 'last_update_date' column to datetime again to handle the new format
test_data['last_update_date'] = pd.to_datetime(test_data['last_update_date'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

# Convert 'last_update_date' column to Unix epoch time (seconds since 1970-01-01)
test_data['last_update_date'] = (test_data['last_update_date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# >>>Sold Date<<<
# Convert 'description.sold_date' column to datetime
test_data['description.sold_date'] = pd.to_datetime(test_data['description.sold_date'])

# Convert to Unix epoch time
test_data['description.sold_date'] = (test_data['description.sold_date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# >>>Year Built, Sold Price, Baths Full, Lot SqFt, SqFt, Garage, Stories, Beds, and County Fips Code to numeric<<<
columns_to_convert = ['description.year_built', 'description.sold_price', 'description.baths_full', 'description.sqft', 'description.garage', 'description.stories', 'description.beds', 'location.county.fips_code']
test_data[columns_to_convert] = test_data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

print('Test Data Shape with NaN', test_data.shape)
test_data = test_data.dropna()
print('Test Data Shape with no NaN', test_data.shape)


Test Data Shape with NaN (20677, 306)
Test Data Shape with no NaN (20599, 306)


In [170]:
X_train.head()

Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,location.address.postal_code,location.address.coordinate.lat,location.county.fips_code,primary_photo,source,location.address.coordinate,other_listings,branding.type,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded,description.type_apartment,description.type_condo_townhome_rowhome_coop,description.type_condos,description.type_duplex_triplex,description.type_land,description.type_mobile,description.type_multi_family,description.type_single_family,description.type_townhomes,products.brand_name_advantage_brand,products.brand_name_advantage_pro,products.brand_name_basic_opt_in,products.brand_name_essentials,state_code_enc,county_name_enc
0,1703622000.0,1244000.0,2591419000.0,False,2961534000.0,True,1977.0,1703548800,3.0,37462.0,2903.0,3.0,2.0,1.0,3.0,True,False,False,85028.0,33.564398,4013.0,True,True,True,True,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1195000.0,False,False,False,False,False,False,False,True,False,False,False,False,True,2.0,33.0
1,1705453000.0,1075000.0,4291892000.0,False,2959884000.0,False,1910.0,1705363200,1.0,8159.0,1528.0,2.0,2.0,2.0,3.0,True,False,False,2908.0,42.379463,25025.0,True,True,True,True,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,950000.0,False,False,True,False,False,False,False,False,False,False,False,True,False,13.0,49.0
2,1702920000.0,245000.0,9438858000.0,False,2960056000.0,False,2018.0,1702857600,2.0,2000.0,1220.0,2.0,2.0,2.0,3.0,True,False,False,19901.0,39.160328,10001.0,True,True,True,True,Office,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,306317.8,False,False,False,False,False,False,False,False,True,False,False,True,False,6.0,29.0
3,1705335000.0,225000.0,5790648000.0,False,2962364000.0,False,1994.0,1705017600,2.0,6098.0,1054.0,2.0,2.0,1.0,3.0,True,False,False,32311.0,32.348511,12073.0,True,True,True,True,Office,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,324381.1,False,False,False,False,False,False,False,True,False,False,False,False,True,7.0,31.0
4,1702114000.0,211000.0,3958063000.0,False,2961431000.0,False,1973.0,1701993600,2.0,1363.0,1509.0,2.0,2.0,1.0,4.0,True,False,False,40601.0,38.176858,21073.0,True,True,True,True,Office,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,202282.3,False,False,False,False,False,False,False,True,False,False,False,False,True,12.0,19.0


In [133]:
#X_train and y_train for models only taking into account numerical values 

X_train = train_data.drop(columns = ['description.sold_price', 'source.agents', 'other_listings.rdc', 'status', 'source.type', 'description.type', 'products.brand_name', 'location.address.state_code', 'location.address.line', 'location.county.name', 'branding.name', 'branding.type'])
print(X_train.shape)

y_train = train_data['description.sold_price']
print(y_train.shape)


(48055, 294)
(48055,)


In [134]:
#X_test and y_test for models, only taking into account numerical values 

X_test = test_data.drop(columns = ['description.sold_price', 'source.agents', 'other_listings.rdc', 'status', 'source.type', 'description.type', 'products.brand_name', 'location.address.state_code', 'location.address.line', 'location.county.name', 'branding.name', 'branding.type'])
print(X_test.shape)

y_test = test_data['description.sold_price']
print(y_test.shape)

(20599, 294)
(20599,)


In [135]:
# Converting y_train and y_test into Series
y_train = y_train.squeeze()
y_test = y_test.squeeze()

## Linear Regression

In [136]:
# Initialize model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred_lr = lr_model.predict(X_test)

## Support Vector Machines

In [137]:
# Initialize model
svr_model = SVR()

# Train the model
svr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svr = svr_model.predict(X_test)

## Random Forest

In [138]:
# Initialize the model
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

## XGBoost

In [139]:
# Initialize the model
xgb_model = xgb.XGBRegressor(random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

Consider what metrics you want to use to evaluate success.
- If you think about mean squared error, can we actually relate to the amount of error?
- Try root mean squared error so that error is closer to the original units (dollars)
- What does RMSE do to outliers?
- Is mean absolute error a good metric for this problem?
- What about R^2? Adjusted R^2?
- Briefly describe your reasons for picking the metrics you use

In [140]:
# gather evaluation metrics and compare results
import warnings
warnings.filterwarnings("ignore")

# Evaluate Linear Regression Model
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = mean_squared_error(y_test, y_pred_lr, squared=False)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Metrics:")
print(f"MSE: {mse_lr}")
print(f"RMSE: {rmse_lr}")
print(f"MAE: {mae_lr}")
print(f"R²: {r2_lr}")

# Evaluate Support Vector Machines Model
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = mean_squared_error(y_test, y_pred_svr, squared=False)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("\n\nSupport Vector Machines Metrics:")
print(f"MSE: {mse_svr}")
print(f"RMSE: {rmse_svr}")
print(f"MAE: {mae_svr}")
print(f"R²: {r2_svr}")

# Evaluate Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\n\nRandom Forest Metrics:")
print(f"MSE: {mse_rf}")
print(f"RMSE: {rmse_rf}")
print(f"MAE: {mae_rf}")
print(f"R²: {r2_rf}")

# Evaluate XGBoost model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("\n\nXGBoost Metrics:")
print(f"MSE: {mse_xgb}")
print(f"RMSE: {rmse_xgb}")
print(f"MAE: {mae_xgb}")
print(f"R²: {r2_xgb}")

Linear Regression Metrics:
MSE: 516699870.6840744
RMSE: 22731.033207579334
MAE: 12361.240595746758
R²: 0.9930525185582318


Support Vector Machines Metrics:
MSE: 79089145396.9458
RMSE: 281227.92428374855
MAE: 196740.40908176132
R²: -0.06342269674486256


Random Forest Metrics:
MSE: 22175.685892829762
RMSE: 148.91502910327677
MAE: 4.680855381329197
R²: 0.9999997018285179


XGBoost Metrics:
MSE: 2463183.2132457453
RMSE: 1569.4531573913716
MAE: 1066.407821298971
R²: 0.9999668803484718


In [155]:
metrics = {
    'Model': ['Linear Regression', 'Support Vector Machines', 'Random Forest', 'XGBoost'],
    'MSE': [mse_lr, mse_svr, mse_rf, mse_xgb],
    'RMSE': [rmse_lr, rmse_svr, rmse_rf, rmse_xgb],
    'MAE': [mae_lr, mae_svr, mae_rf, mae_xgb],
    'R²': [r2_lr, r2_svr, r2_rf, r2_xgb]
}

metrics_df = pd.DataFrame(metrics)
metrics_df


Unnamed: 0,Model,MSE,RMSE,MAE,R²
0,Linear Regression,516699900.0,22731.033208,12361.240596,0.993053
1,Support Vector Machines,79089150000.0,281227.924284,196740.409082,-0.063423
2,Random Forest,22175.69,148.915029,4.680855,1.0
3,XGBoost,2463183.0,1569.453157,1066.407821,0.999967


## Trying models again considering all columns (categorical and numerical). OneHotEncoder and Target Encoding will be used to convert the categorical columns.

In [171]:
# Dropping columns that are not relevant for the models
train_data = train_data.drop(columns = ['source.agents', 'other_listings.rdc', 'status', 'source.type', 'location.address.line', 'branding.type'])
test_data = test_data.drop(columns = ['source.agents', 'other_listings.rdc', 'status', 'source.type', 'location.address.line', 'branding.type'])

In [172]:
train_data.head()   

Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,description.type,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,products.brand_name,location.address.postal_code,location.address.coordinate.lat,location.address.state_code,location.county.fips_code,location.county.name,primary_photo,source,location.address.coordinate,other_listings,branding.name,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin
0,1703622000.0,1244000.0,2591419000.0,False,2961534000.0,True,1977.0,1703548800,1195000.0,3.0,37462.0,2903.0,3.0,2.0,1.0,3.0,single_family,True,False,False,essentials,85028.0,33.564398,AZ,4013.0,Maricopa,True,True,True,True,Axis Property Advisors,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1705453000.0,1075000.0,4291892000.0,False,2959884000.0,False,1910.0,1705363200,950000.0,1.0,8159.0,1528.0,2.0,2.0,2.0,3.0,condos,True,False,False,basic_opt_in,2908.0,42.379463,MA,25025.0,Suffolk,True,True,True,True,Mgs Group Real Estate,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1702920000.0,245000.0,9438858000.0,False,2960056000.0,False,2018.0,1702857600,250000.0,2.0,2000.0,1220.0,2.0,2.0,2.0,3.0,townhomes,True,False,False,basic_opt_in,19901.0,39.160328,DE,10001.0,Kent,True,True,True,True,Iron Valley Real Estate at The Beach,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1705335000.0,225000.0,5790648000.0,False,2962364000.0,False,1994.0,1705017600,211000.0,2.0,6098.0,1054.0,2.0,2.0,1.0,3.0,single_family,True,False,False,essentials,32311.0,32.348511,FL,12073.0,Leon,True,True,True,True,Keller Williams Town & Country,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1702114000.0,211000.0,3958063000.0,False,2961431000.0,False,1973.0,1701993600,205500.0,2.0,1363.0,1509.0,2.0,2.0,1.0,4.0,single_family,True,False,False,essentials,40601.0,38.176858,KY,21073.0,Franklin,True,True,True,True,Plum Tree Realty,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [173]:
# Checking how many unique values there are in each of the categorical columns to decide which method is appropriate for encoding.

categorical_columns = ['description.type', 'products.brand_name', 'location.address.state_code', 'location.county.name', 'branding.name']

def counts(columns):
    for category in columns:
        #print(f'\n{category} value count {train_data[category].value_counts()}')
        print(f'Unique {category} values sum', train_data[category].nunique())
     
counts(categorical_columns)

Unique description.type values sum 9
Unique products.brand_name values sum 4
Unique location.address.state_code values sum 34
Unique location.county.name values sum 53
Unique branding.name values sum 826


### Target Encoding for column Branding Name

In [174]:
# Calculate mean target encoding
mean_encoding = train_data.groupby('branding.name')['description.sold_price'].mean()

# Map the mean values to the branding.name column
train_data['branding.name_encoded'] = train_data['branding.name'].map(mean_encoding)

# Fill missing values with the overall mean
train_data['branding.name_encoded'].fillna(train_data['description.sold_price'].mean(), inplace=True)

# Drop the original 'branding.name'
train_data.drop('branding.name', axis=1, inplace=True)


In [175]:
# Calculate mean target encoding
mean_encoding = test_data.groupby('branding.name')['description.sold_price'].mean()

# Map the mean values to the branding.name column
test_data['branding.name_encoded'] = test_data['branding.name'].map(mean_encoding)

# Fill missing values with the overall mean
test_data['branding.name_encoded'].fillna(test_data['description.sold_price'].mean(), inplace=True)

# Drop the original 'branding.name'
test_data.drop('branding.name', axis=1, inplace=True)


In [176]:
train_data.head()

Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,description.type,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,products.brand_name,location.address.postal_code,location.address.coordinate.lat,location.address.state_code,location.county.fips_code,location.county.name,primary_photo,source,location.address.coordinate,other_listings,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded
0,1703622000.0,1244000.0,2591419000.0,False,2961534000.0,True,1977.0,1703548800,1195000.0,3.0,37462.0,2903.0,3.0,2.0,1.0,3.0,single_family,True,False,False,essentials,85028.0,33.564398,AZ,4013.0,Maricopa,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1195000.0
1,1705453000.0,1075000.0,4291892000.0,False,2959884000.0,False,1910.0,1705363200,950000.0,1.0,8159.0,1528.0,2.0,2.0,2.0,3.0,condos,True,False,False,basic_opt_in,2908.0,42.379463,MA,25025.0,Suffolk,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,950000.0
2,1702920000.0,245000.0,9438858000.0,False,2960056000.0,False,2018.0,1702857600,250000.0,2.0,2000.0,1220.0,2.0,2.0,2.0,3.0,townhomes,True,False,False,basic_opt_in,19901.0,39.160328,DE,10001.0,Kent,True,True,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,306317.8
3,1705335000.0,225000.0,5790648000.0,False,2962364000.0,False,1994.0,1705017600,211000.0,2.0,6098.0,1054.0,2.0,2.0,1.0,3.0,single_family,True,False,False,essentials,32311.0,32.348511,FL,12073.0,Leon,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,324381.1
4,1702114000.0,211000.0,3958063000.0,False,2961431000.0,False,1973.0,1701993600,205500.0,2.0,1363.0,1509.0,2.0,2.0,1.0,4.0,single_family,True,False,False,essentials,40601.0,38.176858,KY,21073.0,Franklin,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,202282.3


### One Hot Encoding for Description Type

In [177]:
# Perform OHE for 'description.type' and 'products.brand_name'
train_data_encoded = pd.get_dummies(train_data, columns=['description.type', 'products.brand_name'])

# Display the encoded DataFrame
train_data_encoded.head()


Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,location.address.postal_code,location.address.coordinate.lat,location.address.state_code,location.county.fips_code,location.county.name,primary_photo,source,location.address.coordinate,other_listings,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded,description.type_apartment,description.type_condo_townhome_rowhome_coop,description.type_condos,description.type_duplex_triplex,description.type_land,description.type_mobile,description.type_multi_family,description.type_single_family,description.type_townhomes,products.brand_name_advantage_brand,products.brand_name_advantage_pro,products.brand_name_basic_opt_in,products.brand_name_essentials
0,1703622000.0,1244000.0,2591419000.0,False,2961534000.0,True,1977.0,1703548800,1195000.0,3.0,37462.0,2903.0,3.0,2.0,1.0,3.0,True,False,False,85028.0,33.564398,AZ,4013.0,Maricopa,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1195000.0,False,False,False,False,False,False,False,True,False,False,False,False,True
1,1705453000.0,1075000.0,4291892000.0,False,2959884000.0,False,1910.0,1705363200,950000.0,1.0,8159.0,1528.0,2.0,2.0,2.0,3.0,True,False,False,2908.0,42.379463,MA,25025.0,Suffolk,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,950000.0,False,False,True,False,False,False,False,False,False,False,False,True,False
2,1702920000.0,245000.0,9438858000.0,False,2960056000.0,False,2018.0,1702857600,250000.0,2.0,2000.0,1220.0,2.0,2.0,2.0,3.0,True,False,False,19901.0,39.160328,DE,10001.0,Kent,True,True,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,306317.8,False,False,False,False,False,False,False,False,True,False,False,True,False
3,1705335000.0,225000.0,5790648000.0,False,2962364000.0,False,1994.0,1705017600,211000.0,2.0,6098.0,1054.0,2.0,2.0,1.0,3.0,True,False,False,32311.0,32.348511,FL,12073.0,Leon,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,324381.1,False,False,False,False,False,False,False,True,False,False,False,False,True
4,1702114000.0,211000.0,3958063000.0,False,2961431000.0,False,1973.0,1701993600,205500.0,2.0,1363.0,1509.0,2.0,2.0,1.0,4.0,True,False,False,40601.0,38.176858,KY,21073.0,Franklin,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,202282.3,False,False,False,False,False,False,False,True,False,False,False,False,True


In [178]:
# Perform OHE for 'description.type' and 'products.brand_name'
test_data_encoded = pd.get_dummies(test_data, columns=['description.type', 'products.brand_name'])

# Display the encoded DataFrame
test_data_encoded.head()


Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,location.address.postal_code,location.address.coordinate.lat,location.address.state_code,location.county.fips_code,location.county.name,primary_photo,source,location.address.coordinate,other_listings,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded,description.type_apartment,description.type_condo_townhome_rowhome_coop,description.type_condos,description.type_duplex_triplex,description.type_land,description.type_mobile,description.type_multi_family,description.type_single_family,description.type_townhomes,products.brand_name_advantage_brand,products.brand_name_advantage_pro,products.brand_name_basic_opt_in,products.brand_name_essentials
0,1702909000.0,525000.0,5989887000.0,False,2960242000.0,False,1986.0,1702857600,518444.0,2.0,8159.0,1280.0,2.0,2.0,1.0,2.0,True,False,False,21403.0,38.960498,MD,24003.0,Anne Arundel,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,518444.0,False,False,True,False,False,False,False,False,False,False,False,False,True
1,1697030000.0,235000.0,9052323000.0,False,2960437000.0,False,2018.0,1696896000,227500.0,2.0,5227.0,1342.0,2.0,1.0,1.0,2.0,True,False,False,29045.0,34.110015,SC,45079.0,Richland,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,227500.0,False,False,False,False,False,False,False,True,False,False,False,True,False
2,1701441000.0,345000.0,2276326000.0,False,2961095000.0,False,1936.0,1701302400,345000.0,1.0,1363.0,850.0,1.0,2.0,1.0,2.0,True,False,False,80207.0,39.760841,CO,8031.0,Denver,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,345000.0,False,False,True,False,False,False,False,False,False,False,False,True,False
3,1705247000.0,299000.0,5834621000.0,False,2962272000.0,False,2006.0,1705017600,299000.0,2.0,6970.0,1706.0,2.0,2.0,1.0,3.0,True,False,False,32311.0,32.348511,FL,12073.0,Leon,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,313181.818182,False,False,False,False,False,False,False,True,False,False,False,False,True
4,1705326000.0,360000.0,3489076000.0,False,2960891000.0,False,1984.0,1704931200,325000.0,1.0,8159.0,1445.0,2.0,1.0,2.0,2.0,True,False,False,2908.0,41.875847,RI,44007.0,Providence,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,465702.702703,False,False,True,False,False,False,False,False,False,False,False,False,True


### Brand Name columns and Ordinal Encoded for State Code and County Name

In [179]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Perform Ordinal Encoding for 'location.address.state_code' and 'location.county.name'
train_data_encoded[['state_code_enc', 'county_name_enc']] = encoder.fit_transform(train_data_encoded[['location.address.state_code', 'location.county.name']])

# Drop the original columns after encoding
train_data_encoded.drop(columns=['location.address.state_code', 'location.county.name'], inplace=True)

# Display the encoded DataFrame
train_data_encoded.head()



Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,location.address.postal_code,location.address.coordinate.lat,location.county.fips_code,primary_photo,source,location.address.coordinate,other_listings,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded,description.type_apartment,description.type_condo_townhome_rowhome_coop,description.type_condos,description.type_duplex_triplex,description.type_land,description.type_mobile,description.type_multi_family,description.type_single_family,description.type_townhomes,products.brand_name_advantage_brand,products.brand_name_advantage_pro,products.brand_name_basic_opt_in,products.brand_name_essentials,state_code_enc,county_name_enc
0,1703622000.0,1244000.0,2591419000.0,False,2961534000.0,True,1977.0,1703548800,1195000.0,3.0,37462.0,2903.0,3.0,2.0,1.0,3.0,True,False,False,85028.0,33.564398,4013.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1195000.0,False,False,False,False,False,False,False,True,False,False,False,False,True,2.0,33.0
1,1705453000.0,1075000.0,4291892000.0,False,2959884000.0,False,1910.0,1705363200,950000.0,1.0,8159.0,1528.0,2.0,2.0,2.0,3.0,True,False,False,2908.0,42.379463,25025.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,950000.0,False,False,True,False,False,False,False,False,False,False,False,True,False,13.0,49.0
2,1702920000.0,245000.0,9438858000.0,False,2960056000.0,False,2018.0,1702857600,250000.0,2.0,2000.0,1220.0,2.0,2.0,2.0,3.0,True,False,False,19901.0,39.160328,10001.0,True,True,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,306317.8,False,False,False,False,False,False,False,False,True,False,False,True,False,6.0,29.0
3,1705335000.0,225000.0,5790648000.0,False,2962364000.0,False,1994.0,1705017600,211000.0,2.0,6098.0,1054.0,2.0,2.0,1.0,3.0,True,False,False,32311.0,32.348511,12073.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,324381.1,False,False,False,False,False,False,False,True,False,False,False,False,True,7.0,31.0
4,1702114000.0,211000.0,3958063000.0,False,2961431000.0,False,1973.0,1701993600,205500.0,2.0,1363.0,1509.0,2.0,2.0,1.0,4.0,True,False,False,40601.0,38.176858,21073.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,202282.3,False,False,False,False,False,False,False,True,False,False,False,False,True,12.0,19.0


In [180]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Perform Ordinal Encoding for 'location.address.state_code' and 'location.county.name'
test_data_encoded[['state_code_enc', 'county_name_enc']] = encoder.fit_transform(test_data_encoded[['location.address.state_code', 'location.county.name']])

# Drop the original columns after encoding
test_data_encoded.drop(columns=['location.address.state_code', 'location.county.name'], inplace=True)

# Display the encoded DataFrame
test_data_encoded.head()



Unnamed: 0,last_update_date,list_price,property_id,community,listing_id,matterport,description.year_built,description.sold_date,description.sold_price,description.baths_full,description.lot_sqft,description.sqft,description.baths,description.garage,description.stories,description.beds,lead_attributes.show_contact_an_agent,flags.is_price_reduced,flags.is_foreclosure,location.address.postal_code,location.address.coordinate.lat,location.county.fips_code,primary_photo,source,location.address.coordinate,other_listings,tags_baseball,tags_basement,tags_basketball,tags_basketball_court,tags_beach,tags_beautiful_backyard,tags_big_bathroom,tags_big_lot,tags_big_yard,tags_boat_dock,tags_carport,tags_cathedral_ceiling,tags_central_air,tags_central_heat,tags_city_view,tags_clubhouse,tags_coffer_ceiling,tags_community_boat_facilities,tags_community_center,tags_community_clubhouse,tags_community_elevator,tags_community_golf,tags_community_gym,tags_community_horse_facilities,tags_community_outdoor_space,tags_community_park,tags_community_security_features,tags_community_spa_or_hot_tub,tags_community_swimming_pool,tags_community_tennis_court,tags_corner_lot,tags_courtyard_entry,tags_cul_de_sac,tags_den_or_office,tags_detached_guest_house,tags_dining_room,tags_disability_features,tags_dishwasher,tags_dual_master_bedroom,tags_efficient,tags_elevator,tags_energy_efficient,tags_ensuite,tags_exposed_brick,tags_family_room,tags_farm,tags_fenced_courtyard,tags_fenced_yard,tags_fireplace,tags_first_floor_master_bedroom,tags_fixer_upper,tags_floor_plan,tags_forced_air,tags_front_porch,tags_fruit_trees,tags_furniture,tags_game_room,tags_garage_1_or_more,tags_garage_2_or_more,tags_garage_3_or_more,tags_gated_community,tags_golf_course,tags_golf_course_lot_or_frontage,tags_golf_course_view,tags_gourmet_kitchen,tags_granite_kitchen,tags_greenbelt,tags_greenhouse,tags_groundscare,tags_guest_house,tags_guest_parking,tags_handicap_access,tags_hardwood_floors,tags_high_ceiling,tags_hill_or_mountain_view,tags_hoa,tags_horse_facilities,tags_indoor_basketball_court,tags_investment_opportunity,tags_jack_and_jill_bathroom,tags_kitchen_island,tags_lake,tags_lake_view,tags_large_kitchen,tags_large_porch,tags_laundry_room,tags_library,tags_low_hoa,tags_maintenance,tags_marina,tags_master_bathroom,tags_master_bedroom,tags_master_suite,tags_media_room,tags_medicalcare,tags_modern_kitchen,tags_mountain_view,tags_new_roof,tags_no_hoa,tags_ocean_view,tags_open_floor_plan,tags_open_house,tags_open_kitchen,tags_outbuilding,tags_outdoor_kitchen,tags_park,tags_pets_allowed,tags_playground,tags_pond,tags_private_backyard,tags_private_bathroom,tags_private_courtyard,tags_private_parking,tags_ranch,tags_recreation_facilities,tags_rental_property,tags_river_access,tags_river_view,tags_rv_or_boat_parking,tags_rv_parking,tags_screen_porch,tags_security,tags_senior_community,tags_shopping,tags_single_story,tags_smart_homes,tags_soccer,tags_solar_panels,tags_solar_system,tags_spa_or_hot_tub,tags_storm_shelter,tags_swimming_pool,tags_tennis,tags_tennis_court,tags_theater_room,tags_trails,tags_two_kitchen,tags_two_master_suites,tags_two_or_more_stories,tags_updated_kitchen,tags_vaulted_ceiling,tags_view,tags_views,tags_volleyball,tags_washer_dryer,tags_water_view,tags_waterfront,tags_well_water,tags_white_kitchen,tags_wine_cellar,tags_wooded_land,tags_wrap_around_porch,location.address.city_Albany,location.address.city_Alexander,location.address.city_Ampthill,location.address.city_Annapolis,location.address.city_Antelope,location.address.city_Antioch,location.address.city_Arden Hills,location.address.city_Atlanta,location.address.city_Boone,location.address.city_Boston,location.address.city_Canal Winchester,location.address.city_Carson City,location.address.city_Cave Creek,location.address.city_Charleston,location.address.city_Charlton Heights,location.address.city_Columbia,location.address.city_Columbus,location.address.city_Concord,location.address.city_Cross Lanes,location.address.city_Denver,location.address.city_Des Moines,location.address.city_Dover,location.address.city_Dublin,location.address.city_Edmond,location.address.city_Elgin,location.address.city_Elk Grove,location.address.city_Frankfort,location.address.city_Galloway,location.address.city_Glendale,location.address.city_Grove City,location.address.city_Guilderland,location.address.city_Hamilton,location.address.city_Hapeville,location.address.city_Harrisburg,location.address.city_Hartford,location.address.city_Henrico,location.address.city_Hermitage,location.address.city_Highland Springs,location.address.city_Honolulu,location.address.city_Indianapolis,location.address.city_Joelton,location.address.city_Lansing,location.address.city_Lawrence,location.address.city_Lawrenceville,location.address.city_Lincoln,location.address.city_Little Canada,location.address.city_Little Rock,location.address.city_Loudonville,location.address.city_Mabelvale,location.address.city_Madison,location.address.city_McFarland,location.address.city_Menands,location.address.city_Midwest City,location.address.city_Montgomery,location.address.city_Mustang,location.address.city_Nashville,location.address.city_North Providence,location.address.city_Oklahoma City,location.address.city_Old Hickory,location.address.city_Olympia,location.address.city_Pegram,location.address.city_Phoenix,location.address.city_Piedmont,location.address.city_Providence,location.address.city_Raleigh,location.address.city_Richmond,location.address.city_Robbinsville,location.address.city_Sacramento,location.address.city_Saint Paul,location.address.city_Salem,location.address.city_Sandy Springs,location.address.city_Shoreview,location.address.city_South Charleston,location.address.city_Spencer,location.address.city_Springfield,location.address.city_Tallahassee,location.address.city_Tolleson,location.address.city_Trenton,location.address.city_Tumwater,location.address.city_Verona,location.address.city_Westerville,location.address.city_Yukon,location.address.state_Alabama,location.address.state_Arizona,location.address.state_Arkansas,location.address.state_California,location.address.state_Colorado,location.address.state_Connecticut,location.address.state_Delaware,location.address.state_Florida,location.address.state_Georgia,location.address.state_Hawaii,location.address.state_Illinois,location.address.state_Indiana,location.address.state_Iowa,location.address.state_Kentucky,location.address.state_Maryland,location.address.state_Massachusetts,location.address.state_Michigan,location.address.state_Minnesota,location.address.state_Nebraska,location.address.state_Nevada,location.address.state_New Hampshire,location.address.state_New Jersey,location.address.state_New York,location.address.state_North Carolina,location.address.state_Ohio,location.address.state_Oklahoma,location.address.state_Oregon,location.address.state_Pennsylvania,location.address.state_Rhode Island,location.address.state_South Carolina,location.address.state_Tennessee,location.address.state_Virginia,location.address.state_Washington,location.address.state_West Virginia,location.address.state_Wisconsin,branding.name_encoded,description.type_apartment,description.type_condo_townhome_rowhome_coop,description.type_condos,description.type_duplex_triplex,description.type_land,description.type_mobile,description.type_multi_family,description.type_single_family,description.type_townhomes,products.brand_name_advantage_brand,products.brand_name_advantage_pro,products.brand_name_basic_opt_in,products.brand_name_essentials,state_code_enc,county_name_enc
0,1702909000.0,525000.0,5989887000.0,False,2960242000.0,False,1986.0,1702857600,518444.0,2.0,8159.0,1280.0,2.0,2.0,1.0,2.0,True,False,False,21403.0,38.960498,24003.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,518444.0,False,False,True,False,False,False,False,False,False,False,False,False,True,14.0,3.0
1,1697030000.0,235000.0,9052323000.0,False,2960437000.0,False,2018.0,1696896000,227500.0,2.0,5227.0,1342.0,2.0,1.0,1.0,2.0,True,False,False,29045.0,34.110015,45079.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,227500.0,False,False,False,False,False,False,False,True,False,False,False,True,False,28.0,43.0
2,1701441000.0,345000.0,2276326000.0,False,2961095000.0,False,1936.0,1701302400,345000.0,1.0,1363.0,850.0,1.0,2.0,1.0,2.0,True,False,False,80207.0,39.760841,8031.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,345000.0,False,False,True,False,False,False,False,False,False,False,False,True,False,4.0,16.0
3,1705247000.0,299000.0,5834621000.0,False,2962272000.0,False,2006.0,1705017600,299000.0,2.0,6970.0,1706.0,2.0,2.0,1.0,3.0,True,False,False,32311.0,32.348511,12073.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,313181.818182,False,False,False,False,False,False,False,True,False,False,False,False,True,7.0,31.0
4,1705326000.0,360000.0,3489076000.0,False,2960891000.0,False,1984.0,1704931200,325000.0,1.0,8159.0,1445.0,2.0,1.0,2.0,2.0,True,False,False,2908.0,41.875847,44007.0,True,True,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,465702.702703,False,False,True,False,False,False,False,False,False,False,False,False,True,27.0,40.0


#### Splitting into X_train, y_train, X_test and y_test

In [181]:
X_train = train_data_encoded.drop(columns='description.sold_price')
y_train = train_data_encoded['description.sold_price']

X_test = test_data_encoded.drop(columns='description.sold_price')
y_test = test_data_encoded['description.sold_price']

# Converting y_train and y_test into Series
y_train = y_train.squeeze()
y_test = y_test.squeeze()

### Trying all models (Linear Regression, Support Vector Machines, Random Forest and XGBoost) with all columns (categorical and numerical)

In [182]:
# Linear Regression
# Initialize model
lr_model_all = LinearRegression()

# Train the model
lr_model_all.fit(X_train, y_train)

# Predict on test set
y_pred_lr_all = lr_model_all.predict(X_test)

# Support Vector Machines
# Initialize model
svr_model_all = SVR()

# Train the model
svr_model_all.fit(X_train, y_train)

# Predict on the test set
y_pred_svr_all = svr_model_all.predict(X_test)

# Random Forest
# Initialize the model
rf_model_all = RandomForestRegressor(random_state=42)

# Train the model
rf_model_all.fit(X_train, y_train)

# Predict on the test set
y_pred_rf_all = rf_model_all.predict(X_test)

# XGBoost
# Initialize the model
xgb_model_all = xgb.XGBRegressor(random_state=42)

# Train the model
xgb_model_all.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb_all = xgb_model_all.predict(X_test)

In [183]:
# Evaluate Linear Regression Model
mse_lr_all = mean_squared_error(y_test, y_pred_lr_all)
rmse_lr_all = mean_squared_error(y_test, y_pred_lr_all, squared=False)
mae_lr_all = mean_absolute_error(y_test, y_pred_lr_all)
r2_lr_all = r2_score(y_test, y_pred_lr_all)

print("Linear Regression Metrics with all columns:")
print(f"MSE: {mse_lr_all}")
print(f"RMSE: {rmse_lr_all}")
print(f"MAE: {mae_lr_all}")
print(f"R²: {r2_lr_all}")

# Evaluate Support Vector Machines Model
mse_svr_all = mean_squared_error(y_test, y_pred_svr_all)
rmse_svr_all = mean_squared_error(y_test, y_pred_svr_all, squared=False)
mae_svr_all = mean_absolute_error(y_test, y_pred_svr_all)
r2_svr_all = r2_score(y_test, y_pred_svr_all)

print("\n\nSupport Vector Machines Metrics with all colums:")
print(f"MSE: {mse_svr_all}")
print(f"RMSE: {rmse_svr_all}")
print(f"MAE: {mae_svr_all}")
print(f"R²: {r2_svr_all}")

# Evaluate Random Forest model
mse_rf_all = mean_squared_error(y_test, y_pred_rf_all)
rmse_rf_all = mean_squared_error(y_test, y_pred_rf_all, squared=False)
mae_rf_all = mean_absolute_error(y_test, y_pred_rf_all)
r2_rf_all = r2_score(y_test, y_pred_rf_all)

print("\n\nRandom Forest Metrics with all columns:")
print(f"MSE: {mse_rf_all}")
print(f"RMSE: {rmse_rf_all}")
print(f"MAE: {mae_rf_all}")
print(f"R²: {r2_rf_all}")

# Evaluate XGBoost model
mse_xgb_all = mean_squared_error(y_test, y_pred_xgb_all)
rmse_xgb_all = mean_squared_error(y_test, y_pred_xgb_all, squared=False)
mae_xgb_all = mean_absolute_error(y_test, y_pred_xgb_all)
r2_xgb_all = r2_score(y_test, y_pred_xgb_all)

print("\n\nXGBoost Metrics with all columns:")
print(f"MSE: {mse_xgb_all}")
print(f"RMSE: {rmse_xgb_all}")
print(f"MAE: {mae_xgb_all}")
print(f"R²: {r2_xgb_all}")

Linear Regression Metrics with all columns:
MSE: 501994675.1467998
RMSE: 22405.237672178348
MAE: 12312.011893124785
R²: 0.9932502427669829


Support Vector Machines Metrics with all colums:
MSE: 79089224915.65018
RMSE: 281228.0656613955
MAE: 196740.4969980516
R²: -0.06342376594335519


Random Forest Metrics with all columns:
MSE: 9571153.351139225
RMSE: 3093.7280667730356
MAE: 481.52063740958295
R²: 0.9998713074764364


XGBoost Metrics with all columns:
MSE: 10739504.370750105
RMSE: 3277.118302831026
MAE: 1424.8510311483567
R²: 0.9998555979756473


In [189]:
from IPython.display import display

metrics_all = {
    'Model All': ['Linear Regression All', 'Support Vector Machines All', 'Random Forest All', 'XGBoost All'],
    'MSE All': [mse_lr_all, mse_svr_all, mse_rf_all, mse_xgb_all],
    'RMSE All': [rmse_lr_all, rmse_svr_all, rmse_rf_all, rmse_xgb_all],
    'MAE All': [mae_lr_all, mae_svr_all, mae_rf_all, mae_xgb_all],
    'R² All': [r2_lr_all, r2_svr_all, r2_rf_all, r2_xgb_all]
}

metrics_all_df = pd.DataFrame(metrics_all)

display(metrics_df, metrics_all_df)



Unnamed: 0,Model,MSE,RMSE,MAE,R²
0,Linear Regression,516699900.0,22731.033208,12361.240596,0.993053
1,Support Vector Machines,79089150000.0,281227.924284,196740.409082,-0.063423
2,Random Forest,22175.69,148.915029,4.680855,1.0
3,XGBoost,2463183.0,1569.453157,1066.407821,0.999967


Unnamed: 0,Model All,MSE All,RMSE All,MAE All,R² All
0,Linear Regression All,501994700.0,22405.237672,12312.011893,0.99325
1,Support Vector Machines All,79089220000.0,281228.065661,196740.496998,-0.063424
2,Random Forest All,9571153.0,3093.728067,481.520637,0.999871
3,XGBoost All,10739500.0,3277.118303,1424.851031,0.999856


**STRETCH**

Even with all the preprocessing we did in Notebook 1, you probably still have a lot of features. Are they all important for prediction?

Investigate some feature selection algorithms (Lasso, RFE, Forward/Backward Selection)
- Perform feature selection to get a reduced subset of your original features
- Refit your models with this reduced dimensionality - how does performance change on your chosen metrics?
- Based on this, should you include feature selection in your final pipeline? Explain

Remember, feature selection often doesn't directly improve performance, but if performance remains the same, a simpler model is often preferrable. 



## Feature Selection

In [194]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Lasso for feature selection
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
selected_features = X_train.columns[lasso.coef_ != 0]


# RFE for feature selection with specific number of features
rfe = RFE(RandomForestRegressor(), n_features_to_select=5)  
rfe.fit(X_train_scaled, y_train)
selected_features_rfe = X_train.columns[rfe.support_]

# # RFECV for feature selection with cross-validation
# rfe = RFECV(estimator=RandomForestRegressor(), step=1, cv=5, scoring='neg_mean_squared_error')
# rfe.fit(X_train_scaled, y_train)

# selected_features_rfe = X_train.columns[rfe.support_]
# print("Optimal number of features:", rfe.n_features_)
# print("Selected features by RFECV:", selected_features_rfe)


### Refit Models

In [197]:
import numpy as np
# Refit models with selected features using Lasso
# Ensure `selected_features` is a boolean array
selected_features = np.array(lasso.coef_ != 0)
X_train_selected = X_train_scaled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

# Linear Regression
# Initialize model
lr_model_lasso = LinearRegression()

# Train the model
lr_model_lasso.fit(X_train_selected, y_train)

# Predict on test set
y_pred_lr_lasso = lr_model_lasso.predict(X_test_selected)

# Support Vector Machines
# Initialize model
svr_model_lasso = SVR()

# Train the model
svr_model_lasso.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_svr_lasso = svr_model_lasso.predict(X_test_selected)

# Random Forest
# Initialize the model
rf_model_lasso = RandomForestRegressor(random_state=42)

# Train the model
rf_model_lasso.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_rf_lasso = rf_model_lasso.predict(X_test_selected)

# XGBoost
# Initialize the model
xgb_model_lasso = xgb.XGBRegressor(random_state=42)

# Train the model
xgb_model_lasso.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_xgb_lasso = xgb_model_lasso.predict(X_test_selected)

In [198]:
# Evaluate Linear Regression Model
mse_lr_lasso = mean_squared_error(y_test, y_pred_lr_lasso)
rmse_lr_lasso = mean_squared_error(y_test, y_pred_lr_lasso, squared=False)
mae_lr_lasso = mean_absolute_error(y_test, y_pred_lr_lasso)
r2_lr_lasso = r2_score(y_test, y_pred_lr_lasso)

print("Linear Regression Metrics with Lasso Regularization:")
print(f"MSE: {mse_lr_lasso}")
print(f"RMSE: {rmse_lr_lasso}")
print(f"MAE: {mae_lr_lasso}")
print(f"R²: {r2_lr_lasso}")

# Evaluate Support Vector Machines Model
mse_svr_lasso = mean_squared_error(y_test, y_pred_svr_lasso)
rmse_svr_lasso = mean_squared_error(y_test, y_pred_svr_lasso, squared=False)
mae_svr_lasso = mean_absolute_error(y_test, y_pred_svr_lasso)
r2_svr_lasso = r2_score(y_test, y_pred_svr_lasso)

print("\n\nSupport Vector Machines Metrics with Lasso Regularization:")
print(f"MSE: {mse_svr_lasso}")
print(f"RMSE: {rmse_svr_lasso}")
print(f"MAE: {mae_svr_lasso}")
print(f"R²: {r2_svr_lasso}")

# Evaluate Random Forest model
mse_rf_lasso = mean_squared_error(y_test, y_pred_rf_lasso)
rmse_rf_lasso = mean_squared_error(y_test, y_pred_rf_lasso, squared=False)
mae_rf_lasso = mean_absolute_error(y_test, y_pred_rf_lasso)
r2_rf_lasso = r2_score(y_test, y_pred_rf_lasso)

print("\n\nRandom Forest Metrics with Lasso Regularization:")
print(f"MSE: {mse_rf_lasso}")
print(f"RMSE: {rmse_rf_lasso}")
print(f"MAE: {mae_rf_lasso}")
print(f"R²: {r2_rf_lasso}")

# Evaluate XGBoost model
mse_xgb_lasso = mean_squared_error(y_test, y_pred_xgb_lasso)
rmse_xgb_lasso = mean_squared_error(y_test, y_pred_xgb_lasso, squared=False)
mae_xgb_lasso = mean_absolute_error(y_test, y_pred_xgb_lasso)
r2_xgb_lasso = r2_score(y_test, y_pred_xgb_lasso)

print("\n\nXGBoost Metrics with Lasso Regularization:")
print(f"MSE: {mse_xgb_lasso}")
print(f"RMSE: {rmse_xgb_lasso}")
print(f"MAE: {mae_xgb_lasso}")
print(f"R²: {r2_xgb_lasso}")

Linear Regression Metrics with Lasso Regularization:
MSE: 502294753.34415656
RMSE: 22411.93327993274
MAE: 12346.503114047387
R²: 0.993246207953302


Support Vector Machines Metrics with Lasso Regularization:
MSE: 79075692452.53055
RMSE: 281204.00504354585
MAE: 196711.01995041044
R²: -0.06324181014711994


Random Forest Metrics with Lasso Regularization:
MSE: 9864777.483899461
RMSE: 3140.824331907065
MAE: 481.96070683042865
R²: 0.9998673594432957


XGBoost Metrics with Lasso Regularization:
MSE: 11856252.843583621
RMSE: 3443.2909902567953
MAE: 1484.5223539370843
R²: 0.9998405823162088


In [201]:
# Refit models with selected features using Recursive Feature Elimination (RFE)
# Ensure `selected_features` is a boolean array
selected_features_rfe = np.array(rfe.support_)
X_train_selected_rfe = X_train_scaled[:, selected_features_rfe]
X_test_selected_rfe = X_test_scaled[:, selected_features_rfe]

# Linear Regression
# Initialize model
lr_model_rfe = LinearRegression()

# Train the model
lr_model_rfe.fit(X_train_selected_rfe, y_train)

# Predict on test set
y_pred_lr_rfe = lr_model_rfe.predict(X_test_selected_rfe)

# Support Vector Machines
# Initialize model
svr_model_rfe = SVR()

# Train the model
svr_model_rfe.fit(X_train_selected_rfe, y_train)

# Predict on the test set
y_pred_svr_rfe = svr_model_rfe.predict(X_test_selected_rfe)

# Random Forest
# Initialize the model
rf_model_rfe = RandomForestRegressor(random_state=42)

# Train the model
rf_model_rfe.fit(X_train_selected_rfe, y_train)

# Predict on the test set
y_pred_rf_rfe = rf_model_rfe.predict(X_test_selected_rfe)

# XGBoost
# Initialize the model
xgb_model_rfe = xgb.XGBRegressor(random_state=42)

# Train the model
xgb_model_rfe.fit(X_train_selected_rfe, y_train)

# Predict on the test set
y_pred_xgb_rfe = xgb_model_rfe.predict(X_test_selected_rfe)

In [202]:
# Evaluate Linear Regression Model
mse_lr_rfe = mean_squared_error(y_test, y_pred_lr_rfe)
rmse_lr_rfe = mean_squared_error(y_test, y_pred_lr_rfe, squared=False)
mae_lr_rfe = mean_absolute_error(y_test, y_pred_lr_rfe)
r2_lr_rfe = r2_score(y_test, y_pred_lr_rfe)

print("Linear Regression Metrics with RFE Regularization:")
print(f"MSE: {mse_lr_rfe}")
print(f"RMSE: {rmse_lr_rfe}")
print(f"MAE: {mae_lr_rfe}")
print(f"R²: {r2_lr_rfe}")

# Evaluate Support Vector Machines Model
mse_svr_rfe = mean_squared_error(y_test, y_pred_svr_rfe)
rmse_svr_rfe = mean_squared_error(y_test, y_pred_svr_rfe, squared=False)
mae_svr_rfe = mean_absolute_error(y_test, y_pred_svr_rfe)
r2_svr_rfe = r2_score(y_test, y_pred_svr_rfe)

print("\n\nSupport Vector Machines Metrics with RFE Regularization:")
print(f"MSE: {mse_svr_rfe}")
print(f"RMSE: {rmse_svr_rfe}")
print(f"MAE: {mae_svr_rfe}")
print(f"R²: {r2_svr_rfe}")

# Evaluate Random Forest model
mse_rf_rfe = mean_squared_error(y_test, y_pred_rf_rfe)
rmse_rf_rfe = mean_squared_error(y_test, y_pred_rf_rfe, squared=False)
mae_rf_rfe = mean_absolute_error(y_test, y_pred_rf_rfe)
r2_rf_rfe = r2_score(y_test, y_pred_rf_rfe)

print("\n\nRandom Forest Metrics with RFE Regularization:")
print(f"MSE: {mse_rf_rfe}")
print(f"RMSE: {rmse_rf_rfe}")
print(f"MAE: {mae_rf_rfe}")
print(f"R²: {r2_rf_rfe}")

# Evaluate XGBoost model
mse_xgb_rfe = mean_squared_error(y_test, y_pred_xgb_rfe)
rmse_xgb_rfe = mean_squared_error(y_test, y_pred_xgb_rfe, squared=False)
mae_xgb_rfe = mean_absolute_error(y_test, y_pred_xgb_rfe)
r2_xgb_rfe = r2_score(y_test, y_pred_xgb_rfe)

print("\n\nXGBoost Metrics with RFE Regularization:")
print(f"MSE: {mse_xgb_rfe}")
print(f"RMSE: {rmse_xgb_rfe}")
print(f"MAE: {mae_xgb_rfe}")
print(f"R²: {r2_xgb_rfe}")

Linear Regression Metrics with RFE Regularization:
MSE: 556927223.1231594
RMSE: 23599.305564426242
MAE: 13131.219253970463
R²: 0.9925116266393855


Support Vector Machines Metrics with RFE Regularization:
MSE: 77151629471.78914
RMSE: 277761.82147982315
MAE: 193396.74658696077
R²: -0.03737110142447886


Random Forest Metrics with RFE Regularization:
MSE: 51620477.41655997
RMSE: 7184.739203099857
MAE: 1441.2168949117495
R²: 0.9993059175563719


XGBoost Metrics with RFE Regularization:
MSE: 39268481.08072892
RMSE: 6266.456820303553
MAE: 2856.110962607408
R²: 0.9994720009447775


In [203]:
# gather evaluation metrics and compare to the previous step (full feature set)

# Lasso metrics
metrics_lasso = {
    'Model Lasso': ['Linear Regression Lasso', 'Support Vector Machines Lasso', 'Random Forest Lasso', 'XGBoost Lasso'],
    'MSE Lasso': [mse_lr_lasso, mse_svr_lasso, mse_rf_lasso, mse_xgb_lasso],
    'RMSE Lasso': [rmse_lr_lasso, rmse_svr_lasso, rmse_rf_lasso, rmse_xgb_lasso],
    'MAE Lasso': [mae_lr_lasso, mae_svr_lasso, mae_rf_lasso, mae_xgb_lasso],
    'R² Lasso': [r2_lr_lasso, r2_svr_lasso, r2_rf_lasso, r2_xgb_lasso]
}

metrics_lasso_df = pd.DataFrame(metrics_lasso)

# RFE metrics
metrics_rfe = {
    'Model RFE': ['Linear Regression RFE', 'Support Vector Machines RFE', 'Random Forest RFE', 'XGBoost RFE'],
    'MSE RFE': [mse_lr_rfe, mse_svr_rfe, mse_rf_rfe, mse_xgb_rfe],
    'RMSE RFE': [rmse_lr_rfe, rmse_svr_rfe, rmse_rf_rfe, rmse_xgb_rfe],
    'MAE RFE': [mae_lr_rfe, mae_svr_rfe, mae_rf_rfe, mae_xgb_rfe],
    'R² RFE': [r2_lr_rfe, r2_svr_rfe, r2_rf_rfe, r2_xgb_rfe]
}

metrics_rfe_df = pd.DataFrame(metrics_rfe)

display(metrics_all_df, metrics_lasso_df, metrics_rfe_df)


Unnamed: 0,Model All,MSE All,RMSE All,MAE All,R² All
0,Linear Regression All,501994700.0,22405.237672,12312.011893,0.99325
1,Support Vector Machines All,79089220000.0,281228.065661,196740.496998,-0.063424
2,Random Forest All,9571153.0,3093.728067,481.520637,0.999871
3,XGBoost All,10739500.0,3277.118303,1424.851031,0.999856


Unnamed: 0,Model Lasso,MSE Lasso,RMSE Lasso,MAE Lasso,R² Lasso
0,Linear Regression Lasso,502294800.0,22411.93328,12346.503114,0.993246
1,Support Vector Machines Lasso,79075690000.0,281204.005044,196711.01995,-0.063242
2,Random Forest Lasso,9864777.0,3140.824332,481.960707,0.999867
3,XGBoost Lasso,11856250.0,3443.29099,1484.522354,0.999841


Unnamed: 0,Model RFE,MSE RFE,RMSE RFE,MAE RFE,R² RFE
0,Linear Regression RFE,556927200.0,23599.305564,13131.219254,0.992512
1,Support Vector Machines RFE,77151630000.0,277761.82148,193396.746587,-0.037371
2,Random Forest RFE,51620480.0,7184.739203,1441.216895,0.999306
3,XGBoost RFE,39268480.0,6266.45682,2856.110963,0.999472
