# Group Notes

**Remaining to-do's:**
* use `get_coordinates_Nominatim()` function to create a feature that indicates the distance between a property and it's host_location
* explore dropping/cleaning observations with max nights > 365 or some large cutoff (there are ~30 instances with max nights > 2 billion)
* Get weighted score feature (num stars weighted by num review) working -- it currently throws an overflow errror and I haven't figured out why yet
* Target encode location cluster and all-features cluster (and also keep cluster variable) -- the sklearn `TargetEncoder()` throws and error and I haven't figured out why yet
* Explore stacking!!

# Imports and setup

In [1666]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, TargetEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.metrics.pairwise import haversine_distances
from sklearn.experimental import enable_iterative_imputer # MUST import this to enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import RandomForestRegressor # For feature importance

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from geopy.geocoders import Nominatim
from time import sleep
from random import randint
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

from scipy.sparse import hstack
from scipy.stats.mstats import hmean

from lightgbm import LGBMRegressor

pd.set_option('display.max_columns', None)

In [1667]:
TODAY = pd.to_datetime('2024-01-01')

In [1668]:
## Read in and preview data
df = pd.read_csv('../data/train.csv')
df = df.drop(columns=['Id'])
df['host_id'] = df['host_id'].astype(str)
df.head(2)

Unnamed: 0,price,name,neighborhood_overview,host_id,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,143.0,Guesthouse in Oakland · ★4.93 · 1 bedroom · 1 ...,You will be in the Crocker Highlands neighborh...,4211733,2012-11-21,"Oakland, CA",,within a few hours,100%,89%,f,1.0,"['email', 'phone']",t,t,Lakeshore,37.815091,-122.237531,Entire guesthouse,Entire home/apt,4,1 bath,,1.0,[],3,365,3,3,365,365,3.0,365.0,t,16,41,71,346,14,14,2,2023-04-28,2023-11-26,4.93,5.0,4.86,4.86,5.0,4.93,4.86,f
1,103.0,Rental unit in San Francisco · 1 bedroom · 1 b...,,1257432,2011-10-06,"San Francisco, CA","Our Company is San Francisco Life Real Estate,...",within an hour,98%,81%,t,32.0,"['email', 'phone', 'work_email']",t,t,Western Addition,37.770767,-122.427483,Entire rental unit,Entire home/apt,2,1 bath,,1.0,"[""Hair dryer"", ""TV"", ""Dishwasher"", ""Elevator"",...",30,365,30,30,365,365,30.0,365.0,t,17,40,43,317,2,2,0,2023-09-05,2023-10-31,5.0,4.5,4.5,5.0,5.0,5.0,4.5,f


In [1669]:
## Drop all instances with prices > 3000 to avoid using poor quality data
df = df[df.price <= 3000]

# nights_cols = [col for col in df.columns if 'nights' in col]
# for col in nights_cols:
#     df.loc[(df[col] > 2000), col] = 1125

#for col in 

## TODO: look at dropping max nights > 365 or some large cutoff (there are ~30 instances with max nights > 2 billion)

In [1670]:
## Split data into features and target
X_orig = df.drop('price', axis=1)
y_orig = df['price']

## Drop some columns right away -- they are not used and we currently have no plan to use them
DROP_COLS = ['name']#, 'host_location']#, 'host_about', 'host_has_profile_pic'] # TODO: look at overview again
X_orig = X_orig.drop(DROP_COLS, axis=1)

TODO: use this function to get distance between property and host_location

In [1671]:
def get_coordinates_Nominatim(address: str, sleep_seconds: int):
    geocoder = Nominatim(user_agent = 'trial'+str(randint(0,1000)))
    try:
        result = geocoder.geocode(address)
        sleep(randint(1 * 100, sleep_seconds * 100) / 100)
        if result:
            return result
        else:
            return None
    except GeocoderTimedOut:
        print('TIMED OUT: GeocoderTimedOut: Retrying...')
        sleep(randint(1 * 100, sleep_seconds * 100) / 100)
        return get_coordinates_Nominatim(address, sleep_seconds)
    except GeocoderServiceError as e:
        print('CONNECTION REFUSED: GeocoderServiceError encountered.')
        return None
    except Exception as e:
        print(f'ERROR: Terminating due to exception {e}')
        return None

# Pre-split engineering

Perform feature engineering tasks that should or can be performed on **both** train and test

In [1672]:
def pre_split_engineering(X):
    ## Fix boolean columns
    X['host_is_superhost'] = X['host_is_superhost'].map({'t': 1, 'f': 0})
    X['host_identity_verified'] = X['host_identity_verified'].map({'t': 1, 'f': 0})
    X['instant_bookable'] = X['instant_bookable'].map({'t': 1, 'f': 0})
    X['has_availability'] = X['has_availability'].map({'t': 1, 'f': 0})
    X['host_has_profile_pic'] = X['host_has_profile_pic'].map({'t': 1, 'f': 0})

    ## Fix percentage columns
    X['host_response_rate'] = X['host_response_rate'].str.replace('%', '').astype(float) / 100
    X['host_acceptance_rate'] = X['host_acceptance_rate'].str.replace('%', '').astype(float) / 100

    ## Convert dates to durations

    X['host_since'] = pd.to_datetime(X['host_since'])
    X['first_review'] = pd.to_datetime(X['first_review'])
    
    X['time_to_first_rev'] = (X['first_review'] - X['host_since']).dt.days

    X['host_lifetime'] = (TODAY - X['host_since']).dt.days # December 2023 
    X = X.drop('host_since', axis=1)

    X['days_since_first_review'] = (TODAY - X['first_review']).dt.days # December 2023
    X = X.drop('first_review', axis=1)

    X['days_since_last_review'] = (TODAY - pd.to_datetime(X['last_review'])).dt.days # December 2023
    X = X.drop('last_review', axis=1)

    X['property_type'] = X['property_type'].fillna('other')
    X.loc[X['property_type'].map(X['property_type'].value_counts() < 6),'property_type'] = 'other'

    ## Calculate availability percentage
    
    X['availability_30'] = X['availability_30'] / 30
    X['availability_60'] = X['availability_60'] / 60
    X['availability_90'] = X['availability_90'] / 90
    X['availability_365'] = X['availability_365'] / 365

    #X = X.drop(columns=['has_availability','availability_30','availability_60','availability_90','availability_365'])

    ## Add hours to host_response_time
    X['host_response_time'] = X['host_response_time'].map({
        'within an hour': 1,
        'within a few hours': 6,
        'within a day': 24,
        'a few days or more': 48,
    })

    X['has_overview'] = X.neighborhood_overview.notnull().astype(int)
    X = X.drop('neighborhood_overview', axis=1)

    X['has_host_about'] = X.host_about.notnull().astype(int)
    X = X.drop('host_about', axis=1)

    ## Indicate if number of review is 0
    X['has_reviews'] = X['number_of_reviews'] > 0
 
    ### ENGINEER LOCATION ###

    ## Extract number from bathrooms_text column
    X['n_bathrooms'] = X['bathrooms_text'].str.extract('(\d+)').astype(float)
    X = X.drop('bathrooms_text', axis=1)

    # ## Engineer latitude and longitude
    X['latitude_rad'] = np.radians(X['latitude'])
    X['longitude_rad'] = np.radians(X['longitude'])

    X['coord_x'] = np.cos(X['latitude']) * np.cos(X['longitude'])
    X['coord_y'] = np.cos(X['latitude']) * np.sin(X['longitude'])

    X['host_in_CA'] = X['host_location'].str.contains(', CA', case=False, na=False).astype(int)
    X = X.drop('host_location', axis=1)
    # X['coord_interaction'] = X['coord_x'] * X['coord_y']

    ## Calculate haversine distance to SF, Berkely, San Jose, Palo Alto, Santa Cruz

    sf_lat = 37.7749
    sf_lon = -122.4194
    sf_target = np.radians([sf_lat, sf_lon])
    X['dist_to_sf'] = haversine_distances(np.array([X['latitude_rad'],X['longitude_rad']]).T, sf_target.reshape(1, -1))

    berk_lat = 37.8715
    berk_lon = -122.2730
    berk_target = np.radians([berk_lat, berk_lon])
    X['dist_to_berkely'] = haversine_distances(np.array([X['latitude_rad'],X['longitude_rad']]).T, berk_target.reshape(1, -1))

    jose_lat = 37.3387
    jose_lon = -121.8853
    jose_target = np.radians([jose_lat, jose_lon])
    X['dist_to_jose'] = haversine_distances(np.array([X['latitude_rad'],X['longitude_rad']]).T, jose_target.reshape(1, -1))

    palo_lat = 37.4419
    palo_lon = -122.1430
    palo_target = np.radians([palo_lat, palo_lon])
    X['dist_to_sf'] = haversine_distances(np.array([X['latitude_rad'],X['longitude_rad']]).T, palo_target.reshape(1, -1))

    sc_lat = 36.9741
    sc_lon = -122.0308 
    sc_target = np.radians([sc_lat, sc_lon])
    X['dist_to_sc'] = haversine_distances(np.array([X['latitude_rad'],X['longitude_rad']]).T, sc_target.reshape(1, -1))

    X = X.drop(columns=['latitude', 'longitude'])

    ### TEXT TRANSFORMATIONS ###

    ## Convert stringified lists to comma separated text for CountVectorizer
    X['amenities'] = X['amenities'].fillna('None')
    X['host_verifications'] = X['host_verifications'].fillna('None')

    X['has_amenities'] = (X.amenities != '[]').astype(int)
    X['n_amenities'] = X['amenities'].apply(lambda x: len(x.split(',')))
    X = X.drop('amenities', axis=1)

    return X

**Pre-split drop** drops columns that we don't want to use. We drop these before post-split engineering because some post-split engineering (interactions via `PolynomialFeatures()`, KMeans, etc.) take a long time with too many columns

In [1673]:
def pre_split_drop(X):
    ## To drop a column from the final X dataframe, add it to this list
    # NOTE: commented columns are KEPT
    PRE_SPLIT_DROP = [
        #'host_id', 
        #'host_response_time', 
        #'host_response_rate',
        #'host_acceptance_rate', 
        #'host_is_superhost', 
        #'host_listings_count',
        'host_verifications', 
        'host_has_profile_pic', 
        #'host_identity_verified',
        #'neighbourhood_cleansed', # this changes results very little
        #'property_type',  
        #'room_type', 
        #'accommodates', 
        'bedrooms', 
        #'beds', 
        #'minimum_nights', 
        #'maximum_nights',
        'minimum_minimum_nights',
        'maximum_minimum_nights',
        'minimum_maximum_nights', 
        #'maximum_maximum_nights',
        'minimum_nights_avg_ntm', 
        'maximum_nights_avg_ntm', 
        'has_availability',
        #'availability_30', 
        #'availability_60', 
        #'availability_90',
        #'availability_365', 
        #'number_of_reviews', 
        #'number_of_reviews_ltm',
        'number_of_reviews_l30d', 
        #'review_scores_rating',
        'review_scores_accuracy', 
        'review_scores_cleanliness',
        'review_scores_checkin', 
        'review_scores_communication',
        'review_scores_location', 
        'review_scores_value', 
        #'instant_bookable',
        'time_to_first_rev', 
        'host_lifetime', 
        'days_since_first_review',
        'days_since_last_review', 
        #'has_overview', 
        #'has_host_about',
        #'has_reviews', 
        #'n_bathrooms', 
        #'latitude_rad', 
        #'longitude_rad',
        #'coord_x', 
        #'coord_y', 
        #'dist_to_sf', 
        #'dist_to_berkely', 
        #'dist_to_jose',
        #'dist_to_sc',
        #'has_amenities', 
        'n_amenities',
        #'host_in_CA'
        ]

    X = X.drop(PRE_SPLIT_DROP, axis=1)

    return X

# Post-split feature engineering

Perform feature engineering tasks that must be performed **separately** on train and test

In [1674]:
def post_split_engineering(X_train, X_test, y_train):
    TARGET_ENCODE_COLS = ['host_id','property_type']#, 'room_type'] # TODO: get room type to work as category!
    
    for col in TARGET_ENCODE_COLS:
        if col in X_train.columns:
            enc = TargetEncoder(smooth=0)
            X_train[col] = enc.fit_transform(np.array(X_train[col]).reshape(-1,1), y_train)
            X_test[col] = enc.transform(np.array(X_test[col]).reshape(-1,1))

    ## Normalize 5-star reviews to percentage using min-max scaling with 1 as min and 5 as max
    RATING_COLS = [col for col in X_train.columns if 'review_scores' in col]

    ## Combine and normalize review scores
    X_train['review_scores'] = X_train[RATING_COLS].mean(axis=1)
    X_test['review_scores'] = X_test[RATING_COLS].mean(axis=1)
    X_train['review_scores'] = (X_train['review_scores'] - 1) / (5 - 1)
    X_test['review_scores'] = (X_test['review_scores'] - 1) / (5 - 1)
    X_train['review_scores_rank'] = X_train['review_scores'].rank(pct=True)
    X_test['review_scores_rank'] = X_test['review_scores'].rank(pct=True)

    ## TODO: fix overflow errror from weighted score
    # X_train['weighted_score'] = X_train['review_scores'] * 10 * np.log(X_train['number_of_reviews'])
    # X_train['weighted_score_rank'] = X_train['weighted_score'].rank(pct=True)
    # X_test['weighted_score'] = X_test['review_scores'] * 10 * np.log(X_test['number_of_reviews'])
    # X_test['weighted_score_rank'] = X_test['weighted_score'].rank(pct=True)

    # X_train = X_train.drop(columns=['host_id'])
    # X_test = X_test.drop(columns=['host_id'])

    ## Cluster by location only
    latlon_clustering = KMeans(n_clusters=2, random_state=9) # Joe Burrow # 4 FOUR!!!
    train_latlon_cluster_labels = latlon_clustering.fit_predict(X_train[['coord_x', 'coord_y']])
    test_latlon_cluster_labels = latlon_clustering.fit_predict(X_test[['coord_x', 'coord_y']])

    X_train['location_cluster'] = train_latlon_cluster_labels
    X_test['location_cluster'] = test_latlon_cluster_labels

    ## Change cluster to int
    X_train['location_cluster'] = X_train['location_cluster'].astype(int)
    X_test['location_cluster'] = X_test['location_cluster'].astype(int)

    ## Target encode location clusters TODO: fix target encoding here
    # enc = TargetEncoder()
    # X_train['location_cluster_y'] = enc.fit_transform(np.array(X_train['location_cluster']).reshape(-1,1), y_train)
    # X_test['location_cluster_y'] = enc.transform(np.array(X_test['location_cluster']).reshape(-1,1))

    ## Define pipeline for numeric features
    numeric_pipe = Pipeline([ # NOTE: assumes preprocessor has already been run
        ('impute', IterativeImputer(random_state=9)), # Joe Burrow
        #('poly',PolynomialFeatures(interaction_only=True, include_bias=False)),#degree=2, include_bias=False)),
        #('standardize',StandardScaler(with_mean=True, with_std=True))
    ])

    ## Create col transformer for numeric features
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    cat_features = X_train.select_dtypes(exclude=['int64','float64']).columns

    numeric_transformer = ColumnTransformer([
        ('num', numeric_pipe, numeric_features)
    ])

    cols = X_train.columns

    X_train = numeric_transformer.fit_transform(X_train, y_train)
    X_test = numeric_transformer.transform(X_test)
 
    #X_train.loc[:,numeric_features] = numeric_transformer.fit_transform(X_train, y_train)
    #X_test.loc[:,numeric_features] = numeric_transformer.transform(X_test)

    ## Combine transformed numeric and categorical features
    #X_train = pd.concat([X_train_numeric.reset_index(), X_train[cat_features]], axis=1)
    #X_test = pd.concat([X_test_numeric.reset_index(), X_test[cat_features]], axis=1)

    #X_train = pd.conat([X_train[cat_features].reset_index(drop=True), X_train_numeric.reset_index(drop=True)], axis=1) #numeric_pipe.fit_transform(X_train, y_train)
    #X_test = pd.concat([X_test[cat_features].reset_index(drop=True), X_test_numeric.reset_index(drop=True)], axis=1) #numeric_pipe.transform(X_test)
    
    X_train = pd.DataFrame(X_train, columns=numeric_features)
    X_test = pd.DataFrame(X_test, columns=numeric_features)

    #X_train.loc[:,cat_features] = X_train[cat_features].astype('category')
    #X_test.loc[:,cat_features] = X_test[cat_features].astype('category')

    ## Extract linear model coefficients and feature names
    # X_names = numeric_transformer.named_steps_['num'].named_steps['poly'].get_feature_names_out() # X0, X1, X2, X2^2, X1*X2, etc.
    # x_mapping = {f'x{i}':name for i,name in enumerate(cols)} # X0=Salnty, etc.

    # ## Turn x names into actual feature names with x_mapping
    # actual_names = X_names.copy()
    # for x in X_names:
    #     for key in x_mapping:
    #         if key in x:
    #             actual_names = [re.sub(key,x_mapping[key],x) for x in actual_names]

    ## Cluster by everything
    clustering = KMeans(n_clusters=3, random_state=9) # Joe Burrow
    train_cluster_labels = clustering.fit_predict(X_train)#[numeric_features])
    test_cluster_labels = clustering.fit_predict(X_test)#[numeric_features])

    ## Add column to X_train and X_test
    X_train['cluster'] = train_cluster_labels
    X_test['cluster'] = test_cluster_labels

    ## Target encode cluster TODO: fix target encoding here
    # enc2 = TargetEncoder()
    # X_train['cluster_y'] = enc2.fit_transform(np.array(X_train['cluster']).reshape(-1,1), y_train)
    # X_test['cluster_y'] = enc2.transform(np.array(X_test['cluster']).reshape(-1,1))
    
    return X_train, X_test, y_train

# Run pipeline for testing/iteration

Perform train/test split on known data to evaluate feature engineering, perform variable selection, etc.

In [1675]:
X_engineered = pre_split_engineering(X_orig) # Perform pre-split engineering
X_engineered_cleaned = pre_split_drop(X_engineered) # Drop columns that are not needed

In [1676]:
## Perform train-test split and post-split engineering
X_train_, X_test_, y_train_, y_test = train_test_split(X_engineered_cleaned, y_orig, test_size=0.2, random_state=9) # Joe Burrow
X_train, X_test, y_train = post_split_engineering(X_train_, X_test_, y_train_)



In [1677]:
## Change object columns to category
cat_cols = X_train.select_dtypes(include='object').columns
X_train[cat_cols] = X_train[cat_cols].astype('category')
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [1678]:
# compare_index = X_test.index

# ## Train a model for each cluster
# models = {f'cluster_{i}': LGBMRegressor(num_iterations=600, learning_rate=0.1, objective='huber', random_state=9) for i in range(len(pd.Series(train_latlon_cluster_labels).unique()))}
# preds = {f'cluster_{i}': None for i in range(len(pd.Series(train_latlon_cluster_labels).unique()))}

# for cluster in pd.Series(train_latlon_cluster_labels).unique():
#     print(f'Training model for cluster {cluster}...')

#     X_train_filtered = X_train.loc[train_latlon_cluster_labels == cluster, X_train.select_dtypes(include=['int64','float64']).columns]
#     y_train_filtered = y_train[train_latlon_cluster_labels == cluster]

#     X_test_filtered = X_test.loc[test_latlon_cluster_labels == cluster, X_train.select_dtypes(include=['int64','float64']).columns]
#     y_test_filtered = y_test[test_latlon_cluster_labels == cluster]

#     ## Drop location cluster
#     X_train_filtered = X_train_filtered.drop('location_cluster', axis=1)
#     X_test_filtered = X_test_filtered.drop('location_cluster', axis=1)

#     models[f'cluster_{cluster}'].fit(X_train_filtered, np.log(y_train_filtered))
#     print(f'Model for cluster {cluster} trained.')

#     ## Predict on test set
#     preds[f'cluster_{cluster}'] = pd.Series(np.exp(models[f'cluster_{cluster}'].predict(X_test_filtered)), index=X_test_filtered.index)

# ## Concat predictions and sort by index
# all_y = pd.concat([preds[f'cluster_{cluster}'] for cluster in range(len(preds)) if cluster in pd.Series(train_latlon_cluster_labels).unique()])#.sort_index(compare_index)

# ## Sort all y by order of compare_index
# y_pred_testing = all_y.reindex(compare_index)

# mean_absolute_error(y_test, y_pred_testing) 

In [1679]:
## Drop cluster 2
keep_index = (X_train['cluster'] != 1).to_numpy()
X_train = X_train.loc[keep_index,:]
y_train = y_train[keep_index]

In [1680]:
def train_predict_partition(X_train, X_test, y_train, col, cutoff, score=False):
    compare_index = X_test.index
    is_small_company_train = (X_train[col] > cutoff).to_numpy()
    is_small_company_test = (X_test[col] > cutoff).to_numpy()

    small_company_model = LGBMRegressor(num_iterations=600, learning_rate=0.1, objective='huber', random_state=9)
    large_company_model = LGBMRegressor(num_iterations=600, learning_rate=0.1, objective='huber', random_state=9)   

    small_company_model.fit(X_train[is_small_company_train], np.log(y_train[is_small_company_train]))
    large_company_model.fit(X_train[~is_small_company_train], np.log(y_train[~is_small_company_train]))

    small_y_pred = pd.Series(np.exp(small_company_model.predict(X_test[is_small_company_test])), index=X_test[is_small_company_test].index)
    large_y_pred = pd.Series(np.exp(large_company_model.predict(X_test[~is_small_company_test])), index=X_test[~is_small_company_test].index)

    y_pred_testing = pd.concat([small_y_pred, large_y_pred]).reindex(compare_index)
    
    if score:
        print(f'MAE: {mean_absolute_error(y_test, y_pred_testing)}')

    return y_pred_testing

In [1681]:
#train_predict_partition(X_train, X_test, 'latitude_rad', np.radians(37.8253), score=True)
train_predict_partition(X_train, X_test, y_train, col='host_listings_count', cutoff=200, score=True)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3278
[LightGBM] [Info] Number of data points in the train set: 1049, number of used features: 34
[LightGBM] [Info] Start training from score 5.305337




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4882
[LightGBM] [Info] Number of data points in the train set: 11062, number of used features: 37
[LightGBM] [Info] Start training from score 4.955617
MAE: 55.60505493115693


0       123.539980
1        67.821683
2        92.214198
3       156.782852
4        90.396725
           ...    
3030     71.520873
3031    213.496844
3032    105.753710
3033    105.871984
3034    202.886971
Length: 3035, dtype: float64

In [1682]:
testing_model = LGBMRegressor(num_iterations=625, learning_rate=0.1, objective='huber', random_state=9)#, categorical_feature=f'name:{X_train.select_dtypes("category").columns}') # Joe Burrow
testing_model.fit(X_train[X_train.select_dtypes(exclude=['category']).columns], np.log(y_train)) #[X_train.select_dtypes(include=['int64','float64']).columns]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4933
[LightGBM] [Info] Number of data points in the train set: 12111, number of used features: 37
[LightGBM] [Info] Start training from score 4.985908




In [1683]:
y_pred_testing = np.exp(testing_model.predict(X_test[X_train.select_dtypes(exclude=['category']).columns]))
mean_absolute_error(y_test, y_pred_testing) 
# for reference, the score of ~100 on Kaggle got ~63.46 here. The score of 91.5 on Kaggle got a 57.16 here.

54.96112439383056

# Run pipeline to predict for Kaggle

In [1684]:
## Read in and preview data
data = pd.read_csv('../data/train.csv')
newdata = pd.read_csv('../data/test.csv')

data = data.drop(columns=['Id'])
new_data_id = newdata['Id']
newdata = newdata.drop(columns=['Id'])

data['host_id'] = data['host_id'].astype(str)
newdata['host_id'] = newdata['host_id'].astype(str)

data = data.drop(DROP_COLS, axis=1)
newdata = newdata.drop(DROP_COLS, axis=1)

## Split data into features and target
X = data.drop('price', axis=1)
y = data['price']

In [1685]:
X_1 = pre_split_engineering(X) # All data with known y
newdata_1 = pre_split_engineering(newdata) # New data for Kaggle

X_2  = pre_split_drop(X_1) # All data with known y
newdata_2 = pre_split_drop(newdata_1) # New data for Kaggle

In [1686]:
X, newdata_final, y = post_split_engineering(X_train=X_2, X_test=newdata_2, y_train=y)



If predicting off partition...

In [1687]:
y_pred = train_predict_partition(X_train=X, X_test=newdata_final, y_train=y, col='host_listings_count', cutoff=200)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3319
[LightGBM] [Info] Number of data points in the train set: 1341, number of used features: 34
[LightGBM] [Info] Start training from score 5.368042




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4938
[LightGBM] [Info] Number of data points in the train set: 13933, number of used features: 38
[LightGBM] [Info] Start training from score 4.984020


If **tuning hyperparams** from scratch... Otherwise, use predetermined hyperparams. Tune time is ~22 mins on 4 threads.

LightGBM hyperparameters outlined at https://lightgbm.readthedocs.io/en/latest/Parameters.html

In [None]:
# Define model to tune; set `num_threads` or `device` to train faster
tuned_model = LGBMRegressor(num_threads=4, objective='huber', random_state=9) # Joe Burrow # device='gpu'

## Tune hyperparameters with random search
# param_distributions = {
#     'num_iterations': [300,400,500], #range(75, 501, 25),
#     'learning_rate': [0.01, 0.05, 0.1],
#     #'max_depth': [2,3,4,5,6],    #'num_leaves': range(20, 61, 5),
#     #'n_neighbors': range(1,101),
#     #'weights': ['uniform', 'distance'],
# }

fine_tune_grid = {
    'num_iterations': [500,550,600], #range(75, 501, 25),
    'learning_rate': [0.75, 0.1, 0.125],
}

search = RandomizedSearchCV(tuned_model, fine_tune_grid,  verbose=False,
                            n_iter=100, cv=30,  # 30-fold cross-validation seems to be a good balance between time and accuracy
                            scoring='neg_mean_absolute_error', random_state=9) # Joe Burrow

search.fit(X, np.log(y))
print(search.best_params_)

y_pred = np.exp(search.predict(newdata_final))

In [None]:
search.best_params_

If predicting with **known/predetermined hyperparams**...

In [1690]:
## Create model with best hyperparams (last tuned on 2/20 around 4:45 PM)
tuned_model = LGBMRegressor(num_iterations=625, learning_rate=0.1, random_state=9, objective='huber') # Joe Burrow
tuned_model.fit(X, np.log(y))
print('Model fit. Predicting...')
y_pred = np.exp(tuned_model.predict(newdata_final))



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4991
[LightGBM] [Info] Number of data points in the train set: 15274, number of used features: 38
[LightGBM] [Info] Start training from score 5.017736
Model fit. Predicting...


Write out predictions for Kaggle

In [1691]:
## Output predictions to csv
output = pd.DataFrame({'Id': new_data_id, 'price': y_pred})
output.head()

Unnamed: 0,Id,price
0,PSJEN,70.638006
1,PVZW7,164.253129
2,EJLAM,50.724875
3,SDHPB,58.129009
4,MJGYX,289.844416


In [1692]:
output.to_csv('../data/submission.csv', index=False)