In [None]:
!pip install shap

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('default')

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score as R2
from scipy.special import softmax
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel

import shap

# A few amenity columns arrive with malformed names (a "translationmissing:en."
# prefix and stray list brackets). One shared mapping is applied to both frames so
# train and test cannot drift apart and the model sees identical feature names at
# fit and predict time.
COLUMN_RENAMES = {
    "translationmissing:en.hosting_amenity_49": "hosting_amenity_49",
    "translationmissing:en.hosting_amenity_50": "hosting_amenity_50",
    "[24-hourcheck-in": "24-hourcheck-in",
    "wirelessinternet]": "wirelessinternet",
}

train = pd.read_csv('/content/combine_train.csv', low_memory=False, index_col='property_id')
# Rows without a target cannot be used for supervised training.
train = train.dropna(subset=['target'], how='any').rename(columns=COLUMN_RENAMES)

test = pd.read_csv('/content/combine_test.csv', low_memory=False, index_col='property_id')
test = test.rename(columns=COLUMN_RENAMES)

In [None]:
# Preview the first rows of the test frame to sanity-check the load and column renames.
test.head()

In [None]:
def print_feature_importances_random_forest(random_forest_model):
    '''
    Prints the feature importances of a Random Forest model, most important first.

    random_forest_model -> The fitted sklearn.ensemble.RandomForestRegressor or
                           RandomForestClassifier model (any object exposing
                           `feature_importances_` works)

    Each feature is printed on its own line as "<name> -> <importance>" with four
    decimals. Features with equal importance keep their original column order.
    Returns None.
    '''

    importances = random_forest_model.feature_importances_

    # `feature_names_in_` is only present when the model was fitted on input that
    # carries string column names (e.g. a DataFrame). Fall back to positional names
    # so models fitted on plain arrays can still be inspected.
    features = getattr(random_forest_model, 'feature_names_in_', None)
    if features is None:
        features = [f"feature_{i}" for i in range(len(importances))]

    # Sort (name, importance) pairs directly; sorted() is stable, also with
    # reverse=True, so ties keep their input order.
    ranked = sorted(zip(features, importances), key=lambda pair: pair[1], reverse=True)

    for name, importance in ranked:
        print(f"{name} -> {importance:.4f}")

def print_feature_importances_shap_values(shap_values, features):
    '''
    Prints the feature importances based on SHAP values, most important first.

    shap_values -> The SHAP values calculated from a shap.Explainer object; only its
                   `.values` array is read, with features along axis 1
    features -> The name of the features, on the order presented to the explainer

    Each line shows a feature's mean absolute SHAP value followed by its softmax
    counterpart, both with four decimals. Features with equal importance keep their
    original order.
    Raises ValueError if the number of names differs from the number of SHAP columns.
    Returns None.
    '''

    values = np.asarray(shap_values.values)
    feature_names = list(features)

    # zip() would silently truncate on a length mismatch and drop features from the
    # report without any warning; fail loudly instead.
    if len(feature_names) != values.shape[1]:
        raise ValueError(
            f"Got {len(feature_names)} feature names for {values.shape[1]} SHAP value columns"
        )

    # Mean |SHAP| per feature in one vectorized pass: average over every axis
    # except the feature axis (axis 1).
    other_axes = tuple(axis for axis in range(values.ndim) if axis != 1)
    importances = np.abs(values).mean(axis=other_axes)

    # softmax exponentiates before normalizing, so the result sums to 1 but is not
    # proportional to the raw importances.
    importances_norm = softmax(importances)

    # Rank once, keeping each name with both of its scores.
    ranked = sorted(zip(feature_names, importances, importances_norm),
                    key=lambda row: row[1], reverse=True)

    for name, importance, normalized in ranked:
        print(f"{name} -> {importance:.4f} (softmax = {normalized:.4f})")

def evaluate_regression(y, y_pred):
    '''
    Reports MAE, MSE, RMSE and R2 for a set of regression predictions.

    y -> The ground-truth target values
    y_pred -> The predicted values, aligned with y

    The metrics are printed with two decimals; nothing is returned.
    '''

    abs_error = MAE(y, y_pred)
    sq_error = MSE(y, y_pred)
    # RMSE is the square root of the MSE
    root_sq_error = sq_error ** 0.5
    r_squared = R2(y, y_pred)

    # Assemble the whole report first, then emit it in a single write
    report_lines = [
        'Regression result',
        f"MAE: {abs_error:.2f}",
        f"MSE: {sq_error:.2f}",
        f"RMSE: {root_sq_error:.2f}",
        f"R2: {r_squared:.2f}",
    ]
    print("\n".join(report_lines))

In [None]:
# Features are every training column except the label and `reviews_last`.
X = train.drop(columns=['target', 'reviews_last'])
y = train['target']

# errors='ignore' keeps this cell re-runnable: `test` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
test = test.drop(columns=['reviews_last'], errors='ignore')

# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [None]:
# n_jobs=-1 builds (and predicts with) the 1000 trees on all available cores
# instead of one; the seed of every tree is still fixed by random_state.
model = RandomForestRegressor(max_depth=80, min_samples_leaf=5, min_samples_split=12,
                      n_estimators=1000, random_state=42, n_jobs=-1)

model.fit(X_train, y_train)

# Score on the 20% hold-out split, then list which features the forest relied on.
# NOTE(review): the recorded run reports R2 = -0.02, i.e. no better than predicting
# the mean target - treat the importances below with caution.
y_pred = model.predict(X_test)
evaluate_regression(y_test, y_pred)
print_feature_importances_random_forest(model)

Regression result
MAE: 30.86
MSE: 2544.10
RMSE: 50.44
R2: -0.02
property_scraped_at_host_since -> 0.0928
centroid_distance -> 0.0792
reviews_per_month -> 0.0676
booking_availability_365 -> 0.0530
property_last_updated -> 0.0478
reviews_num -> 0.0441
reviews_rating -> 0.0365
booking_availability_90 -> 0.0315
booking_availability_30 -> 0.0238
booking_availability_60 -> 0.0233
booking_max_nights -> 0.0226
booking_min_nights -> 0.0211
host_response_rate -> 0.0167
reviews_location -> 0.0133
booking_cancel_policy -> 0.0129
responsetime_code -> 0.0128
property_max_guests -> 0.0128
property_type -> 0.0122
property_bedrooms -> 0.0121
extra_beds -> 0.0121
reviews_acc -> 0.0118
reviews_cleanliness -> 0.0117
reviews_value -> 0.0116
host_nr_listings_total -> 0.0103
host_nr_listings -> 0.0099
property_beds -> 0.0093
Is Location Exact -> 0.0093
tv -> 0.0090
smokingallowed -> 0.0083
Host Identity Verified -> 0.0081
elevatorinbuilding -> 0.0076
booking_price_covers -> 0.0075
reviews_checkin -> 0.0075
i

In [None]:
# SHAP explanation step, currently disabled. When enabled it produces the
# `shap_values` object that print_feature_importances_shap_values() expects.
# NOTE(review): the reason for disabling is not recorded - presumably runtime on a
# 1000-tree forest; confirm before re-enabling.
# explainer = shap.Explainer(model.predict, X_test)
# shap_values = explainer(X_test)

In [None]:
# Predict the target for the unlabelled test features.
target_pred = model.predict(test)

# Read the ids straight from the index (property_id is the index_col of the test
# CSV) instead of reset_index(inplace=True): mutating `test` would turn property_id
# into a column, so re-running this cell would pass an extra feature to predict().
predictions = pd.DataFrame({'property_id': test.index.to_numpy(), 'prediction': target_pred})
predictions.to_csv('predictions.csv', index=False)

In [None]:
# Quick look at the frame that was written to predictions.csv.
predictions.head()

Unnamed: 0,property_id,prediction
0,6501,62.784525
1,6502,73.293461
2,6503,74.860286
3,6504,80.761293
4,6505,72.485114
