In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [2]:
! pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Load the dataset
df = pd.read_csv("/content/drive/My Drive/holiday_rental.csv")

In [4]:
df['last_review'] = pd.to_datetime(df['last_review'])
latest_date = df['last_review'].max()
df['last_review'] = df['last_review'].fillna(latest_date)
df['days_since_last_review'] = (latest_date - df['last_review']).dt.days
df.drop('last_review', axis=1, inplace=True)
df['reviews_per_month'].fillna(df['reviews_per_month'].mean(), inplace=True)

To check the correlation

In [5]:
corr = df.corr()
corr

Unnamed: 0,id,host_id,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,days_since_last_review
id,1.0,0.58829,0.010619,-0.013224,-0.31976,0.253252,0.133272,0.085468,-0.387789
host_id,0.58829,1.0,0.015309,-0.017364,-0.140106,0.255046,0.15495,0.203492,-0.24046
price,0.010619,0.015309,1.0,0.042799,-0.047954,-0.022373,0.057472,0.081829,-0.025097
minimum_nights,-0.013224,-0.017364,0.042799,1.0,-0.080116,-0.091942,0.12796,0.144303,-0.015167
number_of_reviews,-0.31976,-0.140106,-0.047954,-0.080116,1.0,0.530093,-0.072376,0.172028,-0.122204
reviews_per_month,0.253252,0.255046,-0.022373,-0.091942,0.530093,1.0,-0.006701,0.16298,-0.375741
calculated_host_listings_count,0.133272,0.15495,0.057472,0.12796,-0.072376,-0.006701,1.0,0.225701,-0.086606
availability_365,0.085468,0.203492,0.081829,0.144303,0.172028,0.16298,0.225701,1.0,-0.233215
days_since_last_review,-0.387789,-0.24046,-0.025097,-0.015167,-0.122204,-0.375741,-0.086606,-0.233215,1.0


two columns id and host_id are correlated as compare to other columns.so id columns can be removed.

In [6]:
X = df.drop(columns=['price','id'],axis=1)
y = df['price']


In [7]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [8]:
# feature transformation
X = preprocessor.fit_transform(X)


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((39116, 236), (9779, 236))

In [None]:
model = RandomForestRegressor()
model.fit(X,y)

model = RandomForestRegressor()
# Train a random forest regression model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Get feature importances
importances = rf_model.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Rearrange feature names so they match the sorted feature importances
names = [X.columns[i] for i in indices]

# Print the feature ranking
print("Feature ranking:")
for i in range(X.shape[1]):
    print(f"{i+1}. {names[i]} ({importances[indices[i]]})")

# Plot feature importances
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), names, rotation=90)
plt.show()

Model training and Evalution

In [10]:

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
rmse_list = []
mae_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 232.6730
- Mean Absolute Error: 72.4940
- R2 Score: 0.1129
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 195.5849
- Mean Absolute Error: 68.0458
- R2 Score: 0.1353


Lasso
Model performance for Training set
- Root Mean Squared Error: 235.3121
- Mean Absolute Error: 73.5811
- R2 Score: 0.0927
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 196.8902
- Mean Absolute Error: 68.8533
- R2 Score: 0.1237


Ridge
Model performance for Training set
- Root Mean Squared Error: 232.6843
- Mean Absolute Error: 72.4864
- R2 Score: 0.1128
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 195.4321
- Mean Absolute Error: 67.9529
- R2 Score: 0.1366


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 12.8167
- Mean Absolute Error: 0.4551
- R2 Score: 0.9973
------------

From above the result, the r2 score is very low .The performance of the models is also  good. In most of the  model, overftting occurs.The outliers has not removed from the data In test step we will remove the outliers and check the result.

Results before removing the outliers

In [12]:
df1 = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

In [13]:
df1['outlier_status'] = 'Not Removed'
df1

Unnamed: 0,Model Name,R2_Score,outlier_status
6,CatBoosting Regressor,0.164796,Not Removed
2,Ridge,0.136636,Not Removed
0,Linear Regression,0.135286,Not Removed
1,Lasso,0.123705,Not Removed
4,Random Forest Regressor,0.103702,Not Removed
5,XGBRegressor,0.07201,Not Removed
3,Decision Tree,-0.597475,Not Removed
7,AdaBoost Regressor,-0.670707,Not Removed


To remove the outliers

In [14]:
# calculate the IQR for each column
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# identify any rows with outliers
outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))

# remove the rows with outliers
df = df[~outliers.any(axis=1)]

  outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))


Model retrainin and Evaluation after removing the outliers

In [37]:
X= df.drop(columns=['price'],axis=1)
y = df['price']
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)
X = preprocessor.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((18542, 232), (4636, 232))

In [16]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [17]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 47.0467
- Mean Absolute Error: 34.7948
- R2 Score: 0.5151
Model performance for Test set
- Root Mean Squared Error: 47.2907
- Mean Absolute Error: 35.2711
- R2 Score: 0.5035


Lasso
Model performance for Training set
- Root Mean Squared Error: 50.3997
- Mean Absolute Error: 37.7328
- R2 Score: 0.4436
Model performance for Test set
- Root Mean Squared Error: 50.3057
- Mean Absolute Error: 37.8877
- R2 Score: 0.4382


Ridge
Model performance for Training set
- Root Mean Squared Error: 47.0661
- Mean Absolute Error: 34.8106
- R2 Score: 0.5147
Model performance for Test set
- Root Mean Squared Error: 47.1919
- Mean Absolute Error: 35.2179
- R2 Score: 0.5056


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
Model performance for Test set
- Root Mean Squared Error: 67.2792
- Mean Absolute Error: 48.7435
- R2 Score: -0.0048


Rando

In [18]:
df2  = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)
df2

Unnamed: 0,Model Name,R2_Score
6,CatBoosting Regressor,0.526137
5,XGBRegressor,0.508542
2,Ridge,0.505606
0,Linear Regression,0.503534
4,Random Forest Regressor,0.480233
1,Lasso,0.438213
7,AdaBoost Regressor,0.29107
3,Decision Tree,-0.004844


In [19]:
df2['outlier_status'] = 'after removing outliers'
df2

Unnamed: 0,Model Name,R2_Score,outlier_status
6,CatBoosting Regressor,0.526137,after removing outliers
5,XGBRegressor,0.508542,after removing outliers
2,Ridge,0.505606,after removing outliers
0,Linear Regression,0.503534,after removing outliers
4,Random Forest Regressor,0.480233,after removing outliers
1,Lasso,0.438213,after removing outliers
7,AdaBoost Regressor,0.29107,after removing outliers
3,Decision Tree,-0.004844,after removing outliers


After removal the outliers , r2 score increases as compare to previous case . Even the model perfomance is not good. Among all the models, CatBoosting Regressor has performned the best model.

In [20]:
df_combined = pd.concat([df1, df2])

# Print the combined dataframe
df_combined

Unnamed: 0,Model Name,R2_Score,outlier_status
6,CatBoosting Regressor,0.164796,Not Removed
2,Ridge,0.136636,Not Removed
0,Linear Regression,0.135286,Not Removed
1,Lasso,0.123705,Not Removed
4,Random Forest Regressor,0.103702,Not Removed
5,XGBRegressor,0.07201,Not Removed
3,Decision Tree,-0.597475,Not Removed
7,AdaBoost Regressor,-0.670707,Not Removed
6,CatBoosting Regressor,0.526137,after removing outliers
5,XGBRegressor,0.508542,after removing outliers


The performance of model is very  good after the removal the outliers.

Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
def evaluate_models(X_train, y_train, X_test, y_test, models, param):

  report = {}

  for i in range(len(list(models))):
      model = list(models.values())[i]
      para = param[list(models.keys())[i]]


      gs = GridSearchCV(model,para,cv=5)
      gs.fit(X_train,y_train)

      model.set_params(**gs.best_params_)
      model.fit(X_train,y_train)

      y_train_pred = model.predict(X_train)
      y_test_pred = model.predict(X_test)

      train_model_score = r2_score(y_train,y_train_pred)
      test_model_score = r2_score(y_test,y_test_pred)

      report[list(models.keys())[i]] = test_model_score

  return report

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

models = {
            "Random Forest": RandomForestRegressor(),
            "Decision Tree": DecisionTreeRegressor(),
            "Gradient Boosting": GradientBoostingRegressor(),
            "Linear Regression": LinearRegression(),
            "XGBRegressor": XGBRegressor(),
            "CatBoosting Regressor": CatBoostRegressor(verbose=False),
            "AdaBoost Regressor": AdaBoostRegressor(),
            }
params={
    "Decision Tree": {
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2'],
    },
    "Random Forest":{
        # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
      
        # 'max_features':['sqrt','log2',None],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Gradient Boosting":{
        # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate':[.1,.01,.05,.001],
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        # 'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Linear Regression":{},
    "XGBRegressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128,256]
    },
    "CatBoosting Regressor":{
        'depth': [6,8,10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor":{
        'learning_rate':[.1,.01,0.5,.001],
        # 'loss':['linear','square','exponential'],
        'n_estimators': [8,16,32,64,128,256]
    }
    
}

model_report:dict=evaluate_models(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,
                                  models=models,param=params)

## To get best model score from dict
best_model_score = max(sorted(model_report.values()))

## To get best model name from dict

best_model_name = list(model_report.keys())[
    list(model_report.values()).index(best_model_score)
]
best_model = models[best_model_name]
predicted=best_model.predict(X_test)
r2_square = r2_score(y_test, predicted)

Finding most important features using randomforest regressor model

In [None]:

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on your data
rf_model.fit(X, y)

# Get the feature importances
importances = pd.Series(rf_model.feature_importances_, index=X.columns)

# Sort the features by importance
sorted_importances = importances.sort_values(ascending=False)

# Print the top 10 features by importance
print(sorted_importances.head(10))

In [42]:
# Create a dataframe with the feature importances
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})

# Sort the features by importance
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# Print the top 10 features by importance
print(feature_importances.head(10))

AttributeError: ignored