In [17]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
import numpy as np

# Load the data
data = pd.read_csv('combine_train.csv')

# Handle missing values
data.fillna(0, inplace=True)  # Replace missing values with zeros

# Remove rows with invalid values
data = data.replace([np.inf, -np.inf], np.nan).dropna()
# Preprocess feature names
data.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') for col in data.columns]
# Separate features and target variable
X = data.drop('target', axis=1)
y = data['target']

# Define the number of folds for cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

rmse_scores = []
mae_scores = []



In [13]:
data

Unnamed: 0,property_id,Dutch,English,French,Germany,property_type,property_room_type,property_max_guests,property_bathrooms,property_bedrooms,...,None,target,booking_availability_30,booking_availability_60,booking_availability_90,booking_availability_365,reviews_num,reviews_last,reviews_rating,booking_cancel_policy
0,1,False,False,True,False,0,1,1,1.0,1,...,False,33.0,7,37,67,96,23,2,97.000000,3
1,2,False,False,True,False,14,0,4,1.5,2,...,False,26.0,0,0,25,25,1,15,100.000000,3
2,3,True,False,False,False,8,0,2,1.0,1,...,False,75.0,20,34,55,330,4,5,93.000000,3
3,4,False,True,False,False,14,1,2,1.0,1,...,False,110.0,21,46,64,339,109,4,98.000000,3
4,5,False,True,False,False,14,1,2,1.0,1,...,False,60.0,23,50,80,355,61,14,97.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6490,6496,False,False,True,False,0,1,2,1.0,1,...,False,29.0,8,32,62,140,4,36,100.000000,3
6491,6497,False,False,True,False,14,1,2,1.0,1,...,False,30.0,6,36,61,249,0,17294,87.333333,3
6492,6498,False,False,True,False,0,1,2,1.0,1,...,False,58.0,0,0,0,0,0,17298,98.666667,2
6493,6499,False,True,False,False,14,1,4,1.0,1,...,False,52.0,0,0,0,0,9,211,100.000000,3


In [18]:
# Perform k-fold cross-validation
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Train the XGBoost model
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate RMSE and MAE on the validation set
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mae = mean_absolute_error(y_val, y_pred)
    
    rmse_scores.append(rmse)
    mae_scores.append(mae)

# Calculate average RMSE and MAE across all folds
avg_rmse = sum(rmse_scores) / k
avg_mae = sum(mae_scores) / k

print("Average RMSE:", avg_rmse)
print("Average MAE:", avg_mae)




Average RMSE: 55.59443667853368
Average MAE: 34.6442173510721


In [33]:
# Load the test data
test_data = pd.read_csv('combine_test.csv')
# Handle missing values in the test data
test_data.fillna(0, inplace=True)  # Replace missing values with zeros
# Convert "reviews_last" column to float
test_data['reviews_last'][0]='17294'
test_data['reviews_last'] = test_data['reviews_last'].astype(float)

test_data.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') for col in test_data.columns]
model = xgb.XGBRegressor()
model.fit(X, y)
# Predict on the test set
test_pred = model.predict(test_data)

# Print the predicted results
print("Predicted results for the test set:")
print(test_pred)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['reviews_last'][0]='17294'


Predicted results for the test set:
[ 82.065125  64.771126  84.75634   61.38403   67.500916  72.57406
  64.16381  129.08899   75.11385   51.003296  43.965496  76.429756
  59.547066  63.934284  63.999813  85.01979   40.854187  81.99744
  74.28145   80.704155 180.75227   75.14662   69.99477   52.91095
  92.04975   70.151726  58.45698   72.539955  56.858334  63.95432
 103.93684   73.62109   79.53566   59.38865   57.565304  63.520763
  46.196426  91.20096   40.181484  73.13452   87.25213   70.18882
  44.502876  74.469025  73.801056  62.057587  75.69484   45.942184
  56.106205  78.0451    85.485435  34.153893  93.3772   113.455574
  99.41835   50.39006   65.43707  109.39628   69.749825  83.9232
  43.91451   54.338974  83.631546 148.14047   64.44886   83.65338
  54.50225   59.866703  95.49886   96.987175  97.24482   59.7
  79.69051  111.34549   90.92634  101.44404   87.55576   84.711975
 105.06285  102.3778    86.48231  103.07244   97.19678  200.69978
 116.71546   88.60077   89.40096  109.81

In [39]:
predict=pd.DataFrame()
predict['ID']=test_data['property_id']
predict['PRE']=test_pred
predict.head(3)

Unnamed: 0,ID,PRE
0,6501,82.065125
1,6502,64.771126
2,6503,84.75634


In [40]:
predict.to_csv('predict.csv', index=False)