In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load dataset
# A relative path is resolved against the notebook's current working directory.
file_path = "Airbnb_Data.xlsx"  # Update with your file path
df = pd.read_excel(file_path)

# Clean column names: trim surrounding whitespace and turn inner spaces into
# underscores (e.g. "Actual Price" -> "Actual_Price").
# Header cells that are not text (numbers, dates) come back from read_excel as
# non-string labels with no .strip(); leave those untouched instead of letting
# the whole cell fail with AttributeError.
df.columns = [
    col.strip().replace(" ", "_") if isinstance(col, str) else col
    for col in df.columns
]

# Modelling inputs: the column to predict and the predictors used for it.
target = 'Actual_Price'
features = [
    'property_type', 'room_type', 'accommodates', 'bathrooms',
    'bed_type', 'cleaning_fee', 'city', 'number_of_reviews',
    'review_scores_rating', 'bedrooms', 'beds',
]

# Keep only the modelling columns, then discard every row that has a missing
# value in any of them. Note that this rebinds `df`, so the full frame loaded
# from the workbook is no longer reachable through this name afterwards.
selected_cols = features + [target]
df = df[selected_cols].dropna()

X, y = df[features], df[target]

# Split the predictors by kind: categorical columns are listed explicitly and
# every other feature is treated as numerical (kept in `features` order).
cat_cols = ['property_type', 'room_type', 'bed_type', 'city']
num_cols = [name for name in features if name not in cat_cols]

# Preprocessor: one-hot encode the categorical columns. With
# handle_unknown='ignore', a category not seen during fit does not raise at
# transform time. Every column not listed in cat_cols is forwarded unchanged
# (remainder='passthrough').
encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', encoder, cat_cols)],
    remainder='passthrough',
)

# Pipeline: preprocessing followed by a linear regression, so both steps are
# fitted and applied as a single estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Train-test split: hold out 20% of the rows for evaluation. The fixed
# random_state keeps the shuffle identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the whole pipeline on the training rows, then predict for the held-out
# rows. fit() returns the pipeline itself, so `model` is left fitted.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluate on the held-out rows.
# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as the target; R² is the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# One metric per output line.
print(f"RMSE: {rmse}", f"R² Score: {r2}", sep="\n")


RMSE: 98.23040976384692
R² Score: 0.48485074622102986
