In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [4]:
# Load the raw Airbnb CSV. The file is decoded as Latin-1; per the pandas
# docs, low_memory=False makes the parser read the file without internal
# chunking, which avoids mixed-type column inference.
raw_csv_path = '../data/raw/airbnb.csv'
df = pd.read_csv(raw_csv_path, low_memory=False, encoding='latin1')


In [7]:
# Jupyter only renders the LAST expression of a cell, so a bare `df.head()`
# placed before `df.shape` is computed and silently thrown away.
# Print the shape explicitly, then end the cell on head() so the preview of
# the first rows is actually displayed.
print(df.shape)
df.head()


(279712, 33)

In [8]:
# Target variable: every row of the "price" column.
y = df.loc[:, "price"]


In [13]:
# Columns used as model inputs.
features = [
    "accommodates",
    "bedrooms",
    "room_type",
    "review_scores_rating",
]

# Take an explicit copy: X is assigned into later on (missing-value
# imputation), and writing into a bare selection of df is what triggers
# pandas' SettingWithCopyWarning. Copying here keeps df untouched and makes
# X an independent DataFrame from the moment it is created.
X = df[features].copy()


We intentionally restrict the model to a small set of features to reduce noise and complexity.


In [14]:
# Number of missing values in each feature column.
X.isna().sum()


accommodates                0
bedrooms                29435
room_type                   0
review_scores_rating    91405
dtype: int64

In [16]:
# Work on an explicit copy so the assignments below modify X itself rather
# than a selection of df.
X = df[features].copy()

# Fill the gaps in each numeric column with that column's own median.
# NOTE(review): the medians are computed over the full dataset, before the
# train/test split performed later in the notebook — confirm this is
# acceptable for the evaluation.
for numeric_col in ("bedrooms", "review_scores_rating"):
    X[numeric_col] = X[numeric_col].fillna(X[numeric_col].median())


In [17]:
# Fill any missing room_type with the most frequent category
# (the first entry returned by mode()).
most_common_room_type = X["room_type"].mode()[0]
X["room_type"] = X["room_type"].fillna(most_common_room_type)

In [20]:
# One-hot encode room_type as integer 0/1 columns; drop_first=True omits the
# first category so it acts as the implicit baseline.
X_encoded = pd.get_dummies(
    X,
    columns=["room_type"],
    dtype=int,
    drop_first=True,
)


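One-hot encoding converts each category into its own binary (0/1) column. Because `drop_first=True` is used, the first `room_type` category gets no column of its own: it is the baseline, represented by zeros in all the remaining `room_type_*` columns.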

In [21]:
# Preview the first five encoded rows.
X_encoded.head(n=5)


Unnamed: 0,accommodates,bedrooms,review_scores_rating,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,2,1.0,100.0,0,0,0
1,2,1.0,100.0,0,0,0
2,2,1.0,100.0,0,0,0
3,2,1.0,100.0,0,0,0
4,2,1.0,100.0,0,0,0


In [25]:
# Count missing values once per column and reuse the result for both reports.
missing_per_column = X_encoded.isna().sum()

# Grand total across the whole DataFrame
print(f"Total missing values: {missing_per_column.sum()}")

# Per-column breakdown, as a sanity check
print(missing_per_column)


Total missing values: 0
accommodates              0
bedrooms                  0
review_scores_rating      0
room_type_Hotel room      0
room_type_Private room    0
room_type_Shared room     0
dtype: int64


In [26]:
# Report the encoded feature matrix dimensions as (rows, columns).
encoded_shape = X_encoded.shape
print(f"Dataset Shape: {encoded_shape}")


Dataset Shape: (279712, 6)


In [27]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
split_parts = train_test_split(X_encoded, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


We train on 80% of the data and test on the remaining, unseen 20%.