In [22]:
import pandas as pd

In [23]:
# Read the housing table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Uttarahalli,1440.0,2.0,3.0,62.0,3
2,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
3,Kothanur,1200.0,2.0,1.0,51.0,2
4,Whitefield,1170.0,2.0,1.0,38.0,2


In [24]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any(axis=0)

location      False
total_sqft    False
bath          False
balcony       False
price         False
BHK           False
dtype: bool

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
import pandas as pd

from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

# Feature matrix and target for the model comparison. `X` and `y` were not
# defined anywhere in the notebook, so this cell raised NameError on a fresh
# kernel. The target mirrors the train/test split used for the linear model
# ('price'); every other column is a feature.
# The tree ensembles below need numeric input, so 'location' is replaced by its
# integer category code. The codes are arbitrary labels: adequate for
# tree-based models, not meaningful for linear ones.
X = df.drop(columns=["price"]).assign(
    location=lambda frame: frame["location"].astype("category").cat.codes
)
y = df["price"]

# Candidate regressors, seeded so repeated runs give the same scores.
models = {
    "Random Forest": RandomForestRegressor(
        min_samples_leaf=5, random_state=0
    ),
    "Hist Gradient Boosting": HistGradientBoostingRegressor(
        max_leaf_nodes=15, random_state=0, early_stopping=False
    ),
}
# One hyper-parameter per model: the number of trees / boosting iterations.
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100]},
    "Hist Gradient Boosting": {"max_iter": [10, 20, 50, 100, 300, 500]},
}
# Shuffled 4-fold split with a fixed seed, shared by both searches so the
# models are scored on the same folds.
cv = KFold(n_splits=4, shuffle=True, random_state=0)

# Run one grid search per model and keep the full cv_results_ table
# (train scores included) for later inspection.
results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_)}
    results.append(result)

In [26]:
from sklearn.compose import ColumnTransformer
# Treat 'location' as the categorical feature (the other columns shown by
# df.head() are numeric).
categorical_features = ['location']

# One-hot encode the categorical columns and pass every remaining column
# through unchanged. handle_unknown='ignore' makes a location that was not seen
# during fit (e.g. one that lands only in the held-out rows) encode as all
# zeros instead of raising an error at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [30]:
# Chain the column preprocessing with a linear regressor so both are fitted
# and applied as a single estimator.
clf = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", LinearRegression(n_jobs=1, copy_X=True, fit_intercept=True)),
    ]
)

In [31]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. 'price' is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("price", axis=1),
    df["price"],
    test_size=0.2,
    random_state=10,
)

In [32]:
# Fit the preprocessing step and the regressor on the training split.
clf.fit(X=X_train, y=y_train)

In [33]:
# Predicted prices for the held-out rows.
clf.predict(X=X_test)

array([ 82.72600586, 121.21960241,  57.93036569, ..., 167.12968557,
        73.74690285,  40.57556325])

In [34]:
# Score of the fitted pipeline on the held-out split (R^2 for a regressor).
clf.score(X=X_test, y=y_test)

0.7829849447733207