In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Uttarahalli,1440.0,2.0,3.0,62.0,3
2,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
3,Kothanur,1200.0,2.0,1.0,51.0,2
4,Whitefield,1170.0,2.0,1.0,38.0,2


In [3]:
X = df.copy()
X = X.drop(columns=['price'])
y = df.price

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_validate

In [5]:
categorical_features = ['location']

# Create a ColumnTransformer to apply OneHotEncoder to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)


In [6]:
X

Unnamed: 0,location,total_sqft,bath,balcony,BHK
0,Electronic City Phase II,1056.0,2.0,1.0,2
1,Uttarahalli,1440.0,2.0,3.0,3
2,Lingadheeranahalli,1521.0,3.0,1.0,3
3,Kothanur,1200.0,2.0,1.0,2
4,Whitefield,1170.0,2.0,1.0,2
...,...,...,...,...,...
10693,Green Glen Layout,1715.0,3.0,3.0,3
10694,Whitefield,3453.0,4.0,0.0,5
10695,Raja Rajeshwari Nagar,1141.0,2.0,1.0,2
10696,Padmanabhanagar,4689.0,4.0,1.0,4


In [7]:
X_trans = pd.DataFrame(preprocessor.fit_transform(X).toarray())
X_trans


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1056.0,2.0,1.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1440.0,2.0,3.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1521.0,3.0,1.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,2.0,1.0,2.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1170.0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1715.0,3.0,3.0,3.0
10694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3453.0,4.0,0.0,5.0
10695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1141.0,2.0,1.0,2.0
10696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4689.0,4.0,1.0,4.0


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_trans,y,random_state=10,test_size=.25)

In [9]:
models = {
    "Random Forest": RandomForestRegressor(
        min_samples_leaf=5, random_state=0
    ),
    "Linear Regression":LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100]},
    "Linear Regression": {"fit_intercept": [True,False],
                         "positive":[True,False]
                         },
    "Decision Tree":{"criterion":["squared_error","friedman_mse","absolute_error"],
                     "splitter":['best','random']
    }
}
results = []

for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=4
    ).fit(X_train, y_train)
    
    # Alternatively, you can use cross_validate for more control
    cv_results = cross_validate(model, X_train, y_train, cv=4, return_train_score=True)
    
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_)}
    results.append(result)

In [10]:
for result in results:
    print(result["model"])
    print(result["cv_results"]["mean_train_score"])
    print("-----------------------")


Random Forest
0    0.802144
1    0.806541
2    0.806446
3    0.807920
Name: mean_train_score, dtype: float64
-----------------------
Linear Regression
0    0.837798
1    0.839188
2    0.799321
3    0.839188
Name: mean_train_score, dtype: float64
-----------------------
Decision Tree
0    0.991102
1    0.991102
2    0.991102
3    0.991102
4    0.990315
5    0.990315
Name: mean_train_score, dtype: float64
-----------------------
