In [1]:
import pandas as pd
path_of_data = r"search-ms:displayname=Search%20Results%20in%20Desktop&crumb=location:C%3A%5CUsers%5Cgauta%5COneDrive%5CDesktop\Practical Ml\housing.csv"
data = pd.read_csv("housing.csv")
data_frame = pd.DataFrame(data)

In [2]:
from sklearn.model_selection import train_test_split
train_ , test_ = train_test_split(data_frame , test_size=0.2, random_state=42)
X = train_.drop('median_house_value' , axis = 1)
Y = train_["median_house_value"]


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import issparse

# select numeric columns correctly
numerical_data = X.select_dtypes(include=['number'])
categorical_data = X.select_dtypes(include=['object'])

numerical_pipeline = Pipeline(
    [
        ("imputer" , SimpleImputer(strategy="median")),
        ("standard_scaler" , StandardScaler())
    ]
)
categorical_pipeline = Pipeline(
    [
        ("imputer" , SimpleImputer(strategy="most_frequent")),
        ("encoder" , OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocesser = ColumnTransformer(
    [
        ("numerical_pipeline" , numerical_pipeline , numerical_data.columns.tolist()),
        ("categorical_pipeline" , categorical_pipeline , categorical_data.columns.tolist())
    ]
)

X_transformed = preprocesser.fit_transform(X)


if issparse(X_transformed):
    X_transformed = X_transformed.toarray()

actual_df = pd.DataFrame(X_transformed, columns=preprocesser.get_feature_names_out(), index=X.index)

In [7]:
#model training on the transformed data
# Linear Regression
# Decision Tree Regressor
# Random Forest Regressor 
import numpy as np
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

linear_regression_model  = LinearRegression()
decision_tree_model = DecisionTreeRegressor()
random_forest_model = RandomForestRegressor()

# now going for training
linear_regression_model.fit(actual_df, Y)
decision_tree_model.fit(actual_df, Y)
random_forest_model.fit(actual_df, Y)

# now predicting on the training data
linear_regression_predictions = linear_regression_model.predict(actual_df)
decision_tree_model_predictions = decision_tree_model.predict(actual_df)
random_forest_model_predictions = random_forest_model.predict(actual_df)

# errors of the model: compute RMSE as sqrt of MSE (avoid using `squared=` in case sklearn version doesn't support it)
linear_mse = mean_squared_error(Y, linear_regression_predictions)
decision_tree_mse = mean_squared_error(Y, decision_tree_model_predictions)
random_forest_mse = mean_squared_error(Y, random_forest_model_predictions)

linear_regression_model_mser = np.sqrt(linear_mse)
decision_tree_model_mser = np.sqrt(decision_tree_mse)
random_forest_model_mser = np.sqrt(random_forest_mse)

print(f"the rmseerror of linear regression model is {linear_regression_model_mser}")
print(f"the rmseerror of decision tree model is {decision_tree_model_mser}")
print(f"the rmseerror of random forest model is {random_forest_model_mser}")



the rmseerror of linear regression model is 68433.93736666226
the rmseerror of decision tree model is 0.0
the rmseerror of random forest model is 18267.06196445952


In [8]:
#applying cross validation on decision tree and random forest 
from sklearn.model_selection import cross_val_score
decision_tree_model_cv_scores = cross_val_score(
    decision_tree_model,
    actual_df,
    Y,
    scoring="neg_mean_squared_error",
    cv=5
)
random_forest_model_cv_scores = cross_val_score(
    random_forest_model,
    actual_df,
    Y,
    scoring="neg_mean_squared_error",
    cv=5
)
print(f"the cross validated scores of decision tree model are {-decision_tree_model_cv_scores}")

the cross validated scores of decision tree model are [5.03209534e+09 4.65862308e+09 4.76593824e+09 4.63276559e+09
 4.91800528e+09]
