<a href="https://colab.research.google.com/github/ramesitexp/DataScience_Zone/blob/main/DTRegeressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [3]:

# Reproducibility: one seeded generator supplies every random draw below.
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)

# Dataset dimensions.
n_samples = 2500
n_features = 10

# Column labels, in the same order as the columns of X.
feature_names = [
    "house_size",
    "bedrooms",
    "location_score",
    "age_of_house",
    "near_metro",
    "school_score",
    "crime_rate",
    "income_area",
    "parking_score",
    "noise_level",
]

# Base features: independent standard-normal draws (first use of rng).
X = rng.normal(loc=0, scale=1, size=(n_samples, n_features))

# Name-addressable views onto the columns of X, so the target formula reads
# in terms of features rather than positional indices.
col = {name: X[:, idx] for idx, name in enumerate(feature_names)}

# Additive Gaussian noise (second use of rng, drawn after X).
noise = rng.normal(loc=0, scale=35, size=n_samples)

# Non-linear ingredients of the target.
near_metro_flag = (col["near_metro"] > 0).astype(float)      # piecewise jump
size_x_location = col["house_size"] * col["location_score"]  # interaction

# Target = linear part + step + interaction + sinusoid + noise.
# "bedrooms" and "noise_level" do not enter the formula at all.
y = (
    80 * col["house_size"]
    + 30 * col["location_score"]
    - 40 * col["age_of_house"]
    + 25 * col["school_score"]
    - 35 * col["crime_rate"]
    + 15 * col["income_area"]
    + 60 * near_metro_flag
    + 50 * size_x_location
    + 30 * np.sin(col["parking_score"])
    + noise
)

# Assemble features and target into a single frame.
df = pd.DataFrame(data=X, columns=feature_names)
df["target_price"] = y


In [4]:
# Display the generated frame: the ten feature columns plus target_price.
df

Unnamed: 0,house_size,bedrooms,location_score,age_of_house,near_metro,school_score,crime_rate,income_area,parking_score,noise_level,target_price
0,0.304717,-1.039984,0.750451,0.940565,-1.951035,-1.302180,0.127840,-0.316243,-0.016801,-0.853044,-18.850396
1,0.879398,0.777792,0.066031,1.127241,0.467509,-0.859292,0.368751,-0.958883,0.878450,-0.049926,37.728072
2,-0.184862,-0.680930,1.222541,-0.154529,-0.428328,-0.352134,0.532309,0.365444,0.412733,0.430821,7.098640
3,2.141648,-0.406415,-0.512243,-0.813773,0.615979,1.128972,-0.113947,-0.840156,-0.824481,0.650593,182.811400
4,0.743254,0.543154,-0.665510,0.232161,0.116686,0.218689,0.871429,0.223596,0.678914,0.067579,111.376397
...,...,...,...,...,...,...,...,...,...,...,...
2495,0.450540,-0.878168,-0.349805,-0.967569,0.508339,-0.752156,-1.286733,-1.110775,-0.571088,-0.480301,159.746976
2496,0.251453,-0.161623,0.370311,0.450613,0.740500,-1.907825,-1.127223,0.554887,0.211412,0.976933,38.884940
2497,-1.544018,-0.475155,-0.296380,0.166564,0.035877,0.469078,-0.114654,1.372336,1.691209,-1.117897,1.504304
2498,0.045111,0.797994,0.585328,-0.407844,0.254865,-1.583458,-0.019142,-1.460304,0.507979,-1.115958,-6.497975


In [7]:
# Pull plain NumPy arrays out of the frame: feature matrix and target vector.
features = df[feature_names].values
target = df["target_price"].values

# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [8]:
# Hyper-parameters for the regressor: depth capped at 4, seeded for repeatability.
tree_params = {"max_depth": 4, "random_state": RANDOM_STATE}

decision_tree = DecisionTreeRegressor(**tree_params)
# Kept as the cell's final expression so the fitted estimator is displayed.
decision_tree.fit(X_train, y_train)

In [9]:
# Mean absolute error of the fitted tree's predictions on the held-out split.
mean_absolute_error(y_true=y_test, y_pred=decision_tree.predict(X_test))

68.87096644497011

In [10]:
# Coefficient of determination (R^2) of the fitted tree on the held-out split.
r2_score(y_true=y_test, y_pred=decision_tree.predict(X_test))


0.5327487877766979

In [11]:
from sklearn.tree import export_graphviz
import graphviz

# Serialize the fitted regression tree to a Graphviz .dot file, with nodes
# labelled by feature name, colour-filled and drawn with rounded corners.
# `class_names` is deliberately not passed: it names target classes and is
# only relevant for classification trees, so it does not apply to this
# DecisionTreeRegressor.
export_graphviz(
    decision_tree,
    out_file="treeregress.dot",
    feature_names=feature_names,
    filled=True,
    rounded=True,
)

In [12]:
from graphviz import Source

# Load the DOT description written by the export step.
with open("treeregress.dot") as dot_file:
    dot_graph = dot_file.read()

# Build the graph with PNG as its output format, then render it to disk.
# render() is the cell's final expression, so the output file path is displayed.
graph = Source(dot_graph, format="png")
graph.render("decision_treeregressor")

'decision_treeregressor.png'