In [246]:
# Think of pandas as a super-smart Excel helper for Python
# It brings pandas into your program
# Pandas helps you: Read data (CSV, Excel, etc.), Work with tables (rows & columns), Clean, sort, filter, and analyze data
import pandas as pd
# Opens Salary_Data.csv, Reads all the data inside it, Converts it into a table format Python understands
dataset=pd.read_csv("insurance_pre.csv") # read_csv means “read a CSV file”
dataset # The data is stored inside a variable named dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [247]:
dataset=pd.get_dummies(dataset,drop_first=True)
dataset = dataset.astype(int) # forces all dummy columns to int
dataset
# pd.get_dummies() 
#This is a Pandas function used to convert categorical variables (like "red", "blue", "green") into dummy/indicator variables (also called one-hot encoding).
# Each category becomes a new column with values 0 or 1 indicating whether that category is present.

# dataset
# The original DataFrame you’re working with.
# After applying pd.get_dummies(), it transforms categorical columns into numeric dummy columns.

# drop_first=True
# Instead of creating a dummy column for every category, it drops the first category and uses the remaining ones.

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0
...,...,...,...,...,...,...
1333,50,30,3,10600,1,0
1334,18,31,0,2205,0,0
1335,18,36,0,1629,0,0
1336,21,25,0,2007,0,0


In [248]:
dataset.columns # is asking pandas to show you the column labels (names of all the fields) in your DataFrame dataset.

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [249]:
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
# This selects only one column from the table
# Double square brackets [[ ]] mean: “Give me this column as a table, not a single list”
# Stores that column in a variable called independent This is also called: Feature, Input (cause), X
dependent=dataset[['charges']] # Stores that column in a variable called dependent This is also called: Target, Output(effect), Label, y

In [250]:
from sklearn.model_selection import train_test_split # scikit-learn (sklearn) = ML toolkit 
# train_test_split = a machine that splits data # It is a function provided by scikit-learn
# Think of it as a divider that random separates data for: Learning and Testing 
x_train,x_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.20,random_state=0)
# independent (X) → Features
# dependent (y) → Target
# test_size = 0.20 # Means 20% of total data is used for testing # Remaining 80% is used for training
# random_state = 0 # random_state is a seed value that controls randomness.# shuffle in exact same order every time
# x_train -> Training model
# y_train -> Teaching model
# x_test -> Testing model
# y_test -> Comparing predictions

In [251]:
from sklearn.preprocessing import StandardScaler # Imports the StandardScaler class from scikit‑learn.
# This tool is used to standardize features (scale them so they have mean = 0 and standard deviation = 1).
sc=StandardScaler() # Creates a scaler object named sc. # Think of it as a “machine” that learns how to scale your data.
x_train=sc.fit_transform(x_train)
# Fit → calculates the mean and standard deviation of each feature in your training data.
# Transform → uses those values to scale the training data.
x_test=sc.transform(x_test) # you only transform the test set, not fit it again.

In [252]:
x_train

array([[-0.14853305,  0.60616004,  2.41394802,  1.01506676,  1.97125461],
       [-1.49780784,  0.60616004, -0.89219519,  1.01506676, -0.50729114],
       [-1.14273553,  0.92836135,  0.76087642, -0.98515688,  1.97125461],
       ...,
       [ 0.06451033, -0.84374587, -0.89219519,  1.01506676, -0.50729114],
       [-1.42679338,  0.7672607 , -0.89219519,  1.01506676, -0.50729114],
       [-0.4325909 , -1.97145047, -0.06565939, -0.98515688, -0.50729114]])

In [353]:
from sklearn.tree import DecisionTreeRegressor # Imports the regression version of decision trees from scikit‑learn.

# Below creates a decision tree model object.
# regressor = DecisionTreeRegressor(criterion='friedman_mse', splitter='best')   # Friedman MSE: variant of mean squared error, splits chosen randomly
# regressor = DecisionTreeRegressor(criterion='squared_error', splitter='random')    # Squared error: standard MSE, splits chosen for best reduction
# regressor = DecisionTreeRegressor(criterion='absolute_error', splitter='best') # Absolute error: minimizes mean absolute error, splits chosen randomly
# regressor = DecisionTreeRegressor(criterion='poisson', splitter='best')        # Poisson: suited for count data regression, splits chosen randomly

# regressor = DecisionTreeRegressor(random_state=0, criterion='poisson', splitter='random',max_features = 'log2')

regressor = DecisionTreeRegressor(random_state=0, criterion='squared_error', splitter='random')

# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
# criterion{“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”squared_error”
# splitter{“best”, “random”}, default=”best”

regressor=regressor.fit(x_train,y_train)

In [354]:
y_pred=regressor.predict(x_test) # .predict() uses the trained model with x_test # x_test was never seen during training

In [355]:
from sklearn.metrics import r2_score # model evaluation step
r_score=r2_score(y_test,y_pred) # R² Score (Coefficient of Determination)
# y_test → Actual 
# y_pred → Predicted

In [356]:
r_score

0.6479330270425051