In [1]:
# Think of pandas as a super-smart Excel helper for Python
# It brings pandas into your program
# Pandas helps you: Read data (CSV, Excel, etc.), Work with tables (rows & columns), Clean, sort, filter, and analyze data
import pandas as pd
# Opens Salary_Data.csv, Reads all the data inside it, Converts it into a table format Python understands
dataset=pd.read_csv("insurance_pre.csv") # read_csv means “read a CSV file”
dataset # The data is stored inside a variable named dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
dataset=pd.get_dummies(dataset,drop_first=True)
dataset = dataset.astype(int) # forces all dummy columns to int
dataset
# pd.get_dummies() 
#This is a Pandas function used to convert categorical variables (like "red", "blue", "green") into dummy/indicator variables (also called one-hot encoding).
# Each category becomes a new column with values 0 or 1 indicating whether that category is present.

# dataset
# The original DataFrame you’re working with.
# After applying pd.get_dummies(), it transforms categorical columns into numeric dummy columns.

# drop_first=True
# Instead of creating a dummy column for every category, it drops the first category and uses the remaining ones.

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0
...,...,...,...,...,...,...
1333,50,30,3,10600,1,0
1334,18,31,0,2205,0,0
1335,18,36,0,1629,0,0
1336,21,25,0,2007,0,0


In [5]:
dataset.columns # is asking pandas to show you the column labels (names of all the fields) in your DataFrame dataset.

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
# This selects only one column from the table
# Double square brackets [[ ]] mean: “Give me this column as a table, not a single list”
# Stores that column in a variable called independent This is also called: Feature, Input (cause), X
dependent=dataset[['charges']] # Stores that column in a variable called dependent This is also called: Target, Output(effect), Label, y

In [7]:
from sklearn.model_selection import train_test_split # scikit-learn (sklearn) = ML toolkit 
# train_test_split = a machine that splits data # It is a function provided by scikit-learn
# Think of it as a divider that random separates data for: Learning and Testing 
x_train,x_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.20,random_state=0)
# independent (X) → Features
# dependent (y) → Target
# test_size = 0.20 # Means 20% of total data is used for testing # Remaining 80% is used for training
# random_state = 0 # random_state is a seed value that controls randomness.# shuffle in exact same order every time
# x_train -> Training model
# y_train -> Teaching model
# x_test -> Testing model
# y_test -> Comparing predictions

In [8]:
from sklearn.preprocessing import StandardScaler # Imports the StandardScaler class from scikit‑learn.
# This tool is used to standardize features (scale them so they have mean = 0 and standard deviation = 1).
sc=StandardScaler() # Creates a scaler object named sc. # Think of it as a “machine” that learns how to scale your data.
x_train=sc.fit_transform(x_train)
# Fit → calculates the mean and standard deviation of each feature in your training data.
# Transform → uses those values to scale the training data.
x_test=sc.transform(x_test) # you only transform the test set, not fit it again.

In [9]:
x_train

array([[-0.14853305,  0.60616004,  2.41394802,  1.01506676,  1.97125461],
       [-1.49780784,  0.60616004, -0.89219519,  1.01506676, -0.50729114],
       [-1.14273553,  0.92836135,  0.76087642, -0.98515688,  1.97125461],
       ...,
       [ 0.06451033, -0.84374587, -0.89219519,  1.01506676, -0.50729114],
       [-1.42679338,  0.7672607 , -0.89219519,  1.01506676, -0.50729114],
       [-0.4325909 , -1.97145047, -0.06565939, -0.98515688, -0.50729114]])

In [110]:
from sklearn.ensemble import RandomForestRegressor # Imports the regression version of Random Forest from scikit‑learn.
# Random Forest is an ensemble method: it builds many decision trees and averages their predictions.

# regressor = RandomForestRegressor(n_estimators = 10, random_state = 0) # Creates the Random Forest model object.
# n_estimators=50 → Builds 50 decision trees in the forest. More trees generally improve accuracy but increase computation time.
# random_state=0 → Ensures reproducibility. Running the code again will give the same result.

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# criterion{“squared_error”, “absolute_error”, “friedman_mse”, “poisson”}, default=”squared_error”

regressor = RandomForestRegressor(n_estimators = 100, random_state = 0, criterion = 'absolute_error', max_features = 'log2') 

# regressor = RandomForestRegressor(n_estimators = 10, random_state = 0, criterion = 'absolute_error') 

regressor.fit(x_train, y_train) # Trains the Random Forest on your training data.
# Each tree is trained on a random subset of the data and features.
# The final prediction is the average of all trees’ predictions (for regression).

  return fit_method(estimator, *args, **kwargs)


In [111]:
y_pred=regressor.predict(x_test) # .predict() uses the trained model with x_test # x_test was never seen during training

In [112]:
from sklearn.metrics import r2_score # model evaluation step
r_score=r2_score(y_test,y_pred) # R² Score (Coefficient of Determination)
# y_test → Actual 
# y_pred → Predicted

In [113]:
r_score

0.8855523620042153

In [114]:
import pickle # pickle is a Python library, Used to save objects to a file, And load them back later
filename="finalized_model_random_forest.sav" # This is just a file name .sav means saved model 
#You could also use: .pkl , .pickle (File extension doesn’t matter to Python — it’s for humans)

In [115]:
pickle.dump(regressor,open(filename,'wb')) # dump() serializes a Python object and writes it to a file
# pickle.dump(object, file) # object → The Python object to save (regressor) # file → The file handle opened in write-binary mode ('wb')
# Here, the object = regressor (your trained Linear Regression model)

In [119]:
loaded_model=pickle.load(open("finalized_model_random_forest.sav",'rb')) 
# Opens the saved file ('rb' = read binary)
# Loads the trained model into Python
# loaded_model now behaves exactly like the original regressor
# No need to retrain!
# result=loaded_model.predict([[1234,345,4565,1,0]]) # Making a Prediction # Returns predicted salary in result

new_input = pd.DataFrame([[18,25,0,1,0]], columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'])
result = loaded_model.predict(new_input)



In [120]:
result

array([15782.64])