In [13]:
import numpy as np
import pandas as pd

In [15]:
df=pd.read_csv("insurance.csv")
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [19]:
df=pd.get_dummies(df,columns=["sex","smoker","region"])
df.head(5)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,True,False,False,True,False,False,False,True
1,18,33.77,1,1725.5523,False,True,True,False,False,False,True,False
2,28,33.0,3,4449.462,False,True,True,False,False,False,True,False
3,33,22.705,0,21984.47061,False,True,True,False,False,True,False,False
4,32,28.88,0,3866.8552,False,True,True,False,False,True,False,False


In [21]:
x=df.drop(columns="charges")
y=df["charges"]

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [25]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(x_train,y_train)

In [27]:
y_pred=lr.predict(x_test)

In [29]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7835929767120722

In [33]:
# RandomizedSearchCV
# It is a technique used in Machine Learning to find the best hyperparameters for a model.
# Instead of trying all possible combinations (like GridSearchCV does), it picks random combinations from the given parameter values and tests them.
# This makes it faster when the parameter space is large.
# Example: Testing random values of learning rate, number of trees, or alpha in regression.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Lasso
parameterdict={"alpha":[0.1,1.0,10.0,100.0]}
lasso=Lasso()
random_search=RandomizedSearchCV( lasso, parameterdict, n_iter=2, cv=3)
random_search.fit(x_train,y_train)

# it’s compulsory that the first argument is the estimator (model) i.e. lasso here in RandomizedSearchCV.

#n_iter=2:
# it Means how many random combinations of hyperparameters to try.
# Example: if your dictionary has 10 possible values, but n_iter=2, it will only pick 2 random ones (not all 10).
# This makes it faster than testing everything.
# 👉 So here it will randomly test 2 values of alpha from [0.1, 1.0, 10.0, 100.0].





# 🔹 cv=4
# It means 4-fold cross validation.
# Your training data is split into 4 equal parts (folds).
# For each hyperparameter setting:
# Train on 3 folds
# Test on the 1 remaining fold
# Repeat this 4 times (so each fold becomes the test once).
# Final score = average of all 4 test results.

# Say your training data has 100 samples:
# Split into 4 folds → each has 25 samples.
# Iteration 1 → Train on Fold1+Fold2+Fold3, Test on Fold4
# Iteration 2 → Train on Fold1+Fold2+Fold4, Test on Fold3
# Iteration 3 → Train on Fold1+Fold3+Fold4, Test on Fold2
# Iteration 4 → Train on Fold2+Fold3+Fold4, Test on Fold1
# 👉 So the model is trained 4 times and tested on every data point once.

# cv → iterations for cross-validation (per hyperparameter value)
# n_iter → how many hyperparameter values to test

# 📌 How cv works with train/test data
# Usually, you already split your dataset into:
# Training set (e.g., 80%)
# Test set (e.g., 20%)
# Then, cross-validation (cv) is applied only on the training set.
# The test set is kept aside until the very end (for final evaluation).
    
# In k-fold cross validation (cv = k):
# The dataset is split into k folds (parts).
# In each iteration:
# 1 fold = used for testing
# remaining k−1 folds = used for training

In [35]:
y_pred_new=random_search.predict(x_test)

In [37]:
r2_score(y_test,y_pred)

0.7835929767120722

In [51]:
# 🔹 GridSearchCV: 
#It is also a technique to find the best hyperparameters for a model
# It tries all possible combinations of the hyperparameters you provide in a dictionary and then evaluating them using cross-validation.
# Finally, it gives you the best parameters and the best model.

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
hyperparametersgrid={"alpha":[0.1,1.0,10.0,100.0]}
ridge=Ridge()
gridsearch=GridSearchCV(ridge,hyperparametersgrid,cv=4)
gridsearch.fit(x_train,y_train)

# GridSearchCV → tries all combinations of hyperparameters → so there’s no need for n_iter.
# RandomizedSearchCV → tries only a random subset of combinations → that’s why you must specify n_iter (how many random trials).

In [55]:
y_pred_grid=gridsearch.predict(x_test)
r2_score(y_test,y_pred_grid)

0.7834446266673822