In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('default', category=FutureWarning)

# Read the insurance CSV (path is relative to the notebook's working directory)
# into a DataFrame, then leave it as the cell's final expression to preview it.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [7]:
# Display basic information about the dataset.
# A notebook cell only renders its final expression, so every intermediate
# summary is passed to display() explicitly; as bare expressions their results
# would be computed and then silently discarded.
df.info()  # number of entries, columns, non-null counts, dtypes (prints directly)
display(df.describe())  # count, mean, std, min, 25%, 50%, 75%, max
display(df.head())  # first 5 rows
display(df.tail())  # last 5 rows
display(df.columns)  # column names
display(df.isnull().sum())  # missing values in each column
df.nunique()  # number of unique values in each column (rendered as the cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [None]:
# --- Train-test split ---
# Features are every column except the target 'charges'. get_dummies one-hot
# encodes the categorical (object) columns; drop_first=True keeps k-1 indicator
# columns per k-level category so they are not redundant with the intercept.
X = pd.get_dummies(df.drop(columns='charges'), drop_first=True)
y = df['charges']

# Hold out 20% of the rows for testing. Fixing random_state makes the split
# reproducible: re-running the cell yields the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model training and evaluation ---
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')

# --- Checking if the model is overfitting or underfitting ---
# A regressor's .score() is R^2 of its own predictions, so the test-set value
# equals r2 computed above; reuse it instead of predicting on X_test again.
test_r2 = r2
print(f'Test R2 Score: {test_r2}')
train_r2 = model.score(X_train, y_train)
print(f'Train R2 Score: {train_r2}')

# If train_r2 is significantly higher than test_r2, the model is overfitting.
# If both R2 scores are low, the model is underfitting.



Mean Absolute Error: 4181.1944737536505
Mean Squared Error: 33596915.85136146
R2 Score: 0.7835929767120723
Test R2 Score: 0.7835929767120723
Train R2 Score: 0.7417255854683333
