In [17]:
# Step 1: Import the required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [9]:
# Step 2: Load the dataset into a pandas dataframe
# The file carries a header-less leading column (apparently a row index written
# out by an earlier to_csv), which a plain read_csv surfaces as "Unnamed: 0".
# Skip any such column at read time so it never leaks into the feature table;
# the default RangeIndex is kept, so positional alignment downstream is unchanged.
data = pd.read_csv(
    '../insurance.csv',
    usecols=lambda column_name: not column_name.startswith('Unnamed'),
)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:

# Step 3: Encode categorical features into numerical values using one-hot encoding
categorical_features = ['sex', 'smoker', 'region']

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old spelling (passing it raises TypeError). Try the current name
# first and fall back so the notebook also runs on older installs.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Dense 0/1 matrix with one column per (feature, category) pair
encoded_features = onehot_encoder.fit_transform(data[categorical_features])

# Carry over data's index so a column-wise concat with the numeric columns
# lines up row for row even when data does not have a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=onehot_encoder.get_feature_names_out(categorical_features),
    index=data.index,
)



In [11]:
# Build the feature matrix: numeric columns first, then the one-hot columns
numerical_features = ['age', 'bmi', 'children']
X = pd.concat(
    objs=(data.loc[:, numerical_features], encoded_df),
    axis='columns',
)

In [12]:
# Regression target: the insurance charges column
y = data.loc[:, 'charges']

# Step 4: Hold out 20% of the rows for testing; the fixed random_state keeps
# the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Step 5: Fit a linear regression model on the training split
lr_model = LinearRegression().fit(X_train, y_train)
lr_model  # echo the fitted estimator as the cell output, as the bare .fit() call did

In [14]:
# Step 6: Report how well the fitted model scores on the held-out test split
print(
    'R-squared score on the testing data:',
    lr_model.score(X=X_test, y=y_test),
)

R-squared score on the testing data: 0.7835929767120723


In [18]:
# Predict charges for the held-out rows, then show the mean squared error
pred = lr_model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred)

33596915.85136147

In [17]:
# Step 7: Score a single, previously unseen record with the trained model
new_data = pd.DataFrame({
    'age': [30], 'sex': ['male'], 'bmi': [25.5],
    'children': [2], 'smoker': ['no'], 'region': ['northeast'],
})

# Push the categorical columns through the already-fitted encoder (transform
# only, no refit) so the one-hot columns match the ones used for training
new_encoded_features = onehot_encoder.transform(new_data.loc[:, categorical_features])
new_encoded_df = pd.DataFrame(
    new_encoded_features,
    columns=onehot_encoder.get_feature_names_out(categorical_features),
)

# Same layout as the training matrix: numeric columns, then one-hot columns
new_X = pd.concat(
    objs=(new_data.loc[:, numerical_features], new_encoded_df),
    axis='columns',
)

# Predict and print the charges for the new record
predictions = lr_model.predict(new_X)
print('Predicted charges:', predictions)


Predicted charges: [5205.87807471]
