<a href="https://colab.research.google.com/github/solar-node/ML/blob/main/InsuranceCostPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Data collection: read the insurance CSV from the mounted Drive and preview the first rows
dataset_path = "/content/drive/MyDrive/ML Projects/insurance.csv"
insurance_dataset = pd.read_csv(dataset_path)
insurance_dataset.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
insurance_dataset.shape

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes
insurance_dataset.info()

Categorical features:
- Sex/Gender
- Smoker
- Region

Target: Charges

In [None]:
# Count the missing values in every column
insurance_dataset.isna().sum()

### Data Analysis

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the dataset
insurance_dataset.describe()

In [None]:
# Distribution of age values
sns.set_theme()
fig, ax = plt.subplots(figsize=(8, 6))
# histplot(kde=True) replaces the deprecated distplot; stat='density' keeps the
# y-axis on a density scale, as distplot drew it.
sns.histplot(insurance_dataset['age'], kde=True, stat='density', ax=ax)

ax.set_title('Age distribution')
plt.show()

In [None]:
# Gender column
fig, ax = plt.subplots(figsize=(6, 6))
# Passing palette without hue is deprecated in seaborn, so the x variable is also
# assigned to hue; legend=False drops the legend, which would only repeat the x labels.
# NOTE(review): legend= on countplot needs a recent seaborn (0.13+) - confirm the installed version.
sns.countplot(x='sex', hue='sex', data=insurance_dataset, palette="viridis", legend=False, ax=ax)

ax.set_title('Gender distribution')
plt.show()

In [None]:
# Number of records for each value of the 'sex' column
insurance_dataset['sex'].value_counts()

In [None]:
# BMI distribution in dataset
fig, ax = plt.subplots(figsize=(8, 6))
# histplot(kde=True) replaces the deprecated distplot; stat='density' keeps the
# y-axis on a density scale, as distplot drew it.
sns.histplot(insurance_dataset['bmi'], kde=True, stat='density', ax=ax)

ax.set_title('BMI distribution')
plt.show()

Normal BMI range: 18.5 to 25

In [None]:
# Children column
fig, ax = plt.subplots(figsize=(6, 6))
# Passing palette without hue is deprecated in seaborn, so the x variable is also
# assigned to hue; legend=False drops the legend, which would only repeat the x labels.
# NOTE(review): legend= on countplot needs a recent seaborn (0.13+) - confirm the installed version.
sns.countplot(x='children', hue='children', data=insurance_dataset, palette="viridis", legend=False, ax=ax)

ax.set_title('children')
plt.show()

In [None]:
# Number of records for each value of the 'children' column
insurance_dataset['children'].value_counts()

In [None]:
# Smoker column
fig, ax = plt.subplots(figsize=(6, 6))
# Passing palette without hue is deprecated in seaborn, so the x variable is also
# assigned to hue; legend=False drops the legend, which would only repeat the x labels.
# NOTE(review): legend= on countplot needs a recent seaborn (0.13+) - confirm the installed version.
sns.countplot(x='smoker', hue='smoker', data=insurance_dataset, palette="viridis", legend=False, ax=ax)

ax.set_title('Smoker distribution')
plt.show()

In [None]:
# Number of records for each value of the 'smoker' column
insurance_dataset['smoker'].value_counts()

In [None]:
# Region column
fig, ax = plt.subplots(figsize=(6, 6))
# Passing palette without hue is deprecated in seaborn, so the x variable is also
# assigned to hue; legend=False drops the legend, which would only repeat the x labels.
# NOTE(review): legend= on countplot needs a recent seaborn (0.13+) - confirm the installed version.
sns.countplot(x='region', hue='region', data=insurance_dataset, palette="viridis", legend=False, ax=ax)

ax.set_title('Region distribution')
plt.show()

In [None]:
# Number of records for each value of the 'region' column
insurance_dataset['region'].value_counts()

In [None]:
# Columns with many distinct values (age, BMI, charges) are poorly served by a
# count plot, so a distribution plot is used for them instead.

# Charges distribution
fig, ax = plt.subplots(figsize=(8, 6))
# histplot(kde=True) replaces the deprecated distplot; stat='density' keeps the
# y-axis on a density scale, as distplot drew it.
sns.histplot(insurance_dataset['charges'], kde=True, stat='density', ax=ax)

ax.set_title('Charge distribution')
plt.show()

## Data Pre-processing


### Encoding the categorical features:


In [None]:

# Encoding the categorical columns (sex, smoker, region) as integer codes.
# The codes are kept in one mapping; the same codes must be used when building
# inputs for prediction.
# NOTE(review): region has no natural order, yet the 0-3 codes give it one in a
# linear model. One-hot encoding would avoid that, but it changes the feature
# layout expected by the prediction cell, so it is left as is here.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}

# One non-inplace replace over all three columns. Re-running the cell is a no-op
# because already-encoded values match none of the keys.
insurance_dataset = insurance_dataset.replace(category_codes)

# Cast explicitly so the encoded columns are integers without relying on how
# replace() infers the resulting dtype. A leftover unmapped label fails loudly here.
insurance_dataset = insurance_dataset.astype({column: 'int64' for column in category_codes})

insurance_dataset.head()


In [None]:
# Splitting the features and target
X = insurance_dataset.drop(columns='charges')  # X holds every column except the target
Y = insurance_dataset['charges']  # Y holds the target (charges)
X.head()


In [None]:
# Show the target values (charges)
print(Y)

In [31]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shapes of the full, training and testing sets: features on the first line, target on the second
for full_set, train_part, test_part in ((X, X_train, X_test), (Y, Y_train, Y_test)):
    print(full_set.shape, train_part.shape, test_part.shape)

## Model Training


In [33]:
# Model training: create the (not yet fitted) linear regression estimator
regressor = LinearRegression()

In [None]:
# Fit the regressor on the training data to estimate the regression line
regressor.fit(X_train,Y_train)

# The model is now trained; the next step is to evaluate it

### Model Evaluation

In [35]:
# Predict the charges for the rows the model was trained on
training_data_prediction = regressor.predict(X_train)


In [None]:
# R squared on the training data: the closer to 1, the better the fit
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
print('R Squared value : ', r2_train)

In [37]:
# Predict the charges for the held-out test rows
test_data_prediction = regressor.predict(X_test)

In [38]:
# R squared on the test data: the closer to 1, the better the model generalises
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)
print('R Squared value : ', r2_test)

R Squared value :  0.7447273869684076


## Building a prediction system

In [39]:
# for input
# 'male':0,'female':1
# 'yes':0,'no':1
# 'southeast':0,'southwest':1,'northeast':2,'northwest':3
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Feature values for a single prediction; they must follow the column order of the training features
input_data = (25, 0, 26.22, 0, 1, 2)

# Convert the tuple into a NumPy array
input_data_np = np.array(input_data)

input_data_np


In [None]:
# Turn the 1-D array into a single-row 2-D array (one sample, all features)
input_data_reshaped = input_data_np[np.newaxis, :]
input_data_reshaped

In [42]:
# Predicting from the model
# Wrap the row in a DataFrame carrying the training column names, so the model
# receives the same feature names it was fitted with instead of a bare array.
input_features = pd.DataFrame(input_data_reshaped, columns=X_train.columns)
prediction = regressor.predict(input_features)
print('The insurance cost is : ',prediction[0])

The insurance cost is :  2808.687798001414
