In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import missingno as mn
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split

# Predicting medical insurance charges
[Dataset](https://www.kaggle.com/mirichoi0218/insurance)

Features used:
1. Age
2. Sex (converted to binary)
3. Body mass index (BMI)
4. Number of children
5. Smoker status (converted to binary)

Model : `Multivariate linear regression`

In [None]:
# Read the insurance dataset from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/insurance/insurance.csv'
data = pd.read_csv(csv_path)

# Preview the first ten rows.
data.head(n=10)

## Checking if we need to clean data

In [None]:
# Visual check for missing values: draw missingno's matrix plot of the whole
# frame and render it.
mn.matrix(data)
plt.show()

## Converting sex data and smoker data to binary

In [None]:
# Encode the two binary categorical columns as 0/1 integers:
#   smoker: "yes"    -> 1, anything else -> 0
#   sex:    "female" -> 1, anything else -> 0
# `isin` also accepts the already-encoded value 1, so re-running this cell
# leaves the columns unchanged instead of silently resetting every row to 0
# (which is what a plain `== "yes"` comparison does on the second run).
data['smoker'] = data['smoker'].isin(["yes", 1]).astype(int)
data['sex'] = data['sex'].isin(["female", 1]).astype(int)

# Show a preview rather than dumping the entire frame.
data.head(10)

## Plotting each feature against the label (`charges`)

In [None]:
# Figure-wide settings. NOTE: rcParams are global, so the DPI and font size
# set here also apply to every figure drawn by later cells.
mpl.rcParams['figure.dpi'] = 300
plt.rcParams.update({'font.size': 5})

# Fixed seed so the per-point colours (purely decorative) are reproducible.
np.random.seed(19680801)
colors = np.random.rand(len(data))

# (column in `data`, x-axis label) for each panel, in display order.
feature_panels = [
    ('age', 'Age'),
    ('sex', 'sex(1 is female)'),
    ('bmi', 'BMI'),
    ('children', 'No of children'),
    ('smoker', 'smoker?'),
]

# One scatter panel per feature on a 3x2 grid (the sixth slot stays empty).
plt.figure(figsize=(5, 4))
for panel_no, (column, label) in enumerate(feature_panels, start=1):
    plt.subplot(3, 2, panel_no)
    plt.scatter(data[column], data['charges'], s=5, c=colors, alpha=0.5)
    plt.xlabel(label)
    plt.ylabel('charges')  # every panel shares the same target on the y-axis
plt.tight_layout()
plt.show()

In [None]:
# Build the design matrix from the five model features, selected by name so
# the result does not depend on the column order of the CSV.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker']
features = data[feature_columns].to_numpy(dtype=float)

# Regression target.
y = np.array(data['charges'])

# Prepend a column of ones for the bias/intercept term: column 0 is the bias,
# columns 1..5 follow the order of `feature_columns`.
X = np.hstack([np.ones((features.shape[0], 1)), features])

# Gradient-descent starting point (all-zero column vector, one weight per
# column of X) and learning rate.
theta = np.zeros((X.shape[1], 1))
alpha = 0.01

## Feature normalization so that gradient descent converges faster

In [None]:
# Per-column mean and standard deviation of the design matrix.
mu = X.mean(axis=0)
sigma = X.std(axis=0)

# Standardise columns 1 and 3 in place (zero mean, unit variance).
# NOTE(review): with the bias in column 0 these are presumably age and BMI --
# confirm against the column order used to build X.
for col in (1, 3):
    X[:, col] = (X[:, col] - mu[col]) / sigma[col]

## Splitting into train and test datasets

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

# Number of training examples.
m = len(X_train)

# Turn the 1-D target arrays into column vectors of shape (n, 1).
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

## Manual gradient descent

In [None]:
# Batch gradient descent on the squared-error cost
#   J(theta) = sum((X.theta - y)^2) / (2m)
n_iterations = 3000

# Start from zeros inside this cell so that re-running it reproduces the same
# fit instead of continuing from whatever theta the previous run left behind.
theta = np.zeros((X_train.shape[1], 1))
J_values = np.zeros(n_iterations)

for i in range(n_iterations):
    residual = X_train.dot(theta) - y_train          # prediction errors, one row per example
    J_values[i] = np.sum(residual ** 2) / (2 * m)    # cost before this update
    theta = theta - (alpha / m) * X_train.T.dot(residual)

# Cost per iteration: shows whether (and how quickly) the descent converged.
plt.plot(range(n_iterations), J_values)
plt.xlabel('iteration')
plt.ylabel('Cost function')
plt.show()

In [None]:
# Predict the held-out rows with the gradient-descent parameters.
y_predicted = X_test @ theta

# Root-mean-squared error between predictions and true values.
squared_errors = (y_predicted - y_test) ** 2
rmse = np.sqrt(squared_errors.sum() / len(y_predicted))

print(theta)
rmse

In [None]:
# Show up to the first 20 gradient-descent predictions next to the true
# values. Slicing + zip cannot raise IndexError when the test set has fewer
# than 20 rows, unlike indexing with a hard-coded range.
for predicted, actual in zip(y_predicted[:20], y_test[:20]):
    print(f"{predicted} {actual}")

## Regression using scikit-learn's built-in `LinearRegression`

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# X_train already carries an explicit bias column of ones, so tell
# scikit-learn not to fit a second, separate intercept. With the default
# fit_intercept=True the intercept ends up in `regr.intercept_` (which is not
# printed) and the bias-column weight in `coef_` is uninformative, so the
# printed coefficients cannot be compared with the hand-fitted `theta`.
regr = LinearRegression(fit_intercept=False)
regr.fit(X_train, y_train)

# Overwrites the gradient-descent predictions with scikit-learn's.
y_predicted = regr.predict(X_test)

# RMSE on the test set, then the fitted weights (same column order as theta).
print(np.sqrt(mean_squared_error(y_test, y_predicted)))
print(regr.coef_)

In [None]:
# Show up to the first 20 scikit-learn predictions next to the true values.
# Slicing + zip cannot raise IndexError when the test set has fewer than 20
# rows, unlike indexing with a hard-coded range.
for predicted, actual in zip(y_predicted[:20], y_test[:20]):
    print(f"{predicted} {actual}")