# **Medical Cost Insurance Prediction**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as graph

# Read the insurance CSV into a DataFrame.
dataset = pd.read_csv('../input/insurance/insurance.csv')

# Plain-text preview of the frame (pandas truncates long frames when printing).
print(dataset)

# (rows, columns), followed by spacing before the info() summary.
print(dataset.shape, '\n \n')

# Column names, non-null counts and dtypes.
dataset.info()

***Checking for Missing Data***

In [None]:
# Number of missing entries in each column (summing the null mask down the rows).
dataset.isnull().sum(axis=0)

# **Pre Processing**

# Label Encoding for Categorical Variables

**Checking Unique Values**

In [None]:
# Collect the object-dtype columns first, then list the distinct values of each.
object_cols = [col for col in dataset.columns if dataset[col].dtype == object]

for col in object_cols:
    print(col, ':', dataset[col].unique())

In [None]:
from sklearn import preprocessing

def labelEncoder(dataset, col):
    """Label-encode one column of ``dataset``.

    A fresh ``LabelEncoder`` is fitted on ``dataset[col]`` and the same
    column is then transformed with it. ``dataset`` itself is not modified;
    the caller decides where to store the result.

    Parameters:
        dataset: object indexable by ``col`` (used here with a DataFrame).
        col: key of the column to encode.

    Returns:
        The encoder's transformed values for ``dataset[col]``.
    """
    column_values = dataset[col]
    # fit() returns the encoder itself, so it can be bound directly.
    fitted_encoder = preprocessing.LabelEncoder().fit(column_values)
    return fitted_encoder.transform(column_values)

In [None]:
# Replace each categorical column with its label-encoded values.
for categorical_col in ('sex', 'smoker', 'region'):
    dataset[categorical_col] = labelEncoder(dataset, categorical_col)

In [None]:
# Re-inspect column dtypes now that 'sex', 'smoker' and 'region' have been
# overwritten with the output of labelEncoder.
dataset.info()

### Train, Test Split and Model Creation

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the last column.
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# Hold out 30 % of the rows for testing. random_state pins the shuffle so the
# split (and therefore the fitted model and its score) is identical on every
# Restart & Run All; the value 42 itself is arbitrary.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

In [None]:
# Shapes of the two feature splits -- expected to be roughly 70 % / 30 % of the rows.
(x_train.shape, x_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model, trained on the training split only.
linear_regression = LinearRegression()
linear_regression.fit(X=x_train, y=y_train)

In [None]:
# Predictions for the held-out rows, used below for scoring and plotting.
y_test_predicted = linear_regression.predict(X=x_test)

### Model Evaluation (R² Score on the Test Set)

In [None]:
from sklearn.metrics import r2_score

# R² between the true test targets and the model's predictions.
score = r2_score(y_true=y_test, y_pred=y_test_predicted)

# Reported as a whole-number percentage.
print('\n R2 Score : ', round(score * 100), ' %')

In [None]:
# First five (encoded) rows -- a reference for building a manual sample below.
dataset.head(n=5)

In [None]:
# The model was fitted on a DataFrame, so the hand-written sample is wrapped in
# a DataFrame carrying the same column labels as the training features. This
# makes the value-to-column mapping explicit and avoids scikit-learn's
# "X does not have valid feature names" warning for bare lists.
# Values are given in the order of x_train's columns.
single_sample = pd.DataFrame([[19, 0, 27.900, 0, 1, 3]], columns=x_train.columns)

linear_regression.predict(single_sample)

In [None]:
fig = graph.figure(figsize=(7, 7))

axis = fig.add_subplot(111)

# Kaggle dark theme fix: draw ticks and labels in red so they stay readable.
axis.tick_params(axis='x', colors='red', labelsize=15)
axis.tick_params(axis='y', colors='red', labelsize=15)

# Compare the first 20 held-out samples, or fewer if the test split is smaller,
# so the x positions always match the number of y values.
n_points = min(20, len(y_test))
positions = list(range(n_points))

# y_test is a pandas Series with a shuffled index, so slice it by position with
# .iloc; y_test_predicted is the array returned by predict().
axis.scatter(positions, y_test.iloc[:n_points], color='blue', label='Test Data')
axis.scatter(positions, y_test_predicted[:n_points], color='red', label='Predicted')

axis.set_xlabel('Test sample (position in test split)', color='red', fontsize=15)
axis.set_ylabel('Target value (last dataset column)', color='red', fontsize=15)
axis.legend()

print ('\n R2 Score : ', round(score * 100), ' %')