In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Data load

In [None]:
# Read the insurance CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../input/insurance/insurance.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

* Categorical Features - based on the df.info(), we see that the below features are of `object` type.
        * Sex
        * Smoker
        * Region
        
* Numerical Features - non `object` type
        * Age
        * BMI
        * Children
       
* Target 
        * Charges
        

# Check for missing values
`df.info()` already shows that there are no missing or NaN values, but we confirm it explicitly with `isnull()`.

In [None]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

# Data Visualization

## Statistical Measures of the data set

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Distribution of Age

In [None]:
# Histogram of the age column.
# sns.displot is a figure-level function: it opens its own figure, so pairing
# it with plt.figure() leaves an empty 6x6 figure in the output and the
# requested size is ignored. The axes-level sns.histplot draws the same
# histogram on the figure created here.
sns.set()
plt.figure(figsize=(6, 6))
sns.histplot(data=df, x='age')
plt.title('Age Distribution')
plt.show()

In [None]:
# Bar chart of row counts per value of the sex column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='sex', ax=ax)
ax.set_title("Gender DIstribution")
plt.show()

In [None]:
# Exact counts behind the bar chart above.
df.sex.value_counts()

In [None]:
# Histogram of the bmi column.
# sns.displot is figure-level and would open a second figure, leaving the
# 6x6 figure created by plt.figure() empty; the axes-level sns.histplot draws
# on that figure instead.
plt.figure(figsize=(6, 6))
sns.histplot(data=df, x='bmi')
plt.title('BMI Distribution')
plt.show()

For a person, the normal BMI range is 18.5 to 24.9.

* If a person has a BMI below 18.5, they are considered underweight.
* If the BMI is above 24.9, they are considered overweight.

In [None]:
# Bar chart of row counts per value of the children column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='children', ax=ax)
ax.set_title("CHildren DIstribution")
plt.show()

In [None]:
# Exact counts per number of children.
df.children.value_counts()

In [None]:
# Bar chart of row counts per value of the smoker column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='smoker', ax=ax)
ax.set_title("Smoker DIstribution")
plt.show()

In [None]:
# Exact counts of smokers vs. non-smokers.
df.smoker.value_counts()

In [None]:
# Bar chart of row counts per value of the region column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='region', ax=ax)
ax.set_title("Region DIstribution")
plt.show()

In [None]:
# Exact counts per region.
df.region.value_counts()

In [None]:
# Histogram of the target column (charges).
# sns.displot is figure-level and would open a second figure, leaving the
# 6x6 figure created by plt.figure() empty; the axes-level sns.histplot draws
# on that figure instead.
plt.figure(figsize=(6, 6))
sns.histplot(data=df, x='charges')
plt.title('Charges Distribution')
plt.show()

# Data Pre-Processing
## Encoding the Categorical Features

In [None]:
# Encode the sex feature as integers: male -> 0, female -> 1.
# Assigning the column back (instead of a frame-wide inplace replace) and
# casting explicitly keeps the integer dtype independent of pandas' implicit
# downcasting, and raises here if any label was left unmapped.
# The cell is safe to re-run: already-encoded values pass through unchanged.
df['sex'] = df['sex'].replace({'male': 0, 'female': 1}).astype('int64')
df.head()

In [None]:
# Encode the smoker feature as integers: no -> 0, yes -> 1.
# Assigning the column back (instead of a frame-wide inplace replace) and
# casting explicitly keeps the integer dtype independent of pandas' implicit
# downcasting, and raises here if any label was left unmapped.
# The cell is safe to re-run: already-encoded values pass through unchanged.
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')
df.head()

In [None]:
# Encode the region feature as integer codes:
# southeast -> 0, southwest -> 1, northeast -> 2, northwest -> 3.
# Assigning the column back (instead of a frame-wide inplace replace) and
# casting explicitly keeps the integer dtype independent of pandas' implicit
# downcasting, and raises here if any label was left unmapped.
# NOTE(review): these codes impose an arbitrary order on a nominal feature,
# which a linear model will treat as numeric; consider one-hot encoding.
region_codes = {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3}
df['region'] = df['region'].replace(region_codes).astype('int64')
df.head()

# Split into Train Test

In [None]:
# Features: every column except the target. Target: the charges column.
X = df.drop(columns=['charges'])
y = df['charges']

In [None]:
# Shapes of the feature matrix and the target vector.
(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
)

In [None]:
# Shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Model Training
## Linear Regression

In [None]:
# Instantiate the linear regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

# Model Evaluation

In [None]:
# Predictions on the rows the model was trained on.
pred_train = lr.predict(X=X_train)

In [None]:
# Coefficient of determination (R^2) on the training split.
r2_train = metrics.r2_score(y_true=y_train, y_pred=pred_train)
print(r2_train)

In [None]:
# Predictions on the held-out test rows.
pred_test = lr.predict(X=X_test)

# Coefficient of determination (R^2) on the test split.
r2_test = metrics.r2_score(y_true=y_test, y_pred=pred_test)
print(r2_test)

The R² values for the train data and the test data are nearly the same, so we can say the model is not overfitting. Whether it is underfitting depends on how high those R² values are, not on how close they are to each other.

# Predict on Unseen Data

In [None]:
# Example record to score, in the order age, sex, bmi, children, smoker, region
# (assumes this matches the column order of X — confirm against X.columns):
# age         : 37
# sex         : male (0)
# bmi         : 28.025
# children    : 2
# smoker      : no (0)
# region      : northeast (2)
# charges     : expected --> 6203.90175
# The children slot previously held 0, contradicting the record described
# above, so the prediction was for a different person.
# NOTE(review): verify the region code against the dataset row this example
# was taken from.
input_data = (37, 0, 28.025, 2, 0, 2)
print(input_data)

# Changing input data to numpy array
input_data_arr = np.asarray(input_data)

print(input_data_arr)

# Reshape to a single row (1, n_features), the 2-D layout predict() expects
input_data_reshp = input_data_arr.reshape(1, -1)

print(input_data_reshp)

In [None]:
# The model was fitted on a DataFrame, so label the single input row with the
# training column names before predicting. Passing a bare ndarray makes
# scikit-learn warn that X has no valid feature names and hides any mismatch
# in column order.
input_frame = pd.DataFrame(input_data_reshp, columns=X.columns)
pred_new = lr.predict(input_frame)
print(pred_new)

In [None]:
# Report the single predicted charge.
predicted_cost = pred_new[0]
print('The Insurance Cost is $', predicted_cost)