In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found anywhere under the Kaggle input
# directory, then print one path per line (same order as the directory walk).
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
### Reading and Understanding the data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [None]:
# Load the insurance dataset from the Kaggle input directory into `df`
# and preview its first rows.
insurance_csv = '/kaggle/input/insurance/insurance.csv'
df = pd.read_csv(insurance_csv)
df.head()

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Count the missing values in each column (the boolean NA mask is summed down the rows).
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Distribution of the target variable `charges`.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat='density' is the documented replacement and
# draws the same kind of normalised histogram with a KDE curve on top.
plt.figure(figsize=(10,5))
sns.histplot(df['charges'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

### Visualizing the categorical data
- sex
- smoker
- region

In [None]:
# One bar plot of `charges` per categorical column, laid out side by side
# in a single 1x3 figure.
cat_fig, cat_axes = plt.subplots(1, 3, figsize=(18, 4))
for cat_ax, cat_col in zip(cat_axes, ['sex', 'smoker', 'region']):
    sns.barplot(x=cat_col, y='charges', data=df, ax=cat_ax)
plt.show()

### Visualizing Numerical data
- age
- bmi
- children
- charges

In [None]:
# Grid of pairwise scatter plots for the numeric columns of df, with each
# variable's own distribution on the diagonal.
sns.pairplot(df)

In [None]:
# Plot a heatmap of the pairwise correlations between the numeric columns.
# At this point `sex`, `smoker` and `region` still hold text labels, and
# DataFrame.corr() raises a ValueError on non-numeric columns in pandas >= 2.0
# (older versions silently dropped them), so restrict to numeric columns first.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=True)

### Dealing with categorical data- dummy variable creation

In [None]:
# Map the categorical variables with two levels to 0 and 1.
binary_encodings = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for col, mapping in binary_encodings.items():
    # Only map while the column still holds its text labels: re-running the
    # cell on an already-encoded column would otherwise turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)
    # Series.map yields NaN for any label missing from the mapping; fail here
    # with a clear message instead of letting NaNs reach the model fit.
    assert df[col].notna().all(), f"unexpected or missing value(s) in column '{col}'"

In [None]:
# Assign dummy variables to the remaining categorical variable, `region`.
# - The guard keeps the cell re-runnable: once `region` has been replaced by
#   its dummy columns, calling get_dummies on it again would raise a KeyError.
# - dtype=int gives 0/1 integer dummies on every pandas version (pandas >= 2.0
#   defaults to bool), consistent with the 0/1 encoding of `sex` and `smoker`.
# - drop_first=True drops one level to avoid a redundant (collinear) column.
if 'region' in df.columns:
    df = pd.get_dummies(df, columns=['region'], drop_first=True, dtype=int)
df.head()

### Splitting the Data into Training and Testing Sets

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
df_train, df_test = train_test_split(df, train_size=0.7, random_state=100)
# Later cells assign scaled values back into these frames; take independent
# copies so those assignments do not act on slices of `df`
# (which makes pandas emit SettingWithCopyWarning).
df_train = df_train.copy()
df_test = df_test.copy()

In [None]:
# (rows, columns) of the training split.
df_train.shape

In [None]:
# (rows, columns) of the test split.
df_test.shape

In [None]:
## Rescaling features using min-max scaling
# The scaler is fitted on the training split only; the numeric columns
# (including the target `charges`) are then overwritten with their scaled values.
num_vars = ['age','bmi','children','charges']
scaler = MinMaxScaler()
scaler.fit(df_train[num_vars])
df_train[num_vars] = scaler.transform(df_train[num_vars])
df_train.describe()

In [None]:
# Separate the target from the predictors without mutating df_train:
# DataFrame.pop removes the column in place, so re-running the cell raised a
# KeyError and X_train was merely another name for the mutated df_train.
y_train = df_train['charges']
X_train = df_train.drop(columns=['charges'])

In [None]:
# Preview the first values of the training target (`charges`, already min-max scaled).
y_train.head()

In [None]:
# Fit a linear regression on the training predictors and target.
# fit() returns the estimator itself, so `lm` is the fitted model; the bare
# `lm` on the last line displays it as the cell output.
lm = LinearRegression().fit(X_train, y_train)
lm

In [None]:
# Pair every predictor name with its fitted regression coefficient.
[(feature, weight) for feature, weight in zip(X_train.columns, lm.coef_)]

In [None]:
# Residuals on the training data (actual minus predicted) and their distribution.
y_train_pred = lm.predict(X_train)
res = y_train - y_train_pred
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat='density' is the documented replacement.
sns.histplot(res, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# R^2 (coefficient of determination) of the model on the training data.
r2_score(y_train, y_train_pred)

In [None]:
# Apply the scaler fitted on the training split to the test split
# (transform only, no refit).
# NOTE: this overwrites df_test in place, so running the cell a second time
# would scale the already-scaled values again.
scaled_test_values = scaler.transform(df_test[num_vars])
df_test[num_vars] = scaled_test_values
df_test.describe()

In [None]:
# Separate the target from the predictors without mutating df_test:
# DataFrame.pop removes the column in place, so re-running the cell raised a
# KeyError and X_test was merely another name for the mutated df_test.
y_test = df_test['charges']
X_test = df_test.drop(columns=['charges'])

In [None]:
# Predict on the held-out split and report the test-set R^2.
y_test_pred = lm.predict(X_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
test_r2

In [None]:
# Scatter of actual vs. predicted test targets to inspect the spread of the predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_test_pred', fontsize=16)