In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORTING DATASET**

In [None]:
# Read the insurance CSV into a DataFrame and preview its first rows.
INSURANCE_CSV = "/kaggle/input/insurance/insurance.csv"
df = pd.read_csv(INSURANCE_CSV)
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Show summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

Because the "sex" and "smoker" columns are categorical with only two distinct values each, we apply a label encoder to them.

In [None]:
# Columns to label-encode in the next cell.
# NOTE: the name `df_columns` is reassigned later in the notebook, so this cell
# must be re-run before the encoding cell if cells are executed out of order.
df_columns=["sex","smoker"]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every column listed in `df_columns` with integer codes.
# LabelEncoder numbers the classes in sorted order of their labels; the encoder
# is re-fitted for each column, so afterwards `le` holds the classes of the
# last column processed.
le = LabelEncoder()
df[df_columns] = df[df_columns].apply(le.fit_transform)

In [None]:
# Preview the frame after label encoding.
df.head()

Now, we have a column "region" which has 4 different categorical values:

->northeast

->northwest

->southeast

->southwest


**So we will apply One Hot Encoding**

In [None]:
# One-hot encode "region": build one indicator column per region value, then
# swap the original "region" column for those indicator columns (appended at
# the end of the frame).
# NOTE: this cell is not idempotent -- after it runs, `df` no longer has a
# "region" column, so re-running it raises.
df_region = pd.get_dummies(df["region"])
df = pd.concat([df.drop(columns=["region"]), df_region], axis=1)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
# (rows, columns) of the encoded frame.
df.shape

**DATA VISUALISATION**

In [None]:
# Pairwise correlation matrix of the columns; plotted as a heatmap below.
corr=df.corr()

In [None]:
# Draw the annotated correlation heatmap on an explicitly created 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, ax=ax)

The heatmap above shows that the "charges" column has a very strong correlation with the "smoker" column.

In [None]:
# Number of rows for each value of the encoded "smoker" column.
sns.countplot(data=df, x="smoker")

In [None]:
# Number of rows per age.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so `sns.countplot(df["age"])` no longer counts ages.
plt.figure(figsize=(20, 10))
sns.countplot(x="age", data=df)

In [None]:
# Number of rows per value of "children".
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so `sns.countplot(df["children"])` no longer counts values.
plt.figure(figsize=(20, 10))
sns.countplot(x="children", data=df)

In [None]:
# Charges for each row, grouped by the encoded "smoker" value; keeps the Axes in `ax`.
ax = sns.stripplot(x='smoker', y='charges', data=df)

In [None]:
# Charges by number of children, with smokers and non-smokers drawn as
# separate (dodged), jittered strips.
sns.stripplot(
    data=df,
    x='children',
    y='charges',
    hue='smoker',
    dodge=True,
    jitter=True,
)

In [None]:
# Distribution of charges per "smoker" value, with each violin split by "sex".
fig, ax = plt.subplots(figsize=(8, 8))
sns.violinplot(data=df, x='smoker', y='charges', hue='sex', split=True, ax=ax)

In [None]:
# Columns whose distributions are plotted per "smoker" value in the next cell.
# NOTE: this overwrites the earlier `df_columns` list.
df_columns=["age","sex","bmi","children","charges"]

In [None]:
# For every listed column, draw one KDE panel per "smoker" value.
# FacetGrid.map returns the grid itself, so `g` ends up holding the last grid.
for column in df_columns:
    g = sns.FacetGrid(df, col='smoker').map(sns.kdeplot, column)

In [None]:
# Pairwise relationships between the remaining columns, coloured by "smoker".
# The four region indicator columns are left out of the grid.
region_dummy_columns = ["northeast", "northwest", "southeast", "southwest"]
sns.pairplot(
    df.drop(columns=region_dummy_columns),
    hue="smoker",
    palette="coolwarm",
)

In [None]:
# Charges against BMI, coloured by the encoded "smoker" value.
sns.scatterplot(x="bmi",y="charges",hue="smoker",data=df)

In [None]:
# BMI (y-axis) against charges (x-axis), coloured by number of children.
sns.scatterplot(x="charges",y="bmi",hue="children",data=df)

In [None]:
# Aggregated charges (seaborn's default estimator is the mean) per number of children.
# NOTE(review): newer seaborn releases warn when `palette` is given without `hue` -- confirm against the installed version.
sns.barplot(x ='children', y ='charges', data = df,palette ='plasma')

In [None]:
# Distributions of "charges" and "age", drawn side by side in one figure.
# The original created the second chart with plt.subplot(121) after a
# plt.show(), which left it alone in the left half of a new, otherwise empty
# figure. Explicit axes make the layout deliberate.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it is kept here
# so the plotted quantity (density histogram + KDE) stays the same.
fig, (ax_charges, ax_age) = plt.subplots(1, 2, figsize=(14, 5))
sns.distplot(df["charges"], bins=10, kde=True, ax=ax_charges)
ax_charges.set_title("charges")
sns.distplot(df["age"], bins=10, kde=True, ax=ax_age)
ax_age.set_title("age")
fig.tight_layout()
plt.show()

In [None]:
# Distribution of charges for each age.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so `sns.boxplot('age', 'charges', data=df)` raises there.
plt.figure(figsize=(20, 10))
sns.boxplot(x='age', y='charges', data=df)

In [None]:
# Individual charges for each age, drawn on an explicitly created wide figure.
fig, ax = plt.subplots(figsize=(20, 10))
sns.stripplot(data=df, x='age', y='charges', ax=ax)

**This suggests that charges increase as age increases**

In [None]:
# Features for which the median charge per value is plotted in the next cell.
# NOTE: this overwrites the earlier `df_columns` list.
df_columns=["age","sex","bmi","children","smoker"]

In [None]:
# For each feature, plot the median charge for every distinct value of that
# feature as a bar chart (one figure per feature).
# groupby does not modify `df`, so no per-iteration copy of the frame is needed.
# NOTE(review): "bmi" may have many distinct values, which would give one bar
# per value -- consider binning it before grouping.
for feature in df_columns:
    fig, ax = plt.subplots(figsize=(10, 10))
    df.groupby(feature)["charges"].median().plot.bar(ax=ax)
    ax.set_ylabel("charges")
    ax.set_title(feature)
    plt.show()

**SPLITTING FEATURES INTO INDEPENDENT AND DEPENDENT VARIABLES**

In [None]:
# Target: the "charges" column. Features: every other column of `df`.
y = df["charges"]
X = df.drop(columns=["charges"])

**SPLITTING THE DATASET INTO TRAINING AND TEST SET**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**TRAINING THE DATASET ON LINEAR REGRESSION MODEL**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split; `fit` returns the
# estimator itself, which is then displayed as the cell output.
lr = LinearRegression().fit(X_train, y_train)
lr

**PREDICTING THE RESULT**

In [None]:
# Linear-regression predictions for the test split.
lr_pred = lr.predict(X_test)

In [None]:
# R^2 of the linear model on the test split.
print(lr.score(X_test,y_test))

We used the linear regression model and got an R² score of about 0.79 (79%) on the test set.

**NOW TRYING A DIFFERENT MODEL**

**TRAINING THE DATASET ON RANDOM FOREST REGRESSION MODEL**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit the forest on the training split only. Fitting on the full (X, y) lets
# the model see the test rows during training, which inflates any score later
# computed on X_test / y_test (data leakage).
# random_state is fixed so the fitted forest is reproducible.
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X_train, y_train)

**PREDICTING THE RESULT**

In [None]:
# Random-forest predictions for the TRAINING split (not the test split);
# these feed the training-set MSE / R^2 computed further down.
regressor_pred = regressor.predict(X_train)

In [None]:
# R^2 of the random forest on the test split.
regressor.score(X_test,y_test)

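We used the random forest model and got a score of 97%, which is pretty good. Note that a test-set score only measures generalisation when the model has been fitted on the training split alone, so re-check this figure after re-running the notebook.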

In [None]:
from sklearn.metrics import r2_score,mean_squared_error

# Report how well the forest reproduces the training targets. Both numbers are
# computed from `regressor_pred`, i.e. predictions on the training split.
train_mse = mean_squared_error(y_train, regressor_pred)
train_r2 = r2_score(y_train, regressor_pred)
print(f'MSE train data: {train_mse:.3f}')
print(f'R2 train data: {train_r2:.3f}')

Finally, on the training data we got a mean squared error of 5098416.425
and an R² score of 96%.