In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sklearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the insurance premium dataset from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/insurance-premium-prediction/insurance.csv',
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Count of records per sex, drawn on an explicit Axes so the figure can be
# titled; the trailing semicolon hides the text repr of the last call.
fig, ax = plt.subplots()
sns.countplot(x='sex', data=df, ax=ax)
ax.set_title('Number of records by sex');

The dataset contains nearly equal numbers of males and females.

In [None]:
# Left panel: number of records for each children count.
# Right panel: expenses plotted against the number of children.
f = plt.figure(figsize=(10, 3))
count_ax = f.add_subplot(1, 2, 1)
sns.countplot(data=df, x='children', ax=count_ax)
scatter_ax = f.add_subplot(1, 2, 2)
sns.scatterplot(data=df, x="expenses", y="children", ax=scatter_ax)

Far fewer people have more than 3 children than have fewer than 3 children. No trend is observed suggesting that people with fewer children have lower medical expenses.

In [None]:
# Left panel: number of smokers versus non-smokers.
# Right panel: expenses plotted against smoker status.
f = plt.figure(figsize=(10, 3))
count_ax = f.add_subplot(1, 2, 1)
sns.countplot(data=df, x='smoker', ax=count_ax)
scatter_ax = f.add_subplot(1, 2, 2)
sns.scatterplot(data=df, x="expenses", y="smoker", ax=scatter_ax)

Although the dataset contains far more non-smokers than smokers, smokers are observed to have higher medical expenses, so this feature should be useful for predicting the premium.

In [None]:
# Number of records in each region.
sns.countplot(data=df, x='region')

In [None]:
# Expenses plotted against region.
sns.scatterplot(x="expenses", y="region", data=df)

The dataset has almost the same number of people in each region. No trend can be established between region and expenses.

In [None]:
# Left panel: expenses as a function of age.
# Right panel: expenses as a function of BMI.
f = plt.figure(figsize=(10, 3))
age_ax = f.add_subplot(1, 2, 1)
sns.lineplot(data=df, x='age', y='expenses', ax=age_ax)
bmi_ax = f.add_subplot(1, 2, 2)
sns.lineplot(data=df, x='bmi', y='expenses', ax=bmi_ax)

The trend above shows that medical expenses are lower at younger ages.

In [None]:
# Full-size view of expenses as a function of BMI.
sns.lineplot(data=df, x='bmi', y='expenses')

No particular trend can be established between BMI and expenses.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns as integer codes.
#
# Columns are addressed by name instead of by position so the cell does not
# silently encode the wrong column if the CSV layout changes.
# NOTE(review): assumes the columns previously addressed as positions 1, 5 and 4
# are 'sex', 'region' and 'smoker' -- confirm with `df.columns`.
#
# Assigning with `df[column] = ...` replaces the column outright, so it takes
# the integer dtype of the codes. Writing through `df.iloc[:, i] = ...` can keep
# the original object dtype on recent pandas versions.
#
# One encoder is kept per column so each mapping can be inspected
# (`label_encoders[column].classes_`) or inverted later.
label_encoders = {}
for column in ('sex', 'region', 'smoker'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# `label` used to end up fitted on the last column encoded; keep that binding
# for any cell that still refers to it.
label = label_encoders['smoker']

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the encoded frame.
corr = df.corr()
# Mask the upper triangle (diagonal included): it mirrors the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Diverging colormap centred on zero correlation
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Use the full [-1, 1] range for the colour scale. A cap of 0.3 would paint every
# correlation above 0.3 in the same colour and hide the strongest relationships.
# Annotating the cells lets the values be read directly.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            annot=True, fmt='.2f',
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

The highest correlations are observed between expenses and age, and between smoker and age.
BMI and expenses are also well correlated.
Next, let us use these features to see how well they predict medical expenses.

In [None]:
# Scatter-plot matrix of every pair of columns in the encoded frame.
sns.pairplot(data=df)

In [None]:
# Feature matrix (four predictors) and the target column to predict.
X = df.loc[:, ['bmi', 'age', 'smoker', 'children']]
Y = df.loc[:, 'expenses']

In [None]:

# Hold out 25% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric computed below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

# Baseline 1: decision tree regressor fitted on the training split and scored
# on the held-out split.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)

# Root-mean-squared error, computed as sqrt(MSE). The `squared=False` argument
# of `mean_squared_error` is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly.
# NOTE: despite its name, `mse_dt` holds the RMSE (same units as `expenses`).
mse_dt = np.sqrt(mean_squared_error(y_test, y_predict))
print(mse_dt)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline 2: ordinary least-squares linear regression on the same split.
# This rebinds `regressor`, so the linear model is the one that later cells
# pickle and serve.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)

# Root-mean-squared error, computed as sqrt(MSE). The `squared=False` argument
# of `mean_squared_error` is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly.
# NOTE: the name `mse_dt` is kept for compatibility, but from here on it holds
# the RMSE of the linear model, not of the decision tree.
mse_dt = np.sqrt(mean_squared_error(y_test, y_predict))
print(mse_dt)

In [None]:
import pickle

# Persist the most recently fitted `regressor`. The context manager guarantees
# the file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)

# Reload the model to check the round trip.
# NOTE: only unpickle files you created yourself; `pickle.load` can execute
# arbitrary code from a malicious file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
!pip install gradio
import gradio as gr

def greet(bmi,age,smoker,children):
    """Predict the expenses for one person with the fitted global `regressor`.

    The numeric arguments may arrive as numbers or as numeric strings (text
    boxes deliver strings), so they are converted to float before the feature
    row is built.

    Args:
        bmi: Body-mass index, as a number or numeric string.
        age: Age, as a number or numeric string.
        smoker: Truthy for a smoker, falsy otherwise; encoded as 1 / 0.
        children: Number of children, as a number or numeric string.

    Returns:
        The first (and only) prediction of `regressor.predict`, as a float.

    Raises:
        ValueError: If a numeric argument is a string that is not a number.
    """
    # Column names and order must match the features the model was trained on.
    features = pd.DataFrame({
        'bmi': [float(bmi)],
        'age': [float(age)],
        'smoker': [1 if smoker else 0],
        'children': [float(children)],
    })
    prediction = regressor.predict(features)
    return float(prediction[0])

# Web form around `greet`. BMI, age and children are numeric, so they use
# 'number' inputs: the browser then rejects non-numeric text and the callback
# receives numbers instead of strings.
iface = gr.Interface(
    fn=greet,
    inputs=['number', 'number', 'checkbox', 'number'],
    outputs="number",
)
# NOTE(review): share=True requests a publicly reachable link -- confirm that
# exposing the model this way is intended.
iface.launch(share=True)

In [None]:
# Quick check of the prediction function with one hand-written example.
greet(bmi=23, age=23, smoker=1, children=4)