In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# imports
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree

In [None]:
# Load the insurance CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../input/insurance/insurance.csv")
df.head(5)

In [None]:
# shape of data: a (number of rows, number of columns) tuple
df.shape

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the frame's columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Mean of each numeric factor, split by sex.
# 'sex' is the grouping key, so it must not also be in the list of columns to
# average. It is presumably still a text label at this point (it is only
# label-encoded further down), and averaging a non-numeric column raises a
# TypeError on pandas >= 2.0 (older versions silently dropped it).
factors = ['age', 'bmi', 'children', 'charges']
df.groupby('sex')[factors].mean()

In [None]:
# plot value counts of sex
# countplot does the counting itself, so it must be given the raw 'sex' column.
# Passing the result of value_counts() makes it count the counts (one bar per
# distinct count value) instead of showing how many rows fall in each category.
plt.figure(figsize=(8,8))
sns.countplot(x='sex', data=df, palette='coolwarm')
plt.title("Value counts of sex")
plt.show()

In [None]:
# Box plot of the charges distribution within each region.
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(x=df['region'], y=df['charges'], palette='coolwarm', ax=ax)
ax.set_title("Relationship between region and charges")
plt.show()

In [None]:
# Bar plot of age against smoker status, with one bar colour per sex.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x='age', y='smoker', hue='sex', data=df, palette='coolwarm', ax=ax)
ax.set_title("Relationship between age and smoker colour encoded by sex")
plt.show()

In [None]:
# Bar plot of the mean charges for each sex (barplot's default estimator is the mean).
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x='sex', y='charges', data=df, palette='coolwarm', ax=ax)
ax.set_title("Relationship between sex and charges")
plt.show()

In [None]:
# Replace each label column with integer codes, in place.
# The single encoder is re-fitted for every column, so after the loop it only
# remembers the classes of the last column ('region').
label_encoder = LabelEncoder()
for column in ('sex', 'smoker', 'region'):
    df[column] = label_encoder.fit_transform(df[column])

In [None]:
# Columns that take part in the correlation analysis.
factor_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
df_factors = df[factor_columns]

In [None]:
# Annotated heatmap of the pairwise correlations between the selected factors.
fig, ax = plt.subplots(figsize=(10, 8))
correlations = df_factors.corr()
sns.heatmap(data=correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Separate the target ('charges') from the predictors (every other column).
y = df['charges']
X = df.drop(columns=['charges'])
for label, part in (("X shape: ", X), ("y shape: ", y)):
    print(label, part.shape)

In [None]:
# split into training and testing data sets (20% of the rows are held out for testing)
# random_state pins the shuffle, so the split -- and every score computed from
# it -- is the same on each "Restart & Run All" instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("X train shape: ", X_train.shape)
print("y train shape: ", y_train.shape)
print("X test shape: ", X_test.shape)
print("y test shape: ", y_test.shape)

**Decision Tree Regressor**

In [None]:
# Fit a decision tree regressor on the training split.
# random_state fixes the estimator's internal randomness (e.g. tie-breaking
# between equally good splits), so the fitted tree is reproducible across runs.
decision_tree_model = tree.DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

In [None]:
# Draw the fitted decision tree on a dedicated 15x15 axes.
# No max_depth is given, so every level of the tree is rendered.
fig, ax = plt.subplots(figsize=(15, 15))
tree.plot_tree(decision_tree_model, fontsize=10, ax=ax)
plt.show()

In [None]:
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz

# Export the fitted regression tree to Graphviz DOT source and parse it.
# - The estimator is decision_tree_model; the previous name
#   'decision_tree_modelclf_tree' was never defined and raised a NameError.
# - class_names is dropped: the model is a regressor, so there are no classes.
# - feature_names comes from the training frame, so it always matches the
#   columns the tree was fitted on.
# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = export_graphviz(decision_tree_model, filled=True, rounded=True,
                           feature_names=list(X_train.columns),
                           out_file=None)
graph = graph_from_dot_data(dot_data)

In [None]:
# Predict the target for the held-out test features with the fitted decision tree.
y_pred_decision_tree_model = decision_tree_model.predict(X_test)

In [None]:
# Colour-filled plot of the top levels of the fitted regression tree.
# The previous version referenced 'clf' and 'iris', neither of which exists in
# this notebook (NameError), and passed class_names, which only applies to
# classifiers. The plot now uses the model and feature columns fitted above.
# max_depth=3 limits drawing to the first levels so the node text stays readable.
fig, ax = plt.subplots(figsize=(20, 10))
decision_tree_plot = tree.plot_tree(decision_tree_model,
                   feature_names=list(X_train.columns),
                   filled=True,
                   max_depth=3,
                   ax=ax)
plt.show()


In [None]:
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
# A large gap between the two values indicates overfitting.
print("Training R^2: ", decision_tree_model.score(X_train, y_train))
print("Testing R^2: ", decision_tree_model.score(X_test, y_test))

**Random Forest Regressor**

In [None]:
# Fit a random forest regressor on the training split.
# random_state fixes the bootstrap sampling and feature selection, so the
# fitted forest and its scores are reproducible across runs.
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the fitted random forest.
y_pred_random_forest_model = random_forest_model.predict(X_test)

In [None]:
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print("Training R^2: ", random_forest_model.score(X_train, y_train))
print("Testing R^2: ", random_forest_model.score(X_test, y_test))

In [None]:
# Pairwise relationships between all columns.
# hue must be a low-cardinality column: 'charges' is the continuous target, so
# using it as hue creates roughly one hue level per row, which makes the plot
# extremely slow and the legend unusable. 'smoker' (label-encoded above) is
# used instead to colour the points by group.
sns.pairplot(data=df, hue='smoker', palette='coolwarm')
plt.show()

In [None]:
# Scatter plot with a fitted regression line of charges against each predictor,
# one predictor per panel of a 2x3 grid (filled row by row).
fig, axes = plt.subplots(2, 3, figsize=(20, 15))
fig.suptitle("Plotting line of fit")
predictors = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
for panel, predictor in zip(axes.flat, predictors):
    sns.regplot(ax=panel, x=df[predictor], y=df['charges'])
plt.show()