# import libraries and load data

In [None]:
# data load and helper library
import numpy as np
import pandas as pd
import os
# visualization and insight
import seaborn as sns
import matplotlib.pyplot as plt
# data preprocessing and encode
from sklearn.preprocessing import LabelEncoder, StandardScaler
# data modeling and prediction
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.svm import SVR

In [None]:
# Load the insurance dataset from CSV into a DataFrame (the path is relative to the working directory).
data = pd.read_csv('../input/insurance/insurance.csv')

# Exploratory data analysis

In [None]:
# Preview the first rows to see the available columns and sample values.
data.head()

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

* no null values in data

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Column groups used by the plotting and preprocessing cells.
categorical_columns = [
    'sex',
    'smoker',
    'region',
]
numerical_columns = [
    'age',
    'bmi',
    'children',
    'charges',
]

## duplicate row check and remove

In [None]:
# Report how many rows are exact duplicates, drop them (keeping the last
# occurrence of each), then report the count again as a check.
duplicates_before = data.duplicated().sum()
print('duplicated : {}'.format(duplicates_before))

data = data.drop_duplicates(keep='last')

duplicates_after = data.duplicated().sum()
print('duplicated after remove: {}'.format(duplicates_after))

## Outlier check

In [None]:
# Name of the target column.
target = 'charges'

# Draw one box plot per numerical column so outliers can be spotted visually.
for numeric_col in numerical_columns:
    sns.boxplot(x=numeric_col, data=data)
    plt.show()

* remove outliers from charges (keep only rows with charges below 50000)

In [None]:
# Keep only the rows whose charges are below 50000; higher values are treated
# as outliers.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so assigning to its columns afterwards does not raise pandas'
# SettingWithCopyWarning.
data = data[data.charges < 50000].copy()
# Re-draw the box plot to inspect the remaining distribution of charges.
sns.boxplot(data = data, x = 'charges')

## categorical columns

* number of unique values in each categorical column

In [None]:
# Bar chart of how many distinct values each categorical column holds.
unique_counts = data[categorical_columns].nunique()
unique_counts.plot(kind='bar')

In [None]:
# One count plot per categorical column: number of rows in each category.
for cat_col in categorical_columns:
    sns.catplot(data=data, x=cat_col, kind='count')

In [None]:
# Replace the labels of every categorical column with integer codes.
# A fresh encoder is fitted for each column.
for cat_col in categorical_columns:
    encoder = LabelEncoder()
    data[cat_col] = encoder.fit_transform(data[cat_col])

## numerical columns

In [None]:
# Summary statistics of the frame's columns.
data.describe()

In [None]:
# Distribution of each numerical column: a density-normalised histogram with a
# KDE curve, plus a rug marking the individual observations.
# sns.distplot is deprecated; histplot + rugplot are its documented
# replacements and draw the same three layers.
for col in numerical_columns:
    ax = sns.histplot(data=data, x=col, stat='density', kde=True)
    sns.rugplot(data=data, x=col, ax=ax)
    plt.show()

## correlation matrix

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both the
# heatmap and the tabular output instead of recomputing it.
corr_matrix = data.corr()
sns.heatmap(corr_matrix)
plt.show()
# Final bare expression so the notebook also renders the matrix as a table.
corr_matrix

* age and smoker columns have a higher correlation with charges

In [None]:
# Joint plot of charges against age.
sns.jointplot(data=data, x='age', y='charges')

In [None]:
# Joint plot of charges against the smoker column.
sns.jointplot(data=data, x='smoker', y='charges')

In [None]:
# Charges per age, with points coloured by the smoker column.
sns.catplot(data=data, x='age', y='charges', hue='smoker')

* non-smokers have much lower charges than smokers
* explore more about smoker and non-smoker

# Data preprocessing

In [None]:
# Integer-encode the categorical columns.
# Columns that are already numeric are skipped, so running this cell after the
# columns have been encoded (or running it more than once) does not refit an
# encoder on integer codes.
for col in categorical_columns:
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = LabelEncoder().fit_transform(data[col])

In [None]:
# Cast the categorical columns to the pandas 'category' dtype in a single call,
# then make sure every numerical column holds numeric values.
data = data.astype({name: 'category' for name in categorical_columns})
for name in numerical_columns:
    data[name] = pd.to_numeric(data[name])

# Data Modeling

In [None]:
# Separate the target from the features. The target column must not be part
# of the feature matrix: leaving 'charges' in would hand the answer to the
# model and make every score meaningless.
charges = data['charges']
train = data.drop(columns=['charges'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    train, charges, test_size=0.2, random_state=0
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Accumulates one entry per fitted model: its name and its R^2 score.
performance = dict(algo=[], r2_score=[])

## LinearRegression

In [None]:
# Fit a linear regression baseline on the training split.
lr = LinearRegression().fit(X_train, Y_train)

# Predictions for both splits (only the test predictions are scored here).
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so passing the predictions first would report a different, incorrect value.
r2 = r2_score(Y_test, y_test_pred)
print('r2_score: {}'.format(r2))

In [None]:
# Record the linear model's score for the comparison chart.
performance['r2_score'].append(r2)
performance['algo'].append('LinearRegression')

## RandomForestRegressor

In [None]:
# Fit a random forest on the training split.
# random_state is fixed so repeated runs give the same forest and score,
# matching the fixed seed already used for the train/test split.
# NOTE: the variable name `lr` is kept for compatibility with the rest of the
# notebook even though it now holds a random forest, not a linear model.
lr = RandomForestRegressor(random_state=0).fit(X_train, Y_train)

# Predictions for both splits (only the test predictions are scored here).
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so passing the predictions first would report a different, incorrect value.
r2 = r2_score(Y_test, y_test_pred)
print('r2_score: {}'.format(r2))

In [None]:
# Record the random forest's score for the comparison chart.
performance['r2_score'].append(r2)
performance['algo'].append('RandomForestRegressor')

In [None]:
# Turn the collected scores into a DataFrame with one row per model.
performance_df = pd.DataFrame(performance)

In [None]:
# NOTE(review): this only references the function object without calling it,
# so it changes no plotting settings — presumably `sns.plotting_context()` was
# intended; confirm with the author.
sns.plotting_context

In [None]:
# Bar chart comparing the R^2 score of each model.
sns.catplot(x='algo', y='r2_score', kind='bar', data=performance_df)