In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd
# Load the insurance dataset and keep an untouched copy of the raw frame,
# because later cells overwrite columns of `df` in place.
INSURANCE_CSV = '../input/insurance/insurance.csv'
df_original = pd.read_csv(INSURANCE_CSV)
df = df_original.copy()
# Preview the first rows
df.head()

In [None]:
# Count the missing values in each column (`isna` is the canonical name of `isnull`)
df.isna().sum()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Count fully duplicated rows: summing the boolean mask counts its True entries
df.duplicated().sum()

In [None]:
# The sex, smoker and region attributes are categorical data. To feed them to
# scikit-learn for further analysis, each one is converted to integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column: a single shared LabelEncoder is overwritten by every
# refit, which would make it impossible to map the sex/smoker codes back to labels.
encoders = {}
for column in ['sex', 'smoker', 'region']:
    encoders[column] = LabelEncoder()
    # Encode from the untouched copy so re-running this cell always starts
    # from the raw labels instead of from already-encoded integers.
    df[column] = encoders[column].fit_transform(df_original[column])

# `le` previously ended up fitted on `region` (the last column encoded); keep that binding.
le = encoders['region']

In [None]:
# Preview the first rows again, now that sex, smoker and region hold integer codes
df.head()

In [None]:
# Pairwise correlation between all columns of the dataset
df.corr()

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize = (10, 7))
sns.heatmap(df.corr(), annot = True, ax = ax);

In [None]:
# Correlation of every column with `charges`, sorted ascending, shown as a one-column table
charges_corr = df.corr()['charges']
charges_corr.sort_values().to_frame()

# Exploring the Dataset

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt 
# Histogram of every column in a single figure.
# Let pandas build the subplot grid itself: handing it one pre-made Axes for
# several columns makes it clear that figure and emit a UserWarning.
hist_axes = df.hist(figsize = (15, 10))
# Keep `fig` bound to the histogram figure, as it was before.
fig = hist_axes.flat[0].get_figure()

In [None]:
# Compare the distribution of charges for smokers and non-smokers, one figure each.
# NOTE(review): assumes the label encoding maps smokers to 1 and non-smokers to 0,
# as the plot titles imply -- confirm against the raw labels.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot` with kde=True and stat='density' draws the equivalent
# density-normalised histogram with a KDE overlay.
smoker_panels = [
    (1, 'c', 'Distribution of charges for smokers'),
    (0, 'g', 'Distribution of charges for non-smokers'),
]
for smoker_code, color, title in smoker_panels:
    fig, ax = plt.subplots(figsize = (5, 3))
    sns.histplot(
        df.loc[df['smoker'] == smoker_code, 'charges'],
        kde = True,
        stat = 'density',
        color = color,
        ax = ax,
    )
    ax.set(title = title)

# Splitting Data 

In [None]:
# Target: the insurance charges. Features: every column except the target and `region`.
y = df['charges']
X = df.drop(columns = ['charges', 'region'])

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Preview the target values
y.head()

# Model 1: Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size = 0.3, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# R^2 (coefficient of determination) of the linear model on the held-out test split
lr.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the linear model's predictions on the test split
y_pred = lr.predict(X_test)
mean_absolute_error(y_true = y_test, y_pred = y_pred)

# Using polynomial features to try to improve the linear regression predictions on our dataset

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Same feature/target selection as X and y: drop the target and `region`.
a = df.drop(['charges', 'region'], axis = 1)
b = df.charges

# Expand the features with all terms up to degree 2.
quad = PolynomialFeatures(degree = 2)
a_quad = quad.fit_transform(a)
# Use test_size = 0.3 with random_state = 0, like the split used for the other
# models. The default test_size (0.25) would evaluate this model on a different
# hold-out set and make its metrics incomparable with theirs.
a_train, a_test, b_train, b_test = train_test_split(a_quad, b, test_size = 0.3, random_state = 0)
poly_lr = LinearRegression()
poly_lr.fit(a_train, b_train)
print(poly_lr.score(a_test, b_test))
# y_true goes first, matching the documented mean_absolute_error(y_true, y_pred) signature.
print(mean_absolute_error(b_test, poly_lr.predict(a_test)))

# Model 2 - Lasso

In [None]:
from sklearn.linear_model import Lasso
# L1-regularised linear regression, trained and scored on the X/y train-test split.
lasso = Lasso(alpha = 0.01)
lasso.fit(X_train, y_train)
print(lasso.score(X_test, y_test))
# y_true goes first, matching the documented mean_absolute_error(y_true, y_pred)
# signature (the value is unchanged because MAE is symmetric).
print(mean_absolute_error(y_test, lasso.predict(X_test)))

# Model 3 - Ridge

In [None]:
from sklearn.linear_model import Ridge
# L2-regularised linear regression, trained and scored on the X/y train-test split.
ridge = Ridge(alpha = 0.00001)
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))
# y_true goes first, matching the documented mean_absolute_error(y_true, y_pred)
# signature (the value is unchanged because MAE is symmetric).
print(mean_absolute_error(y_test, ridge.predict(X_test)))

# Model 4 - Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees, trained and scored on the X/y train-test split;
# random_state is fixed so the fitted forest is reproducible.
forest = RandomForestRegressor(n_estimators = 100, random_state = 1)
forest.fit(X_train, y_train)
print(forest.score(X_test, y_test))
# y_true goes first, matching the documented mean_absolute_error(y_true, y_pred)
# signature (the value is unchanged because MAE is symmetric).
print(mean_absolute_error(y_test, forest.predict(X_test)))