In [None]:
# Import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

# Show which files are available in the input directory
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Load the insurance data into a DataFrame
df = pd.read_csv("../input/insurance.csv")

# Preview the first five rows (head() defaults to n=5)
df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Column labels of the dataset
df.columns

In [None]:
# Print a concise summary of the frame: each column's non-null count
# and dtype, plus overall memory usage
df.info()

In [None]:
# Count the NaN entries in every column; all zeros means nothing is missing
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Encode the categorical columns (sex, smoker, region) as integer codes.
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted for each column and stored in `encoders`, so the
# code -> label mapping of every column (encoders[col].classes_) stays
# available for interpreting results or calling inverse_transform later.
# Refitting one shared encoder would keep only the last column's mapping.
# NOTE: codes follow the sorted order of the labels; for `region` they are
# arbitrary identifiers, not a meaningful ordering.
encoders = {}
for col in ('sex', 'smoker', 'region'):
    le = LabelEncoder()
    # Bracket assignment is the unambiguous way to overwrite a column.
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on `region`.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# sex, smoker and region were label-encoded in the previous cell, so they are
# summarised as integer codes here; region's codes are arbitrary labels, so
# its mean/std are not meaningful quantities.
df.describe()

In [None]:
# Pairwise correlation matrix of all columns (pandas defaults to Pearson).
# Caveat: region holds arbitrary integer labels from the encoding step, so
# its coefficients should not be read as a real linear relationship.
df.corr()

In [None]:
# Correlation of every column with the target, from weakest to strongest
charges_corr = df.corr()['charges']
charges_corr.sort_values()

In [None]:
# Single predictor (age) and response (charges), each kept as a one-column frame
x = df.loc[:, ['age']]
y = df.loc[:, ['charges']]

# Quick look at the first rows of both
for frame in (x, y):
    print(frame.head(5))

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=101
)

# Sanity-check the sizes of the four pieces
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

In [None]:
# Fit a linear regression of charges on age, then predict the held-out set
from sklearn.linear_model import LinearRegression
regr = LinearRegression().fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Test observations (black points) with the fitted line (red) on top
plt.scatter(x_test, y_test, color='black')
plt.plot(x_test, y_pred, color='red', linewidth=3)
plt.title('Medical Costs')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.show()

In [None]:
# Fitted coefficient of the model: with age as the only feature, this is the
# slope of the regression line (change in predicted charges per year of age).
# The intercept is available separately as regr.intercept_.
print(regr.coef_)

In [None]:
# Error metrics of the predictions on the held-out test set
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
# RMSE is the square root of the MSE computed above
print('Root Mean Squared Error:', np.sqrt(mse))