In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline   
# Render matplotlib figures inline, directly below the cell that creates them.

In [None]:
# Load the insurance CSV into a DataFrame (the path is relative to the notebook's working directory).
df = pd.read_csv('../input/insurance/insurance.csv')

In [None]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

In [None]:
# Count how many rows share each distinct 'age' value.
df['age'].value_counts()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Total number of missing values across the whole DataFrame.
df.isna().sum().sum()

In [None]:
# Evaluating `df` on its own displays the DataFrame itself.
# `df.info` without parentheses only shows the bound method, whose repr is a badly aligned dump of the records.
# `df.info()` prints each column's name, its non-null count and its data type, plus the total number of records.
df.info()

In [None]:
# df.describe() shows descriptive statistics for the numerical features of the dataset.
df.describe()

In [None]:
# List the column names of the dataset.
df.columns

In [None]:
# Exploratory Data Analysis (EDA)
# Use plots to inspect the feature distributions and the relationships between features.

In [None]:
# Pairplot - a grid of pairwise plots to see how the features relate to each other.
sns.pairplot(df)

In [None]:
# Distribution plot of the 'charges' column.
# sns.distplot is deprecated in current seaborn releases; histplot with a KDE
# overlay and density normalisation is its documented replacement.
sns.histplot(df['charges'], kde=True, stat='density')

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# df.corr() computes the correlations between the columns of a single DataFrame;
# df.corrwith() correlates the rows/columns of one DataFrame with another object.
# Select the numeric columns first: from pandas 2.0 on, corr() raises on
# non-numeric (string) columns instead of silently dropping them.
sns.heatmap(df.select_dtypes(include='number').corr())

In [None]:
# This notebook fits a plain linear regression on numeric inputs only, so the categorical
# (non-continuous) variables are left out; they would need to be encoded before they could be used.
# Therefore we only use the numeric independent variables, which are age, bmi and children.

In [None]:
# Split the data into the dependent (target) and independent (predictor) variables.
# Dependent variable (target) - charges
y = df['charges']
# Independent variables (predictors) - age, bmi, children
X = df[['age','bmi','children']]

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Create the (not yet fitted) linear regression model.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [None]:
# Fit the regression model on the training split only (the test split is kept for evaluation).
lm.fit(X_train,y_train)

In [None]:
# Read the intercept (constant term) of the fitted model and print it.
df_intercept = lm.intercept_
print(df_intercept)

In [None]:
# Fitted coefficients, tabulated with one row per feature in X.
df_coeff = lm.coef_
df_coef = pd.DataFrame({'Coefficient': df_coeff}, index=X.columns)
df_coef

In [None]:
# With the model fitted (intercept and coefficients known), predict the target for the test set.
predictions= lm.predict(X_test)

In [None]:
# Actual vs. predicted charges on the test set; the closer the points lie to
# the diagonal, the better the predictions.
plt.scatter(y_test, predictions)
plt.xlabel('Actual charges')
plt.ylabel('Predicted charges')
plt.title('Actual vs. predicted charges (test set)');  # trailing ';' hides the Text repr

In [None]:
# Distribution of the errors (actual - predicted) on the test set.
# sns.distplot is deprecated in current seaborn releases; histplot with a KDE
# overlay and density normalisation is its documented replacement.
sns.histplot(y_test - predictions, kde=True, stat='density')

In [None]:
from sklearn import metrics

# Test-set error metrics. The squared error is computed once and reused for
# the RMSE, which is simply its square root.
mse = metrics.mean_squared_error(y_test, predictions)
for label, value in (
    ('MAE:', metrics.mean_absolute_error(y_test, predictions)),
    ('MSE:', mse),
    ('RMSE:', np.sqrt(mse)),
):
    print(label, value)

In [None]:
# R-squared on the held-out test set. Scoring on the full (X, y) would include
# the rows the model was trained on and give an optimistic estimate.
lm.score(X_test, y_test)

In [None]:
# Distribution of the target variable y.
# sns.distplot is deprecated in current seaborn releases; histplot with a KDE
# overlay and density normalisation is its documented replacement.
sns.histplot(y, kde=True, stat='density')