In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#reading the dataset

In [None]:
# Load the raw insurance records from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/insurance-premium-prediction/insurance.csv"
)

In [None]:
#exploring the size of dataset

In [None]:
# (number of rows, number of columns) of the data as loaded from the CSV.
df.shape

In [None]:
#viewing the head of dataset

In [None]:
# Preview the first 25 rows of the raw data.
df.head(25)

In [None]:
#getting information about the dataset

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
#describing the dataset / statistical overview of the dataset

In [None]:
# Summary statistics for every column; include="all" also covers the non-numeric ones.
df.describe(include = "all")

In [None]:
#looking for null values in the dataset

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Check for duplicates: count the rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Discard repeated rows, keeping the first occurrence of each (the default behaviour).
df = df.drop_duplicates()
df

In [None]:
# Shape of the frame after the duplicate rows were dropped.
df.shape

In [None]:
#data preprocessing

In [None]:
#1.data_cleaning

In [None]:
# List the column headings of the frame.
df.columns

In [None]:
# First, collect the names of the columns holding non-numerical values:
# these are the features that need to be encoded.
non_numeric = df.select_dtypes(exclude=[np.number])
df_category_columns = non_numeric.columns

In [None]:
# Show the names of the non-numeric columns collected above.
df_category_columns

In [None]:
# Go through the categorical features one by one and inspect their values,
# starting with the category counts of "sex".
df["sex"].value_counts()

In [None]:
# Encode the "sex" category as integers for the model: male -> 0, female -> 1.
# The explicit astype(int) guarantees a numeric dtype: relying on replace() to
# downcast the object column is deprecated in recent pandas, and without the
# cast "sex" could stay an object column that the numeric-only steps further
# down (select_dtypes(include=np.number)) would skip.
df["sex"] = df["sex"].replace({"male": 0, "female": 1}).astype(int)

In [None]:
# First five values of the encoded "sex" feature.
df["sex"].head()

In [None]:
# Next feature to be encoded is "smoker": inspect its category counts.
df["smoker"].value_counts()

In [None]:
# Encode the "smoker" category as integers for the model: yes -> 1, no -> 0.
# The explicit astype(int) guarantees a numeric dtype: relying on replace() to
# downcast the object column is deprecated in recent pandas, and without the
# cast "smoker" could stay an object column that the numeric-only steps further
# down (select_dtypes(include=np.number)) would skip.
df["smoker"] = df["smoker"].replace({"yes": 1, "no": 0}).astype(int)
df['smoker']

In [None]:
# Next feature to be encoded is "region": inspect its category counts.
df["region"].value_counts()

In [None]:
# Integer-encode the categorical columns listed in var_mod (currently only "region").
from sklearn.preprocessing import LabelEncoder

var_mod = ['region']
le = LabelEncoder()
for column in var_mod:
    df[column] = le.fit_transform(df[column])

In [None]:
# Show the "region" column after label encoding.
df['region']

In [None]:
# Display the frame after sex, smoker and region have been encoded as integers.
df

In [None]:
# One-hot encode the label-encoded "region" column into region_<code> indicator columns.
# dtype=int keeps the indicators numeric: since pandas 2.0 the default dummy dtype
# is bool, and bool columns are skipped by the select_dtypes(include=np.number)
# calls further down (numeric-column check and VIF computation).
df = pd.get_dummies(df, columns=["region"], dtype=int)

In [None]:
# Display the frame after one-hot encoding "region".
df

In [None]:
# Sanity check that every feature now holds numerical data:
# list the columns whose dtype is numeric.
numeric_part = df.select_dtypes(include=[np.number])
df_number_columns = numeric_part.columns
df_number_columns

In [None]:
import seaborn as sns

In [None]:
# Heatmap of the pairwise column correlations, with each coefficient annotated in its cell.
correlations = df.corr()
sns.heatmap(data=correlations, annot=True)

In [None]:
# Data exploration: graphical overview of the encoded data, one panel per pair of columns.
sns.pairplot(df)

# Insights from the above graph:

# 1. Expenses range from about 1k to 60k.

# 2. People with a BMI between 25 and 45 have the highest expenses.

# 3. People with 0 children have higher expenses.

# 4. People who don't smoke have high expenses.
#    NOTE(review): check this against the smoker panels - the direction may be reversed; confirm.

# 5. Only a few people have very high expenses.

In [None]:
# Check for multicollinearity using the variance inflation factor (VIF).
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Independent variables: every numeric or boolean column except the target.
# Boolean columns are selected explicitly because pd.get_dummies produces bool
# indicators on pandas >= 2.0, which a numeric-only selection silently drops;
# the float cast hands statsmodels one homogeneous numeric matrix.
X = (
    df.select_dtypes(include=[np.number, "bool"])
    .drop(columns=["expenses"])
    .astype(float)
)

# One VIF per feature, computed from that feature's column position in X.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, position)
            for position in range(X.shape[1])
        ],
    }
)

print(vif_data)

In [None]:
# The features have already been reduced, so omitting more may not be necessary;
# alternatively, features with a VIF above 10 can be dropped.
# Here the region_2 indicator column is removed.
df = df.drop(columns=["region_2"])

In [None]:
# Display the frame after the region_2 column was removed.
df

In [None]:
# Save the preprocessed data to a new CSV file, leaving out the row index.
df.to_csv(path_or_buf="insurance_prem_pred_preprocess.csv", index=False)

In [None]:
# Load the preprocessed data back from the CSV written in the previous step.
df_preprocessed = pd.read_csv(
    filepath_or_buffer="insurance_prem_pred_preprocess.csv"
)

In [None]:
# Preview the first 20 rows of the reloaded preprocessed data.
df_preprocessed.head(20)

In [None]:
#now we create a train/test split to build and validate our model
import sklearn

from sklearn.model_selection import train_test_split

In [None]:
# Input / independent features: every column except the target.
X = df_preprocessed.drop(columns=["expenses"])

# Output / dependent target feature the model has to predict.
y = df_preprocessed["expenses"]


In [None]:
# Hold out 33% of the rows as test data; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Print the shape of each split, in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# Create an unfitted linear regression model with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the linear model on the training split.
model.fit(X_train,y_train)

In [None]:
# Coefficients learned by the fitted model, in the column order of X_train.
model.coef_

In [None]:
# Intercept term learned by the fitted model.
model.intercept_

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    The denominator is clamped to machine epsilon, so a zero in ``y_true`` no
    longer produces ``nan`` (0/0) or ``inf``: an exact prediction of a zero
    target contributes 0, and a wrong one contributes a very large finite
    value. Results for non-zero targets are unchanged.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard against division by zero while leaving non-zero targets untouched.
    denominator = np.maximum(np.abs(y_true), np.finfo(float).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [None]:
# Error metrics on the training data.
pred_train = model.predict(X_train)

train_mae = mean_absolute_error(y_train, pred_train)
train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
# 10-fold cross-validation; the scorer returns negated RMSE values, hence the abs().
score = cross_val_score(
    model, X_train, y_train, cv=10, scoring="neg_root_mean_squared_error"
)
train_cv_rmse = np.abs(score).mean()
train_mape = mean_absolute_percentage_error(y_train, pred_train)

print("Mean Absolute Error of train data = ", train_mae)
print("RMSE of train data = ", train_rmse)
print("Cross validation Score  = ", train_cv_rmse)
print("Mean Absolute Percentage Error of train data = ", train_mape)

In [None]:
# Error metrics on the held-out test data.
# The metrics use the raw predictions: taking abs() of them would mirror any
# negative prediction onto the positive side, understating the real error and
# making these numbers incomparable with the train metrics.
pred_test = model.predict(X_test)
print("Mean Absolute Error of test data = ", mean_absolute_error(y_test, pred_test))
print("RMSE of test data = ", np.sqrt(mean_squared_error(y_test, pred_test)))
# NOTE(review): cross_val_score refits clones of the model on folds of the test
# split, so this is not a score of the model fitted above - confirm it is wanted.
score = cross_val_score(model, X_test, y_test, cv=10, scoring="neg_root_mean_squared_error")
print("Cross validation Score  = ", np.mean(np.abs(score)))
print("Mean Absolute Percentage Error of test data = ", mean_absolute_percentage_error(y_test, pred_test))