# **Loading data**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
# Relative location of the insurance dataset inside the Kaggle input directory.
insurance_csv_path = '../input/insurance/insurance.csv'
data = pd.read_csv(insurance_csv_path)

# **Exploring Data**

In [None]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Number of rows per distinct value of the 'region' column.
data['region'].value_counts()

* **Check for missing values**

In [None]:
# Report missing values per column BEFORE dropping anything; checking after the
# drop would always print zeros and hide how much data was incomplete.
print(data.isnull().sum())

# Drop incomplete rows and rebuild a contiguous 0..n-1 index. The one-hot frame
# built later in this notebook is concatenated by index, so gaps left by dropped
# rows would misalign the two frames and introduce NaNs.
data = data.dropna().reset_index(drop=True)

# **preprocessing data**

* **transform categorical data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per categorical column, so each keeps its own fitted classes.
le_sex, le_smoker, le_region = (LabelEncoder() for _ in range(3))

In [None]:
# Fit each encoder on its source column and store the integer codes in a new
# '<column>_encoded' column next to the original text column.
for column, encoder in (('sex', le_sex), ('smoker', le_smoker), ('region', le_region)):
    data[column + '_encoded'] = encoder.fit_transform(data[column])

In [None]:
# Print the first five rows; the extra "\n" argument adds a blank line after the table.
print(data.head(5) , "\n")

* **One hot encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe_region = OneHotEncoder()
# The encoder needs 2-D input, so the integer region codes are reshaped to one column.
region_codes = data['region_encoded'].values.reshape(-1, 1)
# Encode and convert the result to a dense array with one column per region code.
arr_ohe_region = ohe_region.fit_transform(region_codes).toarray()

**Convert the one-hot encoded array to a DataFrame and append it to the existing DataFrame**

In [None]:
# Name the one-hot columns region_0 .. region_{k-1}, one per column of the encoded array.
region_columns = ['region_' + str(i) for i in range(arr_ohe_region.shape[1])]

# Reuse data's own index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would misalign rows (and create NaNs) whenever data's index has gaps.
dfOneHot = pd.DataFrame(arr_ohe_region, columns=region_columns, index=data.index)

# Drop region_* columns left over from a previous run of this cell so that
# re-running it does not produce duplicate column names.
data = pd.concat([data.drop(columns=region_columns, errors='ignore'), dfOneHot], axis=1)


In [None]:
# Remove the raw text columns, the intermediate integer region code, and the first
# one-hot column (region_0). Every other column of data is kept as-is.
dropped_columns = ['sex', 'smoker', 'region', 'region_encoded', 'region_0']
preprocessed_data = data.drop(columns=dropped_columns)

print(preprocessed_data.head())

* **Feature scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
standard_x = StandardScaler()
# Fit the scaler once and keep its learned mean/std.
x_train = standard_x.fit_transform(preprocessed_data)
# Reuse the already-fitted statistics with transform(); calling fit_transform()
# again would re-learn them, which is the wrong pattern for any "test" array.
# NOTE(review): both arrays are built from the same full frame (every column of
# preprocessed_data, not just the features) -- confirm this is intended.
x_test = standard_x.transform(preprocessed_data)

In [None]:
# Column dtypes and non-null counts of the preprocessed frame. This frame itself is
# NOT scaled; the scaled values live in the arrays returned by the scaler.
# DataFrame.info() prints its report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
preprocessed_data.info()

# **data split**

In [None]:
from sklearn.model_selection import train_test_split
variables = ['age','sex_encoded','bmi','children','smoker_encoded','region_1','region_2','region_3']
features = preprocessed_data[variables]
Y = data['charges']

# Split BEFORE scaling so the test rows never influence the scaler's statistics.
features_train, features_test, y_train, y_test = train_test_split(
    features, Y, test_size=0.2, random_state=44
)

# Learn mean/std on the training rows only, then apply the same transform to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(features_train)
X_test = sc.transform(features_test)

# Full feature matrix scaled with the training statistics, kept under the original name X.
X = sc.transform(features)

# **Apply Models**

# Model 1 -> Decision Tree Regressor

In [None]:

from matplotlib import pyplot as plt




from sklearn.ensemble import ExtraTreesRegressor






from sklearn.model_selection import cross_val_score

import statsmodels.formula.api as sm

import seaborn as sns
import warnings

In [None]:
from sklearn.tree import DecisionTreeRegressor
# criterion='mse' was renamed to 'squared_error' (deprecated in scikit-learn 1.0,
# removed in 1.2). Squared error is the default criterion under either name, so
# omitting the argument keeps the same model and works on old and new versions.
Model_1 = DecisionTreeRegressor(splitter='best', max_depth=10, random_state=33)



In [None]:
# Train the decision tree on the training split.
Model_1.fit(X_train, y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors, score() is R^2).
Model_1.score(X_test,y_test)


In [None]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out split; y_pred is reused by the metric cells that follow.
y_pred = Model_1.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import mean_squared_error 
# Mean squared error between the test targets and the current y_pred.
mean_squared_error(y_test, y_pred)

In [None]:
from sklearn.metrics import median_absolute_error
# Median absolute error between the test targets and the current y_pred.
median_absolute_error(y_test, y_pred)

#  Model 2 -> Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises TypeError on current versions. The features are already
# standardized with StandardScaler before fitting, so it is simply dropped.
Model_2 = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1)


In [None]:
# Train the linear regression on the training split.
Model_2.fit(X_train , y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors, score() is R^2).
Model_2.score(X_test,y_test)

In [None]:
# Overwrite y_pred with the linear model's predictions; the metric cells below reuse it.
y_pred = Model_2.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean squared error between the test targets and the current y_pred.
mean_squared_error(y_test, y_pred)

In [None]:
# Median absolute error between the test targets and the current y_pred.
median_absolute_error(y_test, y_pred)

# Model 3 -> KNN

In [None]:
from sklearn import neighbors

# 10 neighbours weighted by distance; Minkowski metric with p=2.
knn_settings = dict(n_neighbors=10, weights='distance', p=2, metric='minkowski')
Model_3 = neighbors.KNeighborsRegressor(**knn_settings)

In [None]:
# Train the k-nearest-neighbours regressor on the training split.
Model_3.fit(X_train , y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors, score() is R^2).
Model_3.score(X_test,y_test)

In [None]:
# Predict with the KNN model (Model_3). The previous code reused Model_2 here, so
# every "KNN" error metric was actually the linear regression's error.
y_pred = Model_3.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
# Mean squared error between the test targets and the y_pred set in the preceding cell.
mean_squared_error(y_test, y_pred)

In [None]:
# Median absolute error between the test targets and the y_pred set in the preceding cell.
median_absolute_error(y_test, y_pred)