# importing libraries

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# reading dataset

In [40]:
dataset=pd.read_csv('insurance.csv')

# checking header level details

In [41]:
print(dataset.head())
print(type(dataset))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>


# checking missing values if any

In [42]:
print(dataset.isnull().sum())
print(type(dataset))

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>


# categorial encoding(Label Encoding)

In [43]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
dataset['sex']=le.fit_transform(dataset['sex'])
dataset['smoker']=le.fit_transform(dataset['smoker'])

In [44]:
print(dataset)
print(type(dataset))

      age  sex     bmi  children  smoker     region      charges
0      19    0  27.900         0       1  southwest  16884.92400
1      18    1  33.770         1       0  southeast   1725.55230
2      28    1  33.000         3       0  southeast   4449.46200
3      33    1  22.705         0       0  northwest  21984.47061
4      32    1  28.880         0       0  northwest   3866.85520
...   ...  ...     ...       ...     ...        ...          ...
1333   50    1  30.970         3       0  northwest  10600.54830
1334   18    0  31.920         0       0  northeast   2205.98080
1335   18    0  36.850         0       0  southeast   1629.83350
1336   21    0  25.800         0       0  southwest   2007.94500
1337   61    0  29.070         0       1  northwest  29141.36030

[1338 rows x 7 columns]
<class 'pandas.core.frame.DataFrame'>


# splitting into X and y

In [46]:
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values

# categorial encoding (OneHotEncoder)

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[5])],remainder='passthrough')
X=ct.fit_transform(X)

In [52]:
print(X.shape)
print(X)

(1338, 9)
[[0.0 0.0 0.0 ... 27.9 0 1]
 [0.0 0.0 1.0 ... 33.77 1 0]
 [0.0 0.0 1.0 ... 33.0 3 0]
 ...
 [0.0 0.0 1.0 ... 36.85 0 0]
 [0.0 0.0 0.0 ... 25.8 0 0]
 [0.0 1.0 0.0 ... 29.07 0 1]]


In [53]:
print(y.shape)

(1338,)


# splitting into train and test sets

In [54]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=7)


# building model with Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [56]:
y_pred=lr.predict(X_test)

# checking accuracy of the model built

In [57]:
from sklearn.metrics import r2_score
acc=r2_score(y_test,y_pred)
print(acc)

0.7509741262661104


# building model with polynomial regression

## checking shape of X and y for building the model

In [58]:
print(X.shape)

(1338, 9)


In [59]:
print(y.shape)

(1338,)


# hypothesis testing with X_train

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
acc_dict={}
for i in range(2,8,1):
    poly=PolynomialFeatures(degree=i)
    new_X=poly.fit_transform(X_train)
    lr=LinearRegression()
    lr.fit(new_X,y_train)
    y_pred=lr.predict(new_X)
    acc_dict[i]=r2_score(y_train,y_pred)
    

In [61]:
print(acc_dict)

{2: 0.8526009520121021, 3: 0.8657809464287982, 4: 0.881342916784969, 5: 0.9150064473796354, 6: 0.9526504047991611, 7: 0.9642151773849016}


# so for degree=5, getting highest accuracy so will apply polynomial regression for n=5

In [62]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=5)
degree_5=poly.fit_transform(X_train)
lr=LinearRegression()
lr.fit(degree_5,y_train)
y_pred=lr.predict(degree_5)
acc=r2_score(y_train,y_pred)
print(acc)

0.9150064473796354


# predicting for a new user input

In [63]:
print(dataset[0:1,0:-1])

[[ 0.   0.   0.   1.  19.   0.  27.9  0.   1. ]]


In [66]:
new_data=input('enter age,sex,bmi,children,smoker,region in order ')


enter age,sex,bmi,children,smoker,region in order 19,female,27.9,0,yes,southwest


In [67]:
age,sex,bmi,children,smoker,region=new_data.split(',')

In [68]:
print(age)

19


In [69]:
age=int(age)
bmi=float(bmi)
children=int(children)
if sex.lower()=='male':
    sex=1
else:
    sex=0
if smoker.lower()=='yes':
    smoker=1
else:
    smoker=0

In [70]:
data=[[age,sex,bmi,children,smoker,region],]

In [71]:
data=ct.transform(data)

In [72]:
res=lr.predict(poly.transform(data))

In [73]:
print(res)

[16280.47728157]
