In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
X = df.drop(columns = 'charges')
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [5]:
y = df['charges']
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [6]:
df['region'].value_counts()
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [7]:
# X = pd.get_dummies(X, columns = ['sex', 'region'], drop_first = True)
X['smoker'] = X['smoker'].map({'yes': 1, 'no':0})
X['region'] = X['region'].map({'southeast': 0, 'southwest':1, 'northwest':2 , 'northeast':3})
X['sex'] = X['sex'].map({'male': 1, 'female':0})



In [8]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,1
1,18,1,33.77,1,0,0
2,28,1,33.0,3,0,0
3,33,1,22.705,0,0,2
4,32,1,28.88,0,0,2


In [9]:
from sklearn.preprocessing import RobustScaler
sc = RobustScaler()
X[['age','bmi']] = sc.fit_transform(X[['age','bmi']])
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-0.833333,0,-0.297708,0,1,1
1,-0.875,1,0.40131,1,0,0
2,-0.458333,1,0.309616,3,0,0
3,-0.25,1,-0.916344,0,0,2
4,-0.291667,1,-0.181006,0,0,2


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [13]:
print('shape of X_train is: ', X_train.shape)
print('shape of X_test is: ', X_test.shape)
print('shape of y_train is: ',y_train.shape)
print('shape of y_test is: ',y_test.shape)

shape of X_train is:  (936, 6)
shape of X_test is:  (402, 6)
shape of y_train is:  (936,)
shape of y_test is:  (402,)


In [15]:
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree = 3)

In [21]:
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)

In [22]:
X_train_poly

array([[ 1.        , -0.875     ,  0.        , ...,  0.        ,
         0.        , 27.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        ,  0.54166667,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  0.04166667,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -0.83333333,  1.        , ...,  0.        ,
         0.        ,  8.        ],
       [ 1.        , -0.25      ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [23]:
X_test_poly

array([[1.        , 0.54166667, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.33333333, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.375     , 1.        , ..., 2.        , 4.        ,
        8.        ],
       ...,
       [1.        , 0.66666667, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.04166667, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.95833333, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [24]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_poly , y_train)

In [25]:
y_pred = lr.predict(X_test_poly)

In [26]:
y_pred

array([11466.13484703, 10051.2341496 , 48452.85528553, 13416.64645263,
       12760.99641691,  5995.92833487,  3245.79772741, 13931.18217551,
        8677.43097014,  7095.86026149,  8443.56991068, 11080.67141724,
        8377.25013341,  6452.17528376, 25098.5675326 , 12910.5219518 ,
       13952.83002454,  7870.00426782,  7913.26449089, 26536.15840035,
       26156.03312315, 14965.85021263, 11036.06710099, 28643.09085462,
        4907.47545615,  9601.68525758,  6864.62336749,  8419.85026077,
        5603.36006436, 10200.23466172,  8774.63006342, 52649.90973831,
       13638.53768324, 11708.01638512, 13779.61341626,  5605.4068797 ,
       10260.53170783, 37512.84644185, 38187.75296895,  2028.31695346,
        7456.70428393,  4669.61291484, 25300.57912008, 47264.77216652,
       34918.52488744,  6867.40745289, 12905.60457858,  8186.62494294,
        7624.37253657, 15155.69852984,  5217.64156209,  7225.20196886,
       31080.81380256, 49709.45677079, 12115.30514506,  6068.05374776,
      

In [27]:
train_acc = lr.predict(X_train_poly)
train_acc

array([ 3403.34288617,  6668.64722198, 11522.34745159,  8069.05824889,
        6221.14644725,  1684.59350529, 32151.51969513, 13673.38685252,
        7780.73562215, 12716.90799683,  9994.23965932, 12649.15719082,
       50642.85045004,  6104.90387599, 10373.5039034 ,  4033.59420935,
       18010.75273815,  5204.29106456,  7598.35610184,  4412.66287637,
       12813.09604392,  7502.23560674, 14279.33573043, 37985.93464568,
        6425.95551921, 29114.7308649 , 15632.63358176, 11140.23400826,
        8650.4203742 , 45347.99067898, 10005.84812604,  4048.39712903,
        8849.77051499, 17279.21038674, 35620.8398642 , 23460.27437584,
       10054.70353876, 36734.37126566,  3421.49789655, 19535.30451137,
        3648.68440562, 35058.41983735,  5931.95091185, 16049.604404  ,
        2547.30243161, 37846.24492439,  9442.11983299,  4789.75930193,
        4457.2950954 , 10100.94307789, 11170.82888447,  8916.98086938,
       12099.00908074,  5456.57288591,  2320.1809666 ,  4319.33329517,
      

In [28]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8675861156970905

In [29]:
sc.transform([[24,53.38763]])



array([[-0.625     ,  2.73743733]])

In [31]:
for var in range(1,5):
    pf = PolynomialFeatures(degree=var)
    X_train_poly = pf.fit_transform(X_train)
    X_test_poly = pf.transform(X_test)
    lr = LinearRegression()
    model = lr.fit(X_train_poly,y_train)
    y_pred = lr.predict(X_test_poly)
    print(r2_score(y_test,y_pred))

0.7910886918703965
0.87567247336226
0.8675861156970905
0.8178584592893029
