In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Encode the categorical columns as integers so they can be fed to a regressor.
#
# Series.map() turns every value that is missing from the mapping into NaN, so
# applying it to a column that has already been encoded would wipe the data.
# Skipping columns that are already numeric keeps this cell safe to re-run.
#
# NOTE(review): the region codes 1-4 impose an ordering on what looks like a
# nominal variable; kept as-is so downstream results do not change.
CATEGORY_CODES = {
    'sex': {'male':0,'female':1},
    'smoker': {'yes':1,'no':0},
    'region': {'southwest':1,'southeast':2,'northwest':3,'northeast':4},
}
for column, codes in CATEGORY_CODES.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [5]:
# Separate the target column from the predictors.
y = df['charges']
x = df.drop(columns=['charges'])

In [6]:
# Sanity check: features and target must have the same number of rows.
for part in (x, y):
    print(part.shape)

(1338, 6)
(1338,)


In [7]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,1,16884.92400
1,18,0,33.770,1,0,2,1725.55230
2,28,0,33.000,3,0,2,4449.46200
3,33,0,22.705,0,0,3,21984.47061
4,32,0,28.880,0,0,3,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,3,10600.54830
1334,18,1,31.920,0,0,4,2205.98080
1335,18,1,36.850,0,0,2,1629.83350
1336,21,1,25.800,0,0,1,2007.94500


In [8]:
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## Using Sklearn's Linear Regression

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split (and therefore the reported scores) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [11]:
# Confirm the sizes of the training and test partitions.
for split in (X_train, X_test):
    print(split.shape)

(1070, 6)
(268, 6)


In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
lr = LinearRegression()

In [14]:
lr.fit(X_train,y_train)

In [15]:
y_pred = lr.predict(X_test)

In [16]:
from sklearn.metrics import r2_score

In [17]:
r2_score(y_test,y_pred)

0.7445422986536503

In [18]:
lr.coef_

array([  251.36689613,    35.4338166 ,   330.76133485,   589.05862101,
       23905.96516848,   323.62760276])

In [19]:
lr.intercept_

np.float64(-13077.411399731282)

## Making our own Linear Regression Class

In [20]:
class MeraLR:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the learned parameters are exposed as ``coef_`` (one weight
    per feature column) and ``intercept_`` (the constant term). Both are
    ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        # Per-feature weights; populated by fit().
        self.coef_ = None
        # Constant offset; populated by fit().
        self.intercept_ = None

    def fit(self,X_train,y_train):
        """Estimate the intercept and coefficients from training data.

        Parameters
        ----------
        X_train : 2-D array-like (rows are samples, columns are features)
            Anything ``np.asarray`` can convert to a float array.
        y_train : array-like
            Target values, one per row of ``X_train``.

        Raises
        ------
        ValueError
            If the inputs cannot be converted to float arrays.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        # Prepend a column of ones so the first fitted coefficient is the intercept.
        X_design = np.insert(X_train,0,1,axis=1)

        # Calculate the coefficients by solving the least-squares problem
        # directly. Forming inv(X.T @ X) explicitly fails (or silently returns
        # garbage) when features are collinear and squares the condition
        # number; lstsq returns the minimum-norm solution in those cases.
        betas = np.linalg.lstsq(X_design, y_train, rcond=None)[0]
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]

    def predict(self,X_test):
        """Return predictions ``X_test @ coef_ + intercept_``.

        Parameters
        ----------
        X_test : 2-D array-like with the same columns used in ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("MeraLR instance is not fitted yet; call fit() first.")
        X_test = np.asarray(X_test, dtype=float)
        y_pred = np.dot(X_test,self.coef_) + self.intercept_
        return y_pred

In [21]:
lr = MeraLR()

In [22]:
lr.fit(X_train,y_train)

In [23]:
X_train.shape

(1070, 6)

In [24]:
np.insert(X_train,0,1,axis=1).shape

(1070, 7)

In [25]:
y_pred = lr.predict(X_test)

In [26]:
r2_score(y_test,y_pred)

0.7445422986536504

In [27]:
lr.coef_

array([  251.36689613,    35.4338166 ,   330.76133485,   589.05862101,
       23905.96516848,   323.62760276])

In [28]:
lr.intercept_

np.float64(-13077.41139973153)