# Linear Regression 2

### Initialize global variables

In [1]:
# Seed shared by the stochastic steps of this notebook (passed to
# train_test_split below) so that results are reproducible across runs.
# A `global` statement is not needed here: names assigned at the top level
# of a notebook cell are already module-level globals.
random_state = 42

### Import libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Load data
Columns
* age: age of primary beneficiary
* gender: female=0, male=1
* bmi: body mass index
* children: # of children covered by health insurance
* smoker: yes=1, no=0
* region: the beneficiary's residential area in the US (northeast, southeast, southwest, or northwest)
* charges: individual medical costs billed by health insurance

In [3]:
# Load the insurance dataset (insurance_rates.csv) from the data/ folder into a dataframe
df = pd.read_csv(filepath_or_buffer='data/insurance_rates.csv')

### Examine data

In [4]:
# Dimensions of the dataset as (rows, columns)
df.shape

(1338, 7)

In [5]:
# Column labels present in the loaded file
df.columns

Index(['age', 'gender', 'smoker', 'bmi', 'children', 'region', 'charges'], dtype='object')

In [6]:
# Data type of each column; 'region' is the only non-numeric (object) column
df.dtypes

age           int64
gender        int64
smoker        int64
bmi         float64
children      int64
region       object
charges     float64
dtype: object

In [7]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,age,gender,smoker,bmi,children,region,charges
0,19,0,1,27.9,0,southwest,16884.924
1,18,1,0,33.77,1,southeast,1725.5523
2,28,1,0,33.0,3,southeast,4449.462
3,33,1,0,22.705,0,northwest,21984.47061
4,32,1,0,28.88,0,northwest,3866.8552


In [8]:
# Preview the last rows of the dataset
df.tail()

Unnamed: 0,age,gender,smoker,bmi,children,region,charges
1333,50,1,0,30.97,3,northwest,10600.5483
1334,18,0,0,31.92,0,northeast,2205.9808
1335,18,0,0,36.85,0,southeast,1629.8335
1336,21,0,0,25.8,0,southwest,2007.945
1337,61,0,1,29.07,0,northwest,29141.3603


### Prepare data for model training

In [9]:
# Drop non-numeric columns: region
# Reassign instead of mutating with inplace=True, and tolerate the column
# already being absent, so this cell can be re-run without raising KeyError.
df = df.drop(columns='region', errors='ignore')
df.head()

Unnamed: 0,age,gender,smoker,bmi,children,charges
0,19,0,1,27.9,0,16884.924
1,18,1,0,33.77,1,1725.5523
2,28,1,0,33.0,3,4449.462
3,33,1,0,22.705,0,21984.47061
4,32,1,0,28.88,0,3866.8552


### Separate independent and dependent variables

In [10]:
# Target (dependent variable): the billed charges
y = df['charges']

# Features (independent variables): every column other than 'charges'
X = df.drop(columns='charges')

In [11]:
# Feature matrix dimensions as (rows, feature columns)
X.shape

(1338, 5)

In [12]:
# Confirm the features are held in a DataFrame
type(X)

pandas.core.frame.DataFrame

In [13]:
# Preview the first rows of the feature matrix ('charges' is no longer present)
X.head()

Unnamed: 0,age,gender,smoker,bmi,children
0,19,0,1,27.9,0
1,18,1,0,33.77,1
2,28,1,0,33.0,3
3,33,1,0,22.705,0
4,32,1,0,28.88,0


In [14]:
# Target vector length; it should match the number of rows in X
y.shape

(1338,)

In [15]:
# Confirm the target is held in a Series
type(y)

pandas.core.series.Series

In [16]:
# Preview the first values of the target
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

### Split data into training and test sets

In [17]:
# Hold out 30% of the rows as the test set; the remaining 70% train the model.
# The shared seed keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

### Train Linear Regression model

In [18]:
# Instantiate LinearRegression model with its default settings
model = LinearRegression()

In [19]:
# Train the linear regression on the training split only
model.fit(X=X_train, y=y_train)

### Calculate model performance for training and test sets

In [20]:
# Evaluate the fitted model on the same data it was trained on
y_train_predict = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_predict)
rmse = np.sqrt(mse)  # RMSE is in the same units as 'charges'

print("Model performance for training set")
print("----------------------------------")
print(f"MSE is {round(mse, 2)}")
print(f"RMSE is {round(rmse, 2)}")

Model performance for training set
----------------------------------
MSE is 37878481.86
RMSE is 6154.55


In [21]:
# Evaluate the fitted model on the held-out test split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as 'charges'

print("Model performance for test set")
print("------------------------------")
print(f"MSE is {round(mse, 2)}")
print(f"RMSE is {round(rmse, 2)}")

Model performance for test set
------------------------------
MSE is 34003912.39
RMSE is 5831.29


### Review model intercept and coefficients

In [22]:
# Fitted intercept: the model's prediction when every feature is zero
model.intercept_

-12538.439849853134

In [23]:
# Fitted coefficients, one per feature, in the column order of the training
# features (age, gender, smoker, bmi, children)
model.coef_

array([  261.91061673,   136.65119758, 23618.76182167,   333.36099462,
         432.1792927 ])

In [24]:
# Smallest observed value of each feature (column-wise)
min_values = X.min(axis=0)
min_values

age         18.00
gender       0.00
smoker       0.00
bmi         15.96
children     0.00
dtype: float64

In [25]:
# Largest observed value of each feature (column-wise)
max_values = X.max(axis=0)
max_values

age         64.00
gender       1.00
smoker       1.00
bmi         53.13
children     5.00
dtype: float64

### Execute model with new independent variable values
Values set by professor

In [26]:
# Feature values for the first hypothetical beneficiary
age_1 = 31
gender_1 = 1
smoker_1 = 1
bmi_1 = 30
children_1 = 2

# Build a one-row frame keyed by column name, order it like the training
# features, and let the fitted model do the arithmetic. This avoids
# hard-coding coefficient positions (coef_[0], coef_[1], ...), which would
# silently give wrong results if the feature columns were ever reordered.
features_1 = pd.DataFrame([{
    'age': age_1,
    'gender': gender_1,
    'smoker': smoker_1,
    'bmi': bmi_1,
    'children': children_1,
}])[X.columns]

# predict() returns one value per input row; take the single result
predicted_charges = model.predict(features_1)[0]
print('Predicted charges =', round(predicted_charges,2))

Predicted charges = 30201.39


In [27]:
# Feature values for the second hypothetical beneficiary
age_2 = 46
gender_2 = 0
smoker_2 = 0
bmi_2 = 23
children_2 = 1

# Build a one-row frame keyed by column name, order it like the training
# features, and let the fitted model do the arithmetic. This avoids
# hard-coding coefficient positions (coef_[0], coef_[1], ...), which would
# silently give wrong results if the feature columns were ever reordered.
features_2 = pd.DataFrame([{
    'age': age_2,
    'gender': gender_2,
    'smoker': smoker_2,
    'bmi': bmi_2,
    'children': children_2,
}])[X.columns]

# predict() returns one value per input row; take the single result
predicted_charges = model.predict(features_2)[0]
print('Predicted charges =', round(predicted_charges,2))

Predicted charges = 7608.93
