In [3]:
#linear regression using multiple variables. 
#In this case, rather than plotting a line, you’re plotting a plane in multiple dimensions.
#The phenomenon is still referred to as linear since the data grows at a linear rate.
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


In [11]:
dataset=pd.read_csv('insurance.csv')
non_smokers=dataset[dataset['smoker']=='no']


In [14]:
x=non_smokers[['age', 'bmi']]
y=non_smokers['charges']


In [15]:
x_train, x_test, y_train, y_test=train_test_split(x, y, shuffle=True, train_size=0.3)

In [16]:
multiVar_model=LinearRegression()
multiVar_model.fit(x_train, y_train)

LinearRegression()

In [19]:
predictions=multiVar_model.predict(x_test)
r2=r2_score(y_test, predictions)
rmse=mean_squared_error(y_test, predictions, squared=False)
print(r2)
print(rmse)

0.3681154395347175
4759.375911432801


In [None]:
'''
predict() must follow fit(). fit() builds a model that tries to find a pattern that maps input 
data to the labels. At this stage the input data is called the training set. 
predict() simply asks your trained model to use those patterns to map new inputs to their labels. 
They are the model's best guess, given what it was previously trained on, not random. 
Optimizing the quality of those patterns so that the predictions are as accurate as possible is 
the whole art and science of machine learning.

Imagine I wanted to make you an expert at identifying cat breed. At first you might have no idea. 
So first I have to train you. I show you a bunch of pictures and tell you what each cat 
breed is (label). After a while, you start to see patterns and you take notes. Eventually you 
start to feel like you get even the finer distinctions and feel ready to identify the cat breed
of any cat shown to you. Your new knowledge is called a model. This is what fit() does. 
It builds a model through training.

Now I start showing you pictures of cats without telling you the breed (label). 
Using your new knowledge, notes and patterns (i.e. your model), you can now "predict" the 
breed of a cat you haven't seen before (provided it's one of the breeds you've learned about 
during training). This is what predict() does. It runs data through the model to get predictions.

'''

In [40]:
#Creating new Variables
dataset['Smoker_int']=dataset['smoker'].map({'yes':1, 'no':0})
x=dataset[['age', 'bmi', 'Smoker_int']]
y=dataset['charges']

In [41]:
x_train, x_test, y_train, y_test=train_test_split(x, y, shuffle=True, train_size=0.3)
multiVar_model2=LinearRegression()
multiVar_model2.fit(x_train, y_train)

LinearRegression()

In [50]:
predictions=multiVar_model2.predict(x_test)
r2=r2_score(y_test, predictions)
rmse=mean_squared_error(y_test, predictions, squared=False)

In [61]:
print(r2)
print(rmse)

0.7557561893584342
5938.276198490477


In [57]:
#If you’re satisfied with the data, you can actually turn the linear model into a function. 
# With this function, you can then pass in new data points to make predictions about what a person’s charges may be.
#The model has two attributes:
 #   .coef_ which stores an array of the coefficients of our model
 #   .intercept_ which stores the y-intercept of our linear model
coefficients=multiVar_model2.coef_
intercept=multiVar_model2.intercept_
print(coefficients)
# 3 coeffs.. because we have 3 dependent variables and 1 independent var 
print(intercept)

[  277.79989422   381.24916954 23001.28195803]
-13867.302318147033


In [62]:
def calculate_charges(age, bmi, smoker):
  return (age * coefficients[0]) + (bmi * coefficients[1]) + (smoker * coefficients[2]) + intercept
  # y = (m1*x1)+(m2*x2)+(m3*x3) + c

In [65]:
#NOW WE CAN MAKE PREDICTIONS
print(calculate_charges(19, 22.9, 0))
# the person would likely have just under $141 of charges of insurance!

141.50165456571813
