In [55]:
# Import required libraries
# Standard library
import os
from getpass import getpass
from pathlib import Path

# Third-party: numerics, data frames, plotting
import numpy as np
import pandas as pd
import hvplot.pandas  # imported for its side effect of registering .hvplot on pandas objects

# Third-party: database access
import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Third-party: scikit-learn model, metrics (score, r2, mse, rmse, std) and splitting
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Read in the data from our relational database (RDBMS).

In [56]:
# Connect to the PostgreSQL database and fetch the smoker records.
Base = automap_base()  # NOTE(review): not used by this cell — confirm whether it is still needed

# The password is read from the PGPASSWORD environment variable, with an
# interactive prompt as the fallback, so the secret is never stored in the notebook.
db_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")

connection = psycopg2.connect(user="postgres", password=db_password,
                              host="127.0.0.1", port="5432", database="Project_four")
cursor = connection.cursor()
# NOTE(review): the connection and cursor are intentionally left open in case
# other cells reuse them; call cursor.close() and connection.close() once all
# queries in the notebook have run.

# Select age, bmi, children and charges for smokers (smoker = 1), ordered by age.
# Grouping by every selected column collapses identical rows into a single row.
cost_data = """select inscosts.age, inscosts.bmi, inscosts.children, inscosts.charges
                from InsCosts
                    Where inscosts.smoker = 1
                        group by inscosts.age, inscosts.bmi,inscosts.children,inscosts.charges
                    Order by inscosts.age ASC;
"""
cursor.execute(cost_data)
# fetchall() returns every remaining row of the result set as a list of tuples
results = cursor.fetchall()


In [57]:
# Each fetched row holds, in order: age, bmi, children, charges.
# Split the rows into one list per column so they can be assembled into a DataFrame.
# NOTE(review): int() drops the fractional part of bmi and charges. The original
# comment says the regression needs ints — confirm that this truncation is intended.
ages = [row[0] for row in results]
bmi = [int(row[1]) for row in results]
children = [row[2] for row in results]
charges = [int(row[3]) for row in results]

Use the DBMS to retrieve the information.
Start with the smoker information.

In [58]:
# Assemble the per-column lists into a DataFrame of smoker records.
# Charges are stored as a float array; the other columns stay as plain lists.
convert_charges = np.array(charges, dtype=np.float64)
smoker_dc = dict(
    age=ages,
    bmi=bmi,
    children=children,
    charges=convert_charges,
)
smoker_df = pd.DataFrame(data=smoker_dc)

# Preview the first rows
smoker_df.head()


Unnamed: 0,age,bmi,children,charges
0,18,17,2,12829.0
1,18,21,0,13747.0
2,18,21,0,14283.0
3,18,25,0,15518.0
4,18,27,3,18223.0


In [59]:
# Work on a copy so the smoker frame itself is left untouched
df = smoker_df.copy()
# Features: every column except the target
X = df.drop(columns=["charges"])
# Target: the insurance charges
y = df.loc[:, "charges"]

In [60]:
# Split features and target into training and testing parts;
# the fixed random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# (rows, columns) of the training features
X_train.shape

(205, 3)

Next, fit the data to our machine learning model.

In [61]:
# Support vector machine with a linear kernel, used as a regressor.
# `charges` is a continuous dollar amount, so the target calls for regression:
# SVC is a classifier and would treat each distinct charge value as its own
# class label. SVR is the support-vector counterpart for continuous targets.
from sklearn.svm import SVR
model = SVR(kernel='linear')
model.fit(X_train, y_train)

# Model accuracy: coefficient of determination (R^2) on the held-out test split
model.score(X_test, y_test)


We want to visualize the data to see whether it follows a positive or negative linear trend.

In [62]:
# Build the linear regression and fit it on the full feature set and target
model = LinearRegression().fit(X, y)
# Show the fitted estimator as the cell output
model

In [63]:
# Report the fitted coefficients (order: age, bmi, children), the intercept,
# and the resulting regression formula, one line each.
summary_lines = (
    f"Model's slope: {model.coef_}",
    f"Model's y-intercept: {model.intercept_}",
    f"Model's formula: y = {model.intercept_} + {model.coef_[0]} x age + {model.coef_[1]} x bmi + {model.coef_[2]} x number of children ",
)
for summary_line in summary_lines:
    print(summary_line)

Model's slope: [ 267.87976045 1435.15908046  198.5216871 ]
Model's y-intercept: -21815.675693970974
Model's formula: y = -21815.675693970974 + 267.87976044571946 x age + 1435.1590804641326 x bmi + 198.52168709857156 x number of children 


This formula means the predicted health care charges for a smoker are: -$21815.68 + $267.88 x age (age >= 18) + $1435.16 x bmi (bmi >= 0) + $198.52 x number of children (children >= 0).
According to this model, a smoker who is older, has a higher BMI, and has more children is expected to incur higher health care charges.

In [64]:
# Predict charges for every row the model was fitted on
predicted_y_values = model.predict(X)

# Truncate each prediction to a whole number (ints rather than doubles)
predict_y = [int(prediction) for prediction in predicted_y_values]

# Copy of the features with the predicted costs added as a new column
df_smoker2_predicted = X.assign(costs_predicted=predict_y)

# Display sample data
df_smoker2_predicted.head()

Unnamed: 0,age,bmi,children,costs_predicted
0,18,17,2,7800
1,18,21,0,13144
2,18,21,0,13144
3,18,25,0,18885
4,18,27,3,22351


Put the best-fit line onto our scatter plot of the real data from Kaggle.

In [65]:
# Worked example: a 50-year-old smoker with a BMI of 30 and 2 children
print("For a fifty year old person with BMI 30 and 2 children the formula would look like so:\n")
print(f"Model's formula: y = {model.intercept_:.2f} + {model.coef_[0]:.2f} x 50 + {model.coef_[1]:.2f} x 30 + {model.coef_[2]:.2f} x 2 ")

# Evaluate the regression formula term by term (coefficient order: age, bmi, children)
age_term = model.coef_[0] * 50
bmi_term = model.coef_[1] * 30
children_term = model.coef_[2] * 2
y_50 = model.intercept_ + age_term + bmi_term + children_term

# Display the prediction
print("\nThe 50 year old smoker with a BMI of 30 and 2 kids would accumulate charges of:\n")
print(f" ${y_50:.2f}")

For a fifty year old person with BMI 30 and 2 children the formula would look like so:

Model's formula: y = -21815.68 + 267.88 x 50 + 1435.16 x 30 + 198.52 x 2 

The 50 year old smoker with a BMI of 30 and 2 kids would accumulate charges of:

 $35030.13


In [66]:
# Worked example: a 40-year-old smoker with a BMI of 20 and 1 child
print("For a forty year old person with BMI 20 and 1 child the formula would look like so:\n")
print(f"Model's formula: y = {model.intercept_:.2f} + {model.coef_[0]:.2f} x 40 + {model.coef_[1]:.2f} x 20 + {model.coef_[2]:.2f} x 1 ")

# Evaluate the regression formula term by term (coefficient order: age, bmi, children)
age_term = model.coef_[0] * 40
bmi_term = model.coef_[1] * 20
children_term = model.coef_[2] * 1
y_40 = model.intercept_ + age_term + bmi_term + children_term

# Display the prediction
print("\nThe 40 year old smoker with a BMI of 20 with 1 kids would accumulate charges of:\n")
print(f" ${y_40:.2f}")

For a forty year old person with BMI 20 and 1 child the formula would look like so:

Model's formula: y = -21815.68 + 267.88 x 40 + 1435.16 x 20 + 198.52 x 1 

The 40 year old smoker with a BMI of 20 with 1 kids would accumulate charges of:

 $17801.22


Evaluate the Linear Regression Model

In [67]:
# Compute the metrics for the linear regression model, using the same
# X and y the model was fitted on.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# Standard deviation of the observed charges, shown alongside the RMSE
std = np.std(y)

# Print relevant metrics, one per line.
metric_lines = (
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
)
for metric_line in metric_lines:
    print(metric_line)

The score is 0.7541929030304663.
The r2 is 0.7541929030304663.
The mean squared error is 32624032.377295997.
The root mean squared error is 5711.745125379458.
The standard deviation is 11520.507496309401.
