In [94]:
# Import the modules
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression


import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Import relevant metrics from scikit-learn: score, r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Read in the data from our RDBMS or our relational database

In [95]:
#for using a dbms system
Base = automap_base()

connection = psycopg2.connect(user="postgres", password="Corey1234",
                                  host="127.0.0.1", port="5432", database="Project_four")
cursor = connection.cursor()

    # Query all passengers
cost_data = """select inscosts.age, inscosts.bmi, inscosts.children, inscosts.charges
                from InsCosts
                    Where inscosts.smoker = 1
                        group by inscosts.age, inscosts.bmi,inscosts.children,inscosts.charges
                    Order by inscosts.age ASC;
"""
cursor.execute(cost_data)
results = cursor.fetchall()


In [96]:
#age	bmi	children	smoker	charges
ages = []
bmi = []
children = []
charges = []
#within the return set get our lists and then create a dataframe
for x in results:
        # go through the returned cursor and create a pandas dataframe out of it
        #     print(x[0])
        # get the return cursor build lists for our dictionary
        #need to be all ints for a regression
        ages.append(x[0])
        bmi.append(int(x[1]))
        children.append(x[2])
        charges.append(int(x[3]))

Using the DBMS system to get information
Smoker information first

In [97]:
smoker_dc = {}
# create the data frame based on our lists 
convert_charges = np.array(charges, dtype=float)
smoker_dc = {"age": ages, "bmi":bmi,"children":children, "charges": convert_charges}
smoker_df = pd.DataFrame(smoker_dc)
smoker_df.head()


Unnamed: 0,age,bmi,children,charges
0,18,17,2,12829.0
1,18,21,0,13747.0
2,18,21,0,14283.0
3,18,25,0,15518.0
4,18,27,3,18223.0


In [98]:
df = smoker_df.copy()
y = df["charges"]
X = df.drop(columns="charges")

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(205, 3)

Next put the data into our machine learning model

In [100]:
sum_cost_data = """select inscosts.bmi, (sum(inscosts.charges) ) as sum_charges
                from InsCosts
                    Where inscosts.smoker = 1
                        group by inscosts.bmi
                    Order by inscosts.bmi ASC;"""
cursor.execute(sum_cost_data)
results = cursor.fetchall()

In [101]:
#age	bmi	children	smoker	charges
bmi = []
charges = []
#within the return set get our lists and then create a dataframe
for x in results:
        # go through the returned cursor and create a pandas dataframe out of it
        #     print(x[0])
        # get the return cursor build lists for our dictionary
        bmi.append(x[0])
        charges.append(x[1])

In [107]:
bmi_dc = {}
# create the data frame based on our lists 
convert_charges = np.array(charges,dtype=float)

#bmi_dc = {"bmi": bmi_measure, "charges": convert_charges}
#bmi_df = pd.DataFrame(bmi_dc)

#bmi_df.head()

In [109]:
bmi_score = np.array(bmi,dtype=float)


In [110]:
bmi_dc = {"bmi": bmi_score, "charges": convert_charges}
bmi_df = pd.DataFrame(bmi_dc)

bmi_df.head()

Unnamed: 0,bmi,charges
0,17.195,14455.64405
1,17.29,12829.4551
2,17.765,32734.1863
3,17.955,15006.57945
4,18.3,19023.26


In [111]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model Accuracy


In [112]:
#X_test
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.000


This model is unaccpetable and should not be used other than showing what does not work

Using the regression prediction model let us further examine this dataset

Use this data to build our X and y formula

We want to visualize the data to see if there is a positive or negative linear progression of the data

In [121]:
#show the scatterplot
# Create a scatter plot with the sales information
bmi_plot = bmi_df.hvplot.scatter(
    x="bmi",
    y="charges",
    xlim=(15, 50), 
    ylim=(10000, 100000),
    title="Charges per bmi body mass index score"
)
bmi_plot

The data does not look bound and moving in a negative or positive direction... It looks stationary. A bad candidate for modeling any relationship

In [122]:
X = bmi_df["bmi"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[17.195],
       [17.29 ],
       [17.765],
       [17.955],
       [18.3  ]])

In [123]:
y = bmi_df["charges"]
y[:5]

0    14455.64405
1    12829.45510
2    32734.18630
3    15006.57945
4    19023.26000
Name: charges, dtype: float64

Build the model regardless of the bad looking graph

In [124]:
# Create a model with scikit-learn
model = LinearRegression()
# Fit the data into the model
model.fit(X, y)

In [125]:
# Display the slope
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [1707.99430618]
Model's y-intercept: -11614.362319344677
Model's formula: y = -11614.362319344677 + 1707.99430618145X


Plot the best fit line for the cost model

In [126]:
predicted_y_values = model.predict(X)
df_bmi_predicted = bmi_df.copy()

# Add a column with the predicted sales values
predict_y = []  #make them ints not doubles
for i in predicted_y_values:
    predict_y.append(i)

df_bmi_predicted["costs_predicted"] = predict_y

# Display sample data
df_bmi_predicted.head()

Unnamed: 0,bmi,charges,costs_predicted
0,17.195,14455.64405,17754.599775
1,17.29,12829.4551,17916.859235
2,17.765,32734.1863,18728.15653
3,17.955,15006.57945,19052.675448
4,18.3,19023.26,19641.933484


In [128]:
best_fit_line = df_bmi_predicted.hvplot.line(
    x = "bmi",
    y = "costs_predicted",
    color = "red"
)
best_fit_line

put the best fit linear line into our scatter plot or real data from kaggle... Should not be good in terms of linear regressions

In [129]:
# Superpose the original data and the best fit line
bmi_plot * best_fit_line

Looks like a moderate growth... Probably be an ok r squred.

In [None]:
#Next make the manual predictions

In [131]:
# Display the formula to predi
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 100")

# Predict the sales with 100 ads
y_35 = model.intercept_ + model.coef_[0] * 35

# Display the prediction
print(f"Predicted costs of the 50 year old smoker group: ${y_35:.2f}")

Model's formula: y = -11614.362319344677 + 1707.99430618145 * 100
Predicted costs of the 50 year old smoker group: $48165.44


In [135]:
# Create an array to predict smoker costs for bmi in ten year increments
X_costs = np.array([ 10, 20, 30, 40, 50])

# Format the array as a one-column array
X_costs = X_costs.reshape(-1,1)

# Display sample data
X_costs

array([[10],
       [20],
       [30],
       [40],
       [50]])

In [136]:
# Predict costs
predicted_costs = model.predict(X_costs)

In [137]:
# Create a DataFrame for the predicted costs
df_predicted_costs = pd.DataFrame(
    {
        "bmi": X_costs.reshape(1, -1)[0],
        "predicted_costs": predicted_costs
    }
)

# Display data
df_predicted_costs

Unnamed: 0,bmi,predicted_costs
0,10,5465.580742
1,20,22545.523804
2,30,39625.466866
3,40,56705.409928
4,50,73785.35299


Evaluate the Linear Regression Model

In [138]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print releveant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.2599295498137477.
The r2 is 0.2599295498137477.
The mean squared error is 362016479.73070335.
The root mean squared error is 19026.730663219663.
The standard deviation is 22117.07234588049.


The r squared is less than .80 which means a bad model and can not use this system to predict future insurance costs for smokers.
In other words smokers bmi and insurance costs do not have any significant relationship per this model.  While it makes sense the costs would be lower for a smoker with a small Body Mass Index or BMI this model simply does not have the mathematical strength to reject a null hypothesis.