In [26]:
# Import the modules
# Import required libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
import hvplot.pandas
from sklearn.linear_model import LinearRegression


import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Import relevant metrics from scikit-learn: score, r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Read in the data from our relational database (RDBMS).

In [27]:
# Connect to the PostgreSQL database that holds the insurance-cost table.
Base = automap_base()  # automap base; not referenced again in the cells shown here

# Connection settings are read from the environment instead of being
# hard-coded, so the notebook can be shared or committed without leaking the
# database password. The variable names follow libpq's conventions (PGUSER,
# PGPASSWORD, PGHOST, PGPORT, PGDATABASE). The non-secret settings fall back
# to the values this notebook used originally; the password has no fallback.
# When PGPASSWORD is unset the password argument is None and is left to the
# driver's own lookup (for example a ~/.pgpass file).
connection = psycopg2.connect(
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD"),
    host=os.environ.get("PGHOST", "127.0.0.1"),
    port=os.environ.get("PGPORT", "5432"),
    database=os.environ.get("PGDATABASE", "Project_four"),
)
# The connection and cursor are left open on purpose: a later cell reuses
# `cursor` for a second query.
cursor = connection.cursor()

# Select age, bmi, children and charges for smokers (smoker = 1), ordered by
# age. Grouping by every selected column collapses exact duplicate rows.
cost_data = """select inscosts.age, inscosts.bmi, inscosts.children, inscosts.charges
                from InsCosts
                    Where inscosts.smoker = 1
                        group by inscosts.age, inscosts.bmi,inscosts.children,inscosts.charges
                    Order by inscosts.age ASC;
"""
cursor.execute(cost_data)
results = cursor.fetchall()


In [28]:
# Split the fetched rows into one list per column.
# Row layout from the query: (age, bmi, children, charges).
# bmi and charges are cut down to whole numbers with int(), which drops any
# fractional part; age and children are stored exactly as the driver returned them.
ages = [row[0] for row in results]
bmi = [int(row[1]) for row in results]
children = [row[2] for row in results]
charges = [int(row[3]) for row in results]

Use the data retrieved from the database to build our DataFrames.
The smoker information comes first.

In [29]:
# Assemble the smoker DataFrame from the per-column lists.
# Charges are converted to a float array before being placed in the frame.
convert_charges = np.asarray(charges, dtype=float)
smoker_dc = dict(age=ages, bmi=bmi, children=children, charges=convert_charges)
smoker_df = pd.DataFrame(data=smoker_dc)
smoker_df.head()


Unnamed: 0,age,bmi,children,charges
0,18,17,2,12829.0
1,18,21,0,13747.0
2,18,21,0,14283.0
3,18,25,0,15518.0
4,18,27,3,18223.0


In [30]:
# Work on a copy so smoker_df itself is left untouched.
df = smoker_df.copy()
# Features: every column except charges. Target: the charges column.
X = df.drop(columns=["charges"])
y = df["charges"]

In [31]:
# Split features and target into train and test parts; random_state=1 pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Show (rows, columns) of the training features.
X_train.shape

(205, 3)

Next put the data into our machine learning model

In [32]:
# Support vector machine linear classifier
# NOTE(review): SVC is a classifier, while y_train holds the charges column
# (a monetary amount). Each distinct charge is therefore treated as its own
# class label, which is consistent with the 0.000 test accuracy recorded
# below. The cell is kept as the notebook's example of an unsuitable model.
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model Accuracy


In [33]:
# Score the fitted model on the held-out test split and print it to three decimals.
test_accuracy = model.score(X_test, y_test)
print('Test Acc: %.3f' % test_accuracy)

Test Acc: 0.000


This model is unacceptable and should not be used, other than to show what does not work.

Using the regression prediction model let us further examine this dataset

In [34]:
# Second query: one row per age for smokers (smoker = 1), with that age's
# charges added together as sum_charges; rows come back ordered by age.
# NOTE(review): sum_charges is a per-age total, so it also reflects how many
# smoker rows each age has. Confirm a total (rather than an average) is intended.
sum_cost_data = """select inscosts.age, (sum(inscosts.charges) ) as sum_charges
                from InsCosts
                    Where inscosts.smoker = 1
                        group by inscosts.age
                    Order by inscosts.age ASC;"""
# Reuses the cursor opened in the connection cell; `results` is overwritten
# with the rows of this query.
cursor.execute(sum_cost_data)
results = cursor.fetchall()


Use this data to build the X and y for our regression formula.

In [35]:
# Split the aggregated rows into one list per column.
# Row layout from the query: (age, sum_charges). Values are stored as returned.
ages = [row[0] for row in results]
charges = [row[1] for row in results]

In [36]:
# Assemble the per-age DataFrame from the two lists.
# The summed charges are converted to an integer array first.
convert_charges = np.asarray(charges, dtype=int)
smoker2_dc = dict(age=ages, charges=convert_charges)
smoker2_df = pd.DataFrame(data=smoker2_dc)
smoker2_df.head()


Unnamed: 0,age,charges
0,18,305684
1,19,486022
2,20,221168
3,21,33301
4,22,224239


We want to visualize the data to see if there is a positive or negative linear progression of the data

In [37]:
# Scatter plot of the summed charges against age for the smoker group.
# Axis limits and title are collected first, then passed to hvplot in one call.
scatter_options = dict(
    x="age",
    y="charges",
    xlim=(17, 70),
    ylim=(25000, 500000),
    title="Charges per age",
)
smoker2_plot = smoker2_df.hvplot.scatter(**scatter_options)
smoker2_plot

The data do not appear to trend in either a positive or a negative direction; they look stationary. That makes this a bad candidate for modeling any relationship.

In [38]:
# Feature matrix: the age column reshaped into a single-column 2-D array.
X = smoker2_df["age"].to_numpy().reshape(-1, 1)

# Show the first five rows.
X[:5]

array([[18],
       [19],
       [20],
       [21],
       [22]], dtype=int64)

In [39]:
# Target: the summed charges column; show its first five entries.
y = smoker2_df.loc[:, "charges"]
y.iloc[:5]

0    305684
1    486022
2    221168
3     33301
4    224239
Name: charges, dtype: int32

Build the model regardless of the bad looking graph

In [40]:
# Linear regression of the summed charges (y) on age (X).
# This rebinds `model`, replacing the classifier fitted earlier.
model = LinearRegression()
model.fit(X=X, y=y)

In [41]:
# Display the fitted line's parameters.
# coef_ is printed as an array in the first line; its single entry is the
# slope for age and is used on its own in the formula line.
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [-655.90587419]
Model's y-intercept: 213737.67275670674
Model's formula: y = 213737.67275670674 + -655.905874190564X


Plot the best fit line for the cost model

In [42]:
# Predictions for the same ages the model was fitted on.
predicted_y_values = model.predict(X)

# Cut each prediction down to a whole number (int() drops the fractional part).
predict_y = [int(value) for value in predicted_y_values]

# Leave smoker2_df intact: attach the predicted costs to a copy instead.
df_smoker2_predicted = smoker2_df.copy()
df_smoker2_predicted["costs_predicted"] = predict_y

# Display sample data
df_smoker2_predicted.head()

Unnamed: 0,age,charges,costs_predicted
0,18,305684,201931
1,19,486022,201275
2,20,221168,200619
3,21,33301,199963
4,22,224239,199307


In [43]:
# Draw the predicted costs against age as a red line (the fitted regression line).
best_fit_line = df_smoker2_predicted.hvplot.line(x="age", y="costs_predicted", color="red")
best_fit_line

Put the best-fit line onto our scatter plot of the real data from Kaggle. It should not look good as a linear regression.

In [44]:
# Superpose the original data and the best fit line
# The * operator combines the scatter plot and the line plot into one figure.
smoker2_plot * best_fit_line

Looks like a very slow decline. The R-squared will probably be poor, meaning we could not reject the null hypothesis that smokers' insurance costs have no linear relationship with age.

In [45]:
# Next, make the manual predictions.

In [46]:
# Manual prediction from the fitted line for a single age.
# The age is named once so that the printed formula and the computed value
# cannot disagree (the formula previously printed "* 100" while the
# prediction was made for age 50).
age_to_predict = 50

# Display the formula with the age plugged in.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {age_to_predict}")

# Evaluate intercept + slope * age by hand.
y_50 = model.intercept_ + model.coef_[0] * age_to_predict

# Display the prediction
print(f"Predicted costs of the 50 year old smoker group: ${y_50:.2f}")

Model's formula: y = 213737.67275670674 + -655.905874190564 * 100
Predicted costs of the 50 year old smoker group: $180942.38


In [47]:
# Ages to predict smoker costs for: 20 through 70 in five-year steps.
# This matches the array shown in the cell's recorded output; the previous
# literal ([25, 35, 45, 55, 65, 70]) agreed with neither that output nor its
# own comment. The stop value 71 makes 70 the last age included.
X_costs = np.arange(20, 71, 5)

# Format the array as a one-column array
X_costs = X_costs.reshape(-1, 1)

# Display sample data
X_costs

array([[20],
       [25],
       [30],
       [35],
       [40],
       [45],
       [50],
       [55],
       [60],
       [65],
       [70]])

In [48]:
# Predict costs
# Runs the fitted model over the one-column age array X_costs; one prediction per age.
predicted_costs = model.predict(X_costs)

In [49]:
# Pair each age with its predicted cost in a two-column DataFrame.
# X_costs is a one-column 2-D array, so it is flattened to 1-D for the "ages" column.
ages_flat = X_costs.ravel()
df_predicted_costs = pd.DataFrame(
    {"ages": ages_flat, "predicted_costs": predicted_costs}
)

# Display data
df_predicted_costs

Unnamed: 0,ages,predicted_costs
0,20,200619.555273
1,25,197340.025902
2,30,194060.496531
3,35,190780.96716
4,40,187501.437789
5,45,184221.908418
6,50,180942.379047
7,55,177662.849676
8,60,174383.320305
9,65,171103.790934


Evaluate the Linear Regression Model

In [50]:
# Goodness-of-fit metrics, computed on the same data the model was fitted to.
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
r2 = r2_score(y, predicted_y_values)
# model.score is called with its default sample_weight (None); the recorded
# output shows it equal to r2.
score = model.score(X, y)
# Spread of the observed charges, for comparison with the RMSE.
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.011074764360821776.
The r2 is 0.011074764360821776.
The mean squared error is 7068541771.597527.
The root mean squared error is 84074.62025842.
The standard deviation is 84544.07651931803.


The R-squared (about 0.011) is far below 0.80, which means this is a bad model and we cannot use it to predict future insurance costs for smokers.
In other words, smokers' age and insurance costs do not have any significant relationship per this model.