In [15]:
# Import the modules
# Import required libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
import hvplot.pandas
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine, func
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

from sklearn.linear_model import LinearRegression
# Import relevant metrics from scikit-learn: score, r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

Read in the data from our relational database (PostgreSQL).

In [16]:
# Connect to the PostgreSQL database and pull the cost rows used by the model.

# Declarative base from SQLAlchemy's automap extension.
# NOTE(review): none of the visible cells use `Base`; kept in case other cells do.
Base = automap_base()

# The password is read from the environment so that no secret is stored in the
# notebook. The non-secret settings fall back to this project's usual values.
db_password = os.environ.get("PGPASSWORD")
if not db_password:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

connection = psycopg2.connect(
    user=os.environ.get("PGUSER", "postgres"),
    password=db_password,
    host=os.environ.get("PGHOST", "127.0.0.1"),
    port=os.environ.get("PGPORT", "5432"),
    database=os.environ.get("PGDATABASE", "Project_four"),
)
# NOTE(review): `connection` and `cursor` are left open so other cells can reuse
# them; call cursor.close() and connection.close() once the notebook is done.
cursor = connection.cursor()

# Select age, bmi, children and charges for the rows where smoker = 0.
# Grouping by every selected column collapses exact duplicate rows, and the
# result is ordered by age ascending.
cost_data = """select inscosts.age, inscosts.bmi, inscosts.children, inscosts.charges
                from InsCosts
                    Where inscosts.smoker = 0
                        group by inscosts.age, inscosts.bmi,inscosts.children,inscosts.charges
                    Order by inscosts.age ASC;
"""
cursor.execute(cost_data)
# One tuple per row, in the column order of the SELECT list.
results = cursor.fetchall()


In [17]:
# Split the fetched rows into one list per column.
# Each row follows the SELECT order: (age, bmi, children, charges).
# bmi and charges go through int(), which drops any fractional part.
# NOTE(review): LinearRegression also accepts floats, so the truncation is not
# required by the model; it is kept here so downstream numbers stay the same.
ages = [row[0] for row in results]
bmi = [int(row[1]) for row in results]
children = [row[2] for row in results]
charges = [int(row[3]) for row in results]

Build a DataFrame from the rows returned by the database.
The query above keeps only rows where smoker = 0, so that group is handled first.

In [18]:
# Assemble the per-column lists into a DataFrame, storing charges as floats.
# NOTE(review): the names say "smoker", but the visible query that fills these
# lists filters on smoker = 0 -- confirm the intended naming.
convert_charges = np.array(charges, dtype=float)
smoker_dc = {
    "age": ages,
    "bmi": bmi,
    "children": children,
    "charges": convert_charges,
}
smoker_df = pd.DataFrame(smoker_dc)
smoker_df.head()


Unnamed: 0,age,bmi,children,charges
0,18,15,0,1694.0
1,18,20,0,1607.0
2,18,21,0,1702.0
3,18,21,2,11884.0
4,18,22,0,1704.0


In [19]:
# Work on a copy so smoker_df itself is never modified.
df = smoker_df.copy()
# Features: every column except the target.
X = df.drop(columns=["charges"])
# Target: the insurance charges.
y = df["charges"]

In [20]:
# Hold out a test partition. random_state pins the shuffle so the split is
# repeatable; test_size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(797, 3)

Next, fit our machine learning models to the data.

In [21]:
# Support vector machine baseline with a linear kernel.
# `charges` is a continuous target, so this uses the regressor (SVR); SVC is a
# classifier and would treat the charge values as class labels.
# It is stored under its own name because the following cell assigns `model`
# to the linear regression, which would otherwise replace this fit.
svr_model = SVR(kernel="linear")
svr_model.fit(X_train, y_train)

In [22]:
# Least-squares linear regression of charges on the columns of X.
# NOTE(review): this is fitted on the full X / y, not on X_train / y_train.
model = LinearRegression().fit(X, y)
# fit() returns the estimator itself, so this shows the same repr as before.
model

In [23]:
# Report the fitted parameters and spell out the regression equation.
# Coefficients follow the column order of X: age, bmi, children.
intercept = model.intercept_
age_coef, bmi_coef, children_coef = model.coef_

print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {intercept}")
print(f"Model's formula: y = {intercept} + {age_coef} x age + {bmi_coef} x bmi + {children_coef} x number of children ")

Model's slope: [265.23227981   5.06960469 580.23379725]
Model's y-intercept: -2797.282769287358
Model's formula: y = -2797.282769287358 + 265.23227981216843 x age + 5.069604692216493 x bmi + 580.2337972450348 x number of children 


In [24]:
# Predict charges for every row of X.
predicted_y_values = model.predict(X)

# Whole-number version of the predicted costs (int() drops the fractional part).
predict_y = [int(value) for value in predicted_y_values]

# New frame: the features plus a column holding the predicted costs.
df_smoker2_predicted = X.assign(costs_predicted=predict_y)

# Display sample data
df_smoker2_predicted.head()

Unnamed: 0,age,bmi,children,costs_predicted
0,18,15,0,2052
1,18,20,0,2078
2,18,21,0,2083
3,18,21,2,3243
4,18,22,0,2088


Put the best-fit line from the linear model onto our scatter plot of the real data from Kaggle.

In [25]:
# Worked example: age 50, BMI 30, 2 children plugged into the fitted equation.
b0 = model.intercept_
b_age, b_bmi, b_children = model.coef_

# Show the equation with the example values substituted in.
print(f"Model's formula: y = {b0:.2f} + {b_age:.2f} x 50 + {b_bmi:.2f} x 30 + {b_children:.2f} x 2 ")

print("For a fifty year old person with BMI 30 and 2 children the formula would look like so:")

# Evaluate the equation by hand, term by term.
y_50 = b0 + (b_age * 50) + (b_bmi * 30) + (b_children * 2)

# Show the predicted cost.
print(f" ${y_50:.2f}")

Model's formula: y = -2797.28 + 265.23 x 50 + 5.07 x 30 + 580.23 x 2 
For a fifty year old person with BMI 30 and 2 children the formula would look like so:
 $11776.89


In [26]:
# Worked example: age 40, BMI 20, 1 child plugged into the fitted equation.
b0 = model.intercept_
b_age, b_bmi, b_children = model.coef_

# Show the equation with the example values substituted in.
print(f"Model's formula: y = {b0:.2f} + {b_age:.2f} x 40 + {b_bmi:.2f} x 20 + {b_children:.2f} x 1 ")

print("For a forty year old person with BMI 20 and 1 child the formula would look like so:")

# Evaluate the equation by hand, then show the predicted cost.
y_40 = b0 + (b_age * 40) + (b_bmi * 20) + (b_children * 1)
print(f" ${y_40:.2f}")

Model's formula: y = -2797.28 + 265.23 x 40 + 5.07 x 20 + 580.23 x 1 
For a forty year old person with BMI 20 and 1 child the formula would look like so:
 $8493.63


Evaluate the Linear Regression Model

In [28]:
# Fit-quality metrics for the linear regression.
# NOTE(review): these are computed on the same X / y the model was fitted on,
# so they are in-sample figures rather than held-out test scores.
std = np.std(y)  # spread of the observed charges, as a yardstick for the RMSE
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
r2 = r2_score(y, predicted_y_values)
score = model.score(X, y)  # sample_weight is left at its default of None

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.4075721981097189.
The r2 is 0.4075721981097189.
The mean squared error is 21257435.43009376.
The root mean squared error is 4610.578643738089.
The standard deviation is 5990.150288751164.
