In [55]:
# Import the modules
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression


import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Import relevant metrics from scikit-learn: score, r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Read in the data from our RDBMS or our relational database

In [56]:
#for using a dbms system
Base = automap_base()

connection = psycopg2.connect(user="postgres", password="Corey1234",
                                  host="127.0.0.1", port="5432", database="Project_four")
cursor = connection.cursor()

# Query all passengers
cost_data = """select *
  from TobaccoUse
        Where TobaccoUse.location1 <> ''
    group by  TobaccoUse.tobaccoId, TobaccoUse.Years, TobaccoUse.StateRegion,
	TobaccoUse.EveryDay,TobaccoUse.SomeDays,
	TobaccoUse.formerSmoker,TobaccoUse.neverSmoker,TobaccoUse.location1
                    Order by TobaccoUse.Years, TobaccoUse.stateregion ASC;
"""
cursor.execute(cost_data)
results = cursor.fetchall()


In [57]:
years = []
states = []
smoker = []
occasional = []
smokes = 0
occ = 0
#within the return set get our lists and then create a dataframe
for x in results:
        # go through the returned cursor and create a pandas dataframe out of it
        #     print(x[0])
        # get the return cursor build lists for our dictionary
        #need to be all ints for a regression
        years.append(int(x[1]))
        smokes = x[3]*100
        states.append(x[2])
        smoker.append(int(smokes))
        occ = x[4] * 100
        occasional.append(int(occ))

Using the DBMS system to get information
Smoker information first

In [58]:
smoker_dc = {}
# create the data frame based on our lists 
convert_smoker = np.array(smoker, dtype=float)
convert_occ = np.array(occasional, dtype=float)

smoker_dc = {"years": years, "state":states,"smoker":smoker, "occasional": occasional}
smoker_df = pd.DataFrame(smoker_dc)
smoker_df.head()


Unnamed: 0,years,state,smoker,occasional
0,1995,Alabama,22,2
1,1995,Alaska,22,3
2,1995,Arizona,20,2
3,1995,Arkansas,23,2
4,1995,California,13,3


In [59]:
df = smoker_df.copy()
y = df["smoker"]
X = df.drop(columns=["smoker","state"])

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(626, 2)

Next put the data into our machine learning model

In [61]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model Accuracy


In [62]:
#X_test
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.096


This model is unaccpetable and should not be used other than showing what does not work

Using the regression prediction model let us further examine this dataset

In [63]:
smokers_df = df.copy()
smoker_plot_df = smokers_df.drop(columns='state',axis=1)
smoker_plot_df.head()

Unnamed: 0,years,smoker,occasional
0,1995,22,2
1,1995,22,3
2,1995,20,2
3,1995,23,2
4,1995,13,3


We want to visualize the data to see if there is a positive or negative linear progression of the data

In [64]:
#show the scatterplot
# Create a scatter plot with the sales information
smoker2_plot = smoker_plot_df.hvplot.scatter(
    x="years",
    y="smoker",
    xlim=(1994, 2011), 
    ylim=(0, 40),
    title="USA smokers by year"
)
smoker2_plot

The data does not look bound and moving in a negative or positive direction... It looks stationary. A bad candidate for modeling any relationship

In [65]:
X = smoker_plot_df["years"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[1995],
       [1995],
       [1995],
       [1995],
       [1995]], dtype=int64)

In [66]:
y = smoker_plot_df["smoker"]
y[:5]

0    22
1    22
2    20
3    23
4    13
Name: smoker, dtype: int64

Build the model regardless of the bad looking graph

In [67]:
# Create a model with scikit-learn
model = LinearRegression()
# Fit the data into the model
model.fit(X, y)

In [68]:
# Display the slope
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [-0.47507824]
Model's y-intercept: 967.9701301514266
Model's formula: y = 967.9701301514266 + -0.4750782436761481X


Plot the best fit line for the cost model

In [69]:
predicted_y_values = model.predict(X)
df_smoker2_predicted = smoker_plot_df.copy()

# Add a column with the predicted sales values
predict_y = []  #make them ints not doubles
for i in predicted_y_values:
    predict_y.append(int(i))

df_smoker2_predicted["smokers_predicted"] = predict_y

# Display sample data
df_smoker2_predicted.head()

Unnamed: 0,years,smoker,occasional,smokers_predicted
0,1995,22,2,20
1,1995,22,3,20
2,1995,20,2,20
3,1995,23,2,20
4,1995,13,3,20


In [70]:
best_fit_line = df_smoker2_predicted.hvplot.line(
    x = "years",
    y = "smokers_predicted",
    color = "red"
)
best_fit_line

put the best fit linear line into our scatter plot or real data from kaggle... Should not be good in terms of linear regressions

In [71]:
# Superpose the original data and the best fit line
smoker2_plot * best_fit_line

Looks like a very slow decline... Probably be a bad r squred. Meaning we could not reject a null hypothesis of smoker decline as times progress.

In [72]:
# Display the formula to predict the sales with 100 ads
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} / 100")

# Predict the sales with 100 ads
y_50 = (model.intercept_ + model.coef_[0])/100 

# Display the prediction
print(f"Predicted smokers in 2011 year would be: {y_50:.2f}%")

Model's formula: y = 967.9701301514266 + -0.4750782436761481 / 100
Predicted smokers in 2011 year would be: 9.67%


In [73]:
# Create an array to predict smokers for thenext 12 years
X_costs = np.array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,2019,2020,2021,2022,2023,2024])

# Format the array as a one-column array
X_costs = X_costs.reshape(-1,1)

# Display sample data
X_costs

array([[2011],
       [2012],
       [2013],
       [2014],
       [2015],
       [2016],
       [2017],
       [2018],
       [2019],
       [2020],
       [2021],
       [2022],
       [2023],
       [2024]])

In [74]:
# Predict decline
predicted_costs = model.predict(X_costs)

In [75]:
# Create a DataFrame for the predicted costs
df_predicted_costs = pd.DataFrame(
    {
        "years": X_costs.reshape(1, -1)[0],
        "predicted_smokers": predicted_costs
    }
)

# Display data
df_predicted_costs

Unnamed: 0,years,predicted_smokers
0,2011,12.587782
1,2012,12.112704
2,2013,11.637626
3,2014,11.162547
4,2015,10.687469
5,2016,10.212391
6,2017,9.737313
7,2018,9.262234
8,2019,8.787156
9,2020,8.312078


Evaluate the Linear Regression Model

In [76]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print releveant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.29992154558247863.
The r2 is 0.29992154558247863.
The mean squared error is 11.065598345719783.
The root mean squared error is 3.3264994131548833.
The standard deviation is 3.975704479793855.
