In [7]:
import pandas as pd
import json
import sqlite3
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [9]:
# Load the training data: 'regression_table' is the output table of the data pipeline.
dbName = "../rest_server/medisch_centrum_randstad/data/db.sqlite3"
tableName = "regression_table"

dbConnection = sqlite3.connect(dbName)
try:
    # Run the query and turn the result into a pandas DataFrame.
    # tableName is a constant defined above (identifiers cannot be bound as
    # SQL parameters), so interpolating it into the statement is acceptable here.
    df = pd.read_sql_query(f"SELECT * FROM {tableName}", dbConnection)
finally:
    # Always release the connection, even when the query raises
    # (e.g. the table does not exist yet).
    dbConnection.close()

In [10]:
# Restrict the full table to the columns used for modelling:
# the candidate predictors plus the 'lifespan' column.
df_updated = df.loc[
    :,
    ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI', 'lifespan'],
]

In [11]:
# Candidate feature sets, one per model variant, so the resulting models can be compared.
# The chained assignment binds each inner list to its own name (v1..v5) and the
# enclosing list to version_list, so version_list[i] is the very same object as v<i+1>.
v1, v2, v3, v4, v5 = version_list = [
    # v1: all parameters
    ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar', 'BMI'],
    # v2: v1 without 'alcohol' and 'sugar'
    ['genetic', 'exercise', 'smoking', 'BMI'],
    # v3: only 'genetic' and 'smoking'
    ['genetic', 'smoking'],
    # v4: v3 plus 'exercise'
    ['genetic', 'exercise', 'smoking'],
    # v5: v1 without 'BMI'
    ['genetic', 'exercise', 'smoking', 'alcohol', 'sugar'],
]

In [3]:
# Collects one result dictionary per trained model variant.
models = list()

def train_model(dataframe, version, target='lifespan', test_size=0.3, random_state=42):
    """Fit a linear regression on selected columns and report hold-out metrics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the predictor columns named in ``version`` and the
        ``target`` column.
    version : list of str (or a single str)
        Names of the predictor columns. A bare string is treated as a
        single-column feature set.
    target : str, default 'lifespan'
        Name of the column to predict.
    test_size : float, default 0.3
        Share of the rows held out for evaluation (passed to
        ``train_test_split``).
    random_state : int, default 42
        Seed for the train/test split so results are reproducible.

    Returns
    -------
    dict
        ``'model version'`` (comma-separated predictor names),
        ``'mean squared error'``, ``'mean absolute error'``, ``'r squared'``,
        ``'root mean squared error'`` (all computed on the held-out rows),
        ``'coefficients'`` (predictor name -> fitted coefficient) and
        ``'intercept'``.
    """
    # Normalise to a list so a bare column name selects a one-column DataFrame
    # (not a Series) and is not joined character by character below.
    columns = [version] if isinstance(version, str) else list(version)

    x = dataframe[columns]
    y = dataframe.loc[:, target]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(x_train, y_train)

    predictions = model.predict(x_test)

    model_m_sqe = mean_squared_error(y_test, predictions)
    model_m_abse = mean_absolute_error(y_test, predictions)
    rmse = math.sqrt(model_m_sqe)
    r2 = r2_score(y_test, predictions)
    # Pair every predictor name with its fitted coefficient.
    coefs = dict(zip(x.columns, model.coef_))
    intercept = model.intercept_

    return {
            'model version': ', '.join(columns),
            'mean squared error': model_m_sqe,
            'mean absolute error': model_m_abse,
            'r squared': r2,
            'root mean squared error': rmse,
            'coefficients': coefs,
            'intercept': intercept
    }


    
# Quick check: fit the v1 feature set and display the resulting metrics.
train_model(dataframe=df_updated, version=v1)
    
    

NameError: name 'df_updated' is not defined

In [13]:
# Train one model per feature set. The list is rebuilt from scratch rather than
# appended to, so re-running this cell does not duplicate earlier results.
models = [train_model(df_updated, v) for v in version_list]
    

In [18]:
# Report the metrics, coefficients and intercept of every trained model variant
for summary in models:
    print(f"Variables included in model: {summary['model version']}.")
    print(f"Mean squared error of model: {summary['mean squared error']}.")
    print(f"Mean absolute error of model: {summary['mean absolute error']}.")
    print(f"R-squared of model: {summary['r squared']}.")
    print(f"Root mean squared error: {summary['root mean squared error']}.")
    print(f"Model coefficients: ")
    # One "name value" line per predictor
    for feature, weight in summary['coefficients'].items():
        print(feature, weight)
    print(f"Model intercept: {summary['intercept']}")
    print("\n")

Variables included in model: genetic, exercise, smoking, alcohol, sugar, BMI.
Mean squared error of model: 1.1931064807713065.
Mean absolute error of model: 0.8202634084892075.
R-squared of model: 0.9808153218782916.
Root mean squared error: 1.0922941365636394.
Model coefficients: 
genetic 1.0022672389022234
exercise 0.8159847848408843
smoking -0.2633469007801697
alcohol -0.2355445754523253
sugar -0.07316098348665577
BMI -0.07228858254692938
Model intercept: 1.3563983509115047


Variables included in model: genetic, exercise, smoking, BMI.
Mean squared error of model: 1.339323152477292.
Mean absolute error of model: 0.8891780780836559.
R-squared of model: 0.9784642158974629.
Root mean squared error: 1.157291299750107.
Model coefficients: 
genetic 1.0023204859319785
exercise 0.8205703404962188
smoking -0.263031724915691
BMI -0.07210406230756007
Model intercept: 0.3182525390358535


Variables included in model: genetic, smoking.
Mean squared error of model: 2.633203189435365.
Mean absolu

In [230]:
#Export the models to a pickle file (?)

[{'model version': 'genetic, exercise, smoking, alcohol, sugar, BMI', 'mean squared error': 1.1931064807713065, 'mean absolute error': 0.8202634084892075, 'r squared': 0.9808153218782916, 'root mean squared error': 1.0922941365636394, 'coefficients': {'genetic': 1.0022672389022234, 'exercise': 0.8159847848408843, 'smoking': -0.2633469007801697, 'alcohol': -0.2355445754523253, 'sugar': -0.07316098348665577, 'BMI': -0.07228858254692938}, 'intercept': 1.3563983509115047}, {'model version': 'genetic, exercise, smoking, BMI', 'mean squared error': 1.339323152477292, 'mean absolute error': 0.8891780780836559, 'r squared': 0.9784642158974629, 'root mean squared error': 1.157291299750107, 'coefficients': {'genetic': 1.0023204859319785, 'exercise': 0.8205703404962188, 'smoking': -0.263031724915691, 'BMI': -0.07210406230756007}, 'intercept': 0.3182525390358535}, {'model version': 'genetic, smoking', 'mean squared error': 2.633203189435365, 'mean absolute error': 1.2630220744385878, 'r squared': 