In [13]:
from pandas import read_csv
from numpy import random, sqrt
from supabase_py import create_client, Client
from json import load
from hashlib import md5
from datetime import datetime
from pickle import dump
import pickle
from numpy import array

In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [3]:
# Run timestamp used to tag artefacts (e.g. the uploaded data name).
# strftime always emits all 20 digits (YYYYMMDDHHMMSS + 6 microsecond digits);
# str(datetime) silently drops the fractional part when microsecond == 0,
# which would make the stamp shorter on that (rare) tick.
now = datetime.today().strftime("%Y%m%d%H%M%S%f")

In [4]:
# connection to Supabase
# Read the credentials inside a context manager so the file handle is closed
# as soon as the JSON has been parsed instead of being left open.
with open('../configuration/sbConfig.json') as config_file:
    cred = load(config_file)
# cred["db"] and cred["key"] are handed to create_client as its two positional
# arguments (presumably the project URL and the API key — confirm against the config file).
supabase: Client = create_client(cred["db"], cred["key"])

In [5]:
# Directory layout, relative to this notebook. Each entry keeps its trailing
# slash because other cells build file paths by plain string interpolation.
raw_path, clean_path, model_path = (
    "../data/raw/",
    "../data/cleaned/",
    "../model/",
)

# to be changed

In [6]:
# The raw and the cleaned copy share one file name; they are told apart by
# the directory they are stored in.
raw_file = cleaned_file = "winequality-red.csv"

In [7]:
def eval_metrics(actual, pred):
    """Score a set of predictions against the true values.

    Args:
        actual: True target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of the mean squared
        error, the mean absolute error, and the R^2 score.
    """
    scores = (
        sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )
    return scores

# pre-processing

In [8]:
# Load the raw, semicolon-separated dataset.
raw_df = read_csv(f"{raw_path}{raw_file}", sep=";")
# Replace spaces in the column names with underscores. rename() returns a new
# frame, so the frame read from disk is not modified in place.
raw_df = raw_df.rename(columns=lambda name: name.replace(" ", "_"))

# TODO: cleaning steps go here (feature engineering, standardizing,
# normalizing, reducing, etc.).

# Persist the result so the later cells can start from the cleaned file.
raw_df.to_csv(f"{clean_path}{cleaned_file}", sep=";", index=False)

# after pre-processing

In [9]:
# Reload the cleaned dataset from disk into the frame used for modelling.
df = read_csv(clean_path + cleaned_file, sep=";")
df.head(5)  # preview the first five rows

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# !!!! to be changed for each training !!!!

In [11]:
# to be tracked, and updated during each run
model_name = "regression_tree_3.pkl"
model_group = "regression_tree"
target = "quality"

# Placeholder scores; they are overwritten once the model has been evaluated.
rmse = 0
mae = 0
r2 = 0

# Split on the LAST dot only, so a name with several dots (e.g. "wine.v2.csv")
# keeps its real extension when the run timestamp is inserted before it.
data_stem, data_ext = cleaned_file.rsplit(".", 1)

upload_data = {
    "target": target,
    # Every column except the target is used as a feature.
    "features": [column for column in df.columns if column != target],
    "model_group": model_group,
    "model_name": model_name,  # this is to compare all the related models
    "data_name": f"{data_stem}_{now}.{data_ext}",
    "parameters": {
        "max_depth": 20,
        # NOTE(review): newer scikit-learn releases spell this criterion
        # "absolute_error" — confirm against the installed version.
        "criterion": "mae",
        "test_split": 0.25,
        "random": 43,
        # NOTE(review): "cv" is recorded here but no visible cell reads it.
        "cv": 15,
    },
    "result": {"rmse": rmse, "mae": mae, "r2": r2},
}
upload_data

{'target': 'quality',
 'features': ['fixed_acidity',
  'volatile_acidity',
  'citric_acid',
  'residual_sugar',
  'chlorides',
  'free_sulfur_dioxide',
  'total_sulfur_dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol'],
 'model_name': 'regression_tree_3.pkl',
 'data_name': 'winequality-red_20211207104941234987.csv',
 'parameters': {'max_depth': 20,
  'criterion': 'mae',
  'test_split': 0.25,
  'random': 43,
  'cv': 15},
 'result': {'rmse': 0, 'mae': 0, 'r2': 0}}

# splitting data

In [34]:
# Separate predictors from the label with boolean masks over the column names.
# Both results stay DataFrames (y is not squeezed to a Series).
X = df.loc[:, df.columns != upload_data["target"]]
y = df.loc[:, df.columns == upload_data["target"]]

In [35]:
# Hold out a test set; the fraction and the seed come from the tracked run
# parameters so the split is reproducible and recorded with the model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=upload_data["parameters"]["test_split"],
    random_state=upload_data["parameters"]["random"],
)

# building the model

In [36]:
# Build the tree from the hyper-parameters stored in upload_data, so the
# uploaded metadata describes exactly the estimator that gets trained.
regressor = DecisionTreeRegressor(
    max_depth=upload_data["parameters"]["max_depth"],
    criterion=upload_data["parameters"]["criterion"],
    random_state=upload_data["parameters"]["random"],
)

In [37]:
# fit() returns the estimator itself, so training and prediction are chained.
predicted_qualities = regressor.fit(X_train, y_train).predict(X_test)
# Score the held-out predictions; this replaces the placeholder metrics.
rmse, mae, r2 = eval_metrics(y_test, predicted_qualities)

# save model

In [38]:
# Pickle the fitted estimator under the tracked model name.
with open(model_path + upload_data["model_name"], mode="wb") as model_file:
    dump(regressor, model_file)

# uploading data

In [61]:
# Replace the placeholder scores with the metrics measured on the test split.
upload_data["result"] = dict(rmse=rmse, mae=mae, r2=r2)
upload_data

{'target': 'quality',
 'model_name': 'regression_tree_3.pkl',
 'data_name': 'winequality-red_20211205210754006272.csv',
 'parameters': {'max_depth': 20,
  'criterion': 'mae',
  'test_split': 0.25,
  'random': 43,
  'cv': 15},
 'result': {'rmse': 0.7407766195014527,
  'mae': 0.4275,
  'r2': 0.1517809701384395}}

In [40]:
# Insert the run metadata as a new row of the "tracker" table.
upload_meta = supabase.table("tracker").insert(upload_data).execute()
# Any 4xx/5xx status is a failure. The comparison has to include 400 itself
# (Bad Request); a strict "> 400" would report that case as a success.
if upload_meta["status_code"] >= 400:
    print("FAILED To store the data, check if necessary changes like model name and paramters are updated")
else:
    print('Model data SUCCESSFULLY stored')

Model data SUCCESSFULLY stored


# uploading model (WIP)

In [None]:
# NOTE(review): work-in-progress cell, never executed (execution count is None).
#  - `from` is a reserved keyword in Python, so `.from(` is a syntax error and
#    this line cannot run as written; the real bucket accessor has to be taken
#    from the supabase_py storage documentation.
#  - `file` is not defined in any visible cell.
upload_model = supabase.storage.from('models').upload(f'{model_path}{upload_data["model_name"]}', file).execute()

# uploading data (WIP)

In [None]:
# NOTE(review): work-in-progress cell, never executed (execution count is None).
#  - `from` is a reserved keyword in Python, so `.from(` is a syntax error.
#  - `file` is not defined in any visible cell.
#  - Assigning the response to `upload_data` would rebind the name that holds
#    the run-metadata dict; a separate variable name is probably intended.
#  - The path uses the timestamped upload_data["data_name"], while the
#    pre-processing cell writes the cleaned file under `cleaned_file` —
#    TODO confirm that a file with the timestamped name actually exists.
upload_data = supabase.storage.from('data').upload(f'{clean_path}{upload_data["data_name"]}', file).execute()

# check out the results in webpage (Standalone, need not run the code)

In [21]:
# Start the Streamlit results page (../ui/app.py) as a shell command.
# NOTE(review): the recorded output is "^C", i.e. the command was interrupted
# by hand — presumably it keeps this cell busy until it is stopped.
!streamlit run ../ui/app.py

^C
