In [None]:
# MLflow client: imported by the training cell below for run tracking.
%pip install mlflow

In [None]:
# NOTE(review): boto3 is never imported directly in this notebook; presumably
# it is required by an S3-backed MLflow artifact store or DVC remote -- confirm.
%pip install boto3

In [None]:
# DVC: provides dvc.api, used below to resolve the URL of the training data.
%pip install dvc

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from dotenv import load_dotenv
from yaml import load, Loader
import yaml
from pathlib import Path
import dvc.api
import mlflow

# Populate os.environ from a local .env file (if any) before reading settings.
load_dotenv()

# Locations used by the "model 1" pipeline; each is None when the variable
# is not set in the environment.
MODEL_1_LOAD_DATA_PATH = os.getenv("MODEL_1_LOAD_DATA_PATH")
MODEL_1_TRANSFORM_DATA_PATH = os.getenv("MODEL_1_TRANSFORM_DATA_PATH")
MODEL_1_TRAIN_DATA_PATH = os.getenv("MODEL_1_TRAIN_DATA_PATH")
MODEL_1_MODEL_DATA_PATH = os.getenv("MODEL_1_MODEL_DATA_PATH")
MODEL_1_TRAIN_CONFIG_PATH = os.getenv("MODEL_1_TRAIN_CONFIG_PATH")
MODEL_1_SAVED_PARAMS = os.getenv("MODEL_1_SAVED_PARAMS")
MODEL_1_TRAIN_FILE_DATA_PATH = os.getenv("MODEL_1_TRAIN_FILE_DATA_PATH")

# Address of the MLflow tracking server that runs are reported to.
MLFLOW_ENDPOINT_URL = os.getenv("MLFLOW_ENDPOINT_URL")
mlflow.set_tracking_uri(MLFLOW_ENDPOINT_URL)

# Train a linear regression inside a single MLflow run so that the parameters,
# the three MSE metrics and the fitted model are all recorded together.
# The run id is also written to MODEL_1_SAVED_PARAMS as YAML.
with mlflow.start_run() as run:

    run_id = run.info.run_id
    print("Active run_id: {}".format(run_id))

    # NOTE(review): yaml.load with the full Loader can construct arbitrary
    # Python objects; this is only acceptable while the config file is
    # trusted. Consider yaml.safe_load if the config uses plain YAML only.
    with open(MODEL_1_TRAIN_CONFIG_PATH, "r") as conf:
        train_config = load(conf, Loader=Loader)["train_config"]

    dataset = pd.read_csv(str(Path(MODEL_1_TRANSFORM_DATA_PATH) / 'prepared_data.csv'))
    # TODO (original author's note): "Fix this later". The target is read from
    # the load-data directory while the features come from the transform
    # directory; rows are matched purely by index label below.
    target = pd.read_csv(str(Path(MODEL_1_LOAD_DATA_PATH) / "target.csv"))

    # First carve out the validation rows, then split what remains into
    # train and test. "test_size" is therefore applied to the remainder,
    # not to the full dataset.
    train_index, validation_index = train_test_split(
        dataset.index,
        test_size=train_config["validation_size"],
    )
    train_index, test_index = train_test_split(
        train_index,
        test_size=train_config["test_size"],
    )

    model = LinearRegression()
    model.fit(dataset.loc[train_index], target.loc[train_index])

    # Each metric is computed on the split it is named after. (Previously
    # the train and test indices were swapped, so the two metrics were
    # logged under each other's names.)
    train_mse = mean_squared_error(target.loc[train_index],
                                   model.predict(dataset.loc[train_index]))

    test_mse = mean_squared_error(target.loc[test_index],
                                  model.predict(dataset.loc[test_index]))

    validation_mse = mean_squared_error(target.loc[validation_index],
                                        model.predict(dataset.loc[validation_index]))

    mlflow.log_param("run_id", run_id)
    # Record where DVC resolves the training data file to.
    mlflow.log_param("data_path", str(dvc.api.get_url(path=MODEL_1_TRAIN_FILE_DATA_PATH)))

    mlflow.log_metric("train_mse", train_mse)
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("validation_mse", validation_mse)

    mlflow.sklearn.log_model(model, "model")

    # Persist the run id to the saved-params YAML file.
    params = {
        'run_id': run_id
    }

    with open(MODEL_1_SAVED_PARAMS, 'w') as pfile:
        yaml.dump(params, pfile, default_flow_style=False)
