In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow

In [10]:
# Point MLflow at the SQLite tracking database used by this project.
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Look the experiment up by name; create it on first use so the cell can be
# re-run safely, then keep its id for the runs started further down.
experiment_name = 'Projeto Kobe'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))
experiment_id = experiment.experiment_id

In [13]:
# Fraction of the filtered rows assigned to the training split.
train_perc = 0.8

# Data-preparation run: load the raw dataset, keep the selected columns,
# drop rows with missing values, split into train/test, persist every
# intermediate table as parquet and log the split configuration to MLflow.
with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):
    dev_data = pd.read_parquet('../../data/raw/dataset_kobe_dev.parquet')

    print('Dimensão do dataset:', dev_data.shape)

    # Keep only the selected columns and discard any row that has a missing
    # value in one of them.
    columns = ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']
    filtered_data = dev_data[columns].dropna()

    print('Dimensão do dataset filtrado:', filtered_data.shape)

    filtered_data.to_parquet('../../data/processed/data_filtered.parquet')

    # 'shot_made_flag' is the target; every other selected column is a feature.
    X = filtered_data.drop('shot_made_flag', axis=1)
    y = filtered_data['shot_made_flag']

    # Stratify on the target so both splits keep its class proportions; the
    # fixed random_state makes the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_perc, stratify=y, random_state=42)

    # Re-attach the target to its features (aligned on the index) before saving.
    X_train.join(y_train).to_parquet("../../data/processed/base_train.parquet")
    X_test.join(y_test).to_parquet("../../data/processed/base_test.parquet")

    mlflow.log_params({
        # Rounded so binary floating-point noise (1 - 0.8 evaluates to
        # 0.19999999999999996) does not end up in the logged parameter.
        'perc-teste': round(1 - train_perc, 4),
        'colunas-selecionadas': columns
        })
    mlflow.log_metrics({
        'qtd_linhas_treino': X_train.shape[0],
        'qtd_linhas_teste': X_test.shape[0]
        })

 

Dimensão do dataset: (24271, 25)
Dimensão do dataset filtrado: (20285, 7)
