In [1]:
import pandas as pd
import numpy as np
import requests 
import io

from sklearn.ensemble import RandomForestClassifier
from  sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn

In [2]:
# Raw-gist URL of the Titanic CSV; the path appears to embed a fixed revision id,
# which presumably keeps the downloaded data stable between runs.
dataset = 'https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv'

# Bound the request time and fail loudly on an HTTP error, so that an error
# page is never handed to the CSV parser.
response = requests.get(dataset, timeout=30)
response.raise_for_status()

csv_raw = response.content
csv_io = io.StringIO(csv_raw.decode('utf8'))
df = pd.read_csv(csv_io)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep only the numeric columns and discard the passenger identifier.
# Built as a single non-inplace expression; errors='ignore' lets the cell be
# re-run after 'PassengerId' has already been removed.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
df = df[numeric_columns].drop(columns=['PassengerId'], errors='ignore')
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [4]:
# Count the missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [5]:
# Summary statistics of the 'Age' column (describe() skips the missing entries).
df.loc[:, 'Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [6]:
# Replace missing ages with the column mean, then re-check the null counts.
# NOTE(review): the mean is computed on the full frame, before the train/test
# split performed later in the notebook -- confirm this is acceptable.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
df.isnull().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [7]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [8]:
# Persist the cleaned dataset so it can be logged as an MLflow artifact.
# index=False keeps the default row counter from being written as an extra
# unnamed column in the saved file.
df.to_csv('titanic_dataset.csv', index=False)

In [9]:
# Split the data into features (X) and label (y).
# Selecting by column name instead of by position keeps the split correct
# even if the column order of df changes.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [10]:
# Split the data into train and test sets: 30% held out, fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Select the MLflow experiment that the runs below are recorded under.
mlflow.set_experiment(experiment_name="Titanic_Testing")

INFO: 'MLflow_Testing' does not exist. Creating a new experiment


In [12]:
from sklearn.tree import DecisionTreeClassifier

# Baseline run: a default decision tree, logged with its data, hyperparameters,
# test metrics and the fitted model.
# start_run() returns the run it opens, and the `with` block ends that run on
# exit, so neither mlflow.active_run() nor mlflow.end_run() is needed.
with mlflow.start_run(run_name='baseline-model') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # tag identifying the run
    mlflow.set_tag("tag","decision-tree: baseline model")

    # store the data that was used to train the model
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")

    # also store the train and test splits (the later runs re-log these files)
    pd.concat([X_train, y_train], axis=1).to_csv('train_dataset.csv')
    pd.concat([X_test, y_test], axis=1).to_csv('test_dataset.csv')

    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # modelling
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)

    # model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # log the model hyperparameters to MLflow
    mlflow.log_params(tree.get_params())

    # log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # log the fitted model to MLflow
    mlflow.sklearn.log_model(tree, artifact_path="sklearn-model")

Active run_id: b84901d522674f8f8e335bb402d264d9


In [13]:
# Random-forest run with default hyperparameters.
# start_run() returns the run it opens, and the `with` block ends that run on
# exit, so neither mlflow.active_run() nor mlflow.end_run() is needed.
with mlflow.start_run(run_name='random-forest') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # tag identifying the run
    mlflow.set_tag("tag",'random-forest')

    # store the data that was used to train the model
    # (the train/test CSV files are written by the baseline-model cell)
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # modelling
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # log the model hyperparameters to MLflow
    mlflow.log_params(rf.get_params())

    # log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # log the fitted model to MLflow
    mlflow.sklearn.log_model(rf, artifact_path="sklearn-model")

Active run_id: b92306b7a411451a888ee457919b8d0b


In [14]:
from sklearn.model_selection import GridSearchCV

# Random-forest run tuned with an exhaustive grid search (10-fold CV, F1 score).
# start_run() returns the run it opens, and the `with` block ends that run on
# exit, so neither mlflow.active_run() nor mlflow.end_run() is needed.
with mlflow.start_run(run_name='rf-grid-search') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # tag identifying the run
    mlflow.set_tag("tag",'rf-grid-search')

    # store the data that was used to train the model
    # (the train/test CSV files are written by the baseline-model cell)
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # modelling
    # "auto" is left out of max_features: for classifiers it was an alias of
    # "sqrt", and newer scikit-learn releases reject it. With error_score=0
    # those failed fits would be scored 0 rather than raising.
    grid = {"n_estimators":[100, 200, 250],
            "criterion":["entropy"],
            "class_weight":["balanced", "balanced_subsample"],
            "bootstrap":[True, False],
            "max_features":["sqrt", "log2"]
            }
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                               cv=10, scoring='f1', error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best parameter combination on the full training
    # set by default (refit=True), so the fitted winner is reused directly
    # rather than being rebuilt and trained a second time.
    best_model = grid_result.best_estimator_
    y_pred = best_model.predict(X_test)

    # model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # log the model hyperparameters to MLflow
    mlflow.log_params(best_model.get_params())

    # log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # log the fitted model to MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="sklearn-model")

Active run_id: 82b7590fb3da41079bc3288aea13eb52


MissingConfigException: Yaml file '/home/user/Documents/git/aulas_light_aplicadas/mlflow_intro/mlruns/1/82b7590fb3da41079bc3288aea13eb52/meta.yaml' does not exist.

In [None]:
# Second tuned random-forest run, searching tree count, depth and split size.
# NOTE(review): the run name says "random-search" but the search below is an
# exhaustive GridSearchCV -- confirm which one is intended.
# start_run() returns the run it opens, and the `with` block ends that run on
# exit, so neither mlflow.active_run() nor mlflow.end_run() is needed.
with mlflow.start_run(run_name='rf-random-search-2') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # tag identifying the run
    mlflow.set_tag("tag",'rf-random-search-2')

    # store the data that was used to train the model
    # (the train/test CSV files are written by the baseline-model cell)
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # modelling
    grid = {"n_estimators":[20, 60, 80],
            "max_depth": [5, 10, 15],
            "min_samples_split": [2, 5, 7, 10]
           }
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                               cv=10, scoring='f1', error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best parameter combination on the full training
    # set by default (refit=True), so the fitted winner is reused directly
    # rather than being rebuilt and trained a second time.
    best_model = grid_result.best_estimator_
    y_pred = best_model.predict(X_test)

    # model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # log the model hyperparameters to MLflow
    mlflow.log_params(best_model.get_params())

    # log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # log the fitted model to MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="sklearn-model")