In [None]:
import pandas as pd
import numpy as np
import requests 
import io

from sklearn.ensemble import RandomForestClassifier
from  sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn

In [None]:
# Titanic CSV, addressed through a gist URL that pins a specific revision hash.
dataset = 'https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv'

# Bound the wait and fail loudly on HTTP errors; otherwise an error page
# would be handed to the CSV parser and fail (or half-succeed) much later.
response = requests.get(dataset, timeout=30)
response.raise_for_status()

# Decode the payload and parse it from an in-memory text buffer.
csv_raw = response.content
csv_io = io.StringIO(csv_raw.decode('utf8'))
df = pd.read_csv(csv_io)
df.head()

In [None]:
# Keep only the numeric columns and discard PassengerId (presumably a row
# identifier rather than a feature -- verify against the dataset).
# Built as one chained expression that returns a new frame instead of dropping
# in place on a derived frame; errors='ignore' keeps the cell re-runnable once
# the column has already been removed.
df = (
    df.select_dtypes(include=np.number)
      .drop(columns=['PassengerId'], errors='ignore')
)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the Age column.
df.loc[:, 'Age'].describe()

In [None]:
# Replace missing ages with the column mean, then re-check the null counts.
# NOTE(review): the mean is taken over the whole frame, before the train/test
# split performed further down, so test rows influence the imputed value.
df['Age'] = df['Age'].fillna(value=df['Age'].mean())
df.isna().sum()

In [None]:
# Preview the first rows of the current frame.
df.head(n=5)

In [None]:
# Write the current frame to disk; the MLflow runs below log this file as an
# artifact. The row index is written too (to_csv default).
df.to_csv(path_or_buf='titanic_dataset.csv')

In [None]:
# Split the frame into label and features by column position.
# NOTE(review): assumes the label is the first column of df and everything
# after it is a feature -- confirm against the column order shown above.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Select the MLflow experiment that the runs below are recorded under.
mlflow.set_experiment(experiment_name="MLflow_Testing")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline run: fit a decision tree with default hyperparameters and record
# the data, hyperparameters, metrics and fitted model in MLflow.
# Binding the run with `as run` avoids a separate mlflow.active_run() lookup.
with mlflow.start_run(run_name='baseline-model') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # Tag identifying this run
    mlflow.set_tag("tag","decision-tree: baseline model")

    # Store the full dataset the split was drawn from
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")

    # Also store the exact train and test partitions (features + label side by side);
    # the later run cells log these same files, so this cell must run first.
    pd.concat([X_train, y_train], axis=1).to_csv('train_dataset.csv')
    pd.concat([X_test, y_test], axis=1).to_csv('test_dataset.csv')

    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # Modelling
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)

    # Model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # Log the model hyperparameters to MLflow
    mlflow.log_params(tree.get_params())

    # Log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # Log the fitted model to MLflow
    mlflow.sklearn.log_model(tree, artifact_path="sklearn-model")

    # No explicit mlflow.end_run(): leaving the `with` block ends the run.

In [None]:
# Random-forest run with default hyperparameters, recorded in MLflow.
# Binding the run with `as run` avoids a separate mlflow.active_run() lookup.
with mlflow.start_run(run_name='random-forest') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # Tag identifying this run
    mlflow.set_tag("tag",'random-forest')

    # Store the data used to train and evaluate the model.
    # The train/test CSVs are written by the baseline cell above.
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # Modelling
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # Log the model hyperparameters to MLflow
    mlflow.log_params(rf.get_params())

    # Log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # Log the fitted model to MLflow
    mlflow.sklearn.log_model(rf, artifact_path="sklearn-model")

    # No explicit mlflow.end_run(): leaving the `with` block ends the run.

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over random-forest hyperparameters; the best candidate is
# evaluated on the test set and recorded in MLflow.
# Binding the run with `as run` avoids a separate mlflow.active_run() lookup.
with mlflow.start_run(run_name='rf-grid-search') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # Tag identifying this run
    mlflow.set_tag("tag",'rf-grid-search')

    # Store the data used to train and evaluate the model.
    # The train/test CSVs are written by the baseline cell above.
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # Modelling
    grid = {"n_estimators":[100, 200, 250],
            "criterion":["entropy"],
            "class_weight":["balanced", "balanced_subsample"],
            "bootstrap":[True, False],
            # "auto" is intentionally absent: for RandomForestClassifier it was
            # an alias of "sqrt" and newer scikit-learn releases reject it, in
            # which case error_score=0 would silently score those candidates 0.
            "max_features":["sqrt", "log2"]
            }
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                               cv=10, scoring='f1', error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    # GridSearchCV (refit=True by default) has already refit the best
    # candidate on the whole training set, so reuse it instead of rebuilding
    # and refitting an identical forest by hand.
    best_model = grid_result.best_estimator_
    y_pred = best_model.predict(X_test)

    # Model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # Log the model hyperparameters to MLflow
    mlflow.log_params(best_model.get_params())

    # Log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # Log the fitted model to MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="sklearn-model")

    # No explicit mlflow.end_run(): leaving the `with` block ends the run.

In [None]:
# Second hyperparameter search for the random forest (tree count, depth and
# split size); the best candidate is evaluated on the test set and recorded
# in MLflow.
# NOTE(review): the run name and tag say "random-search", but the search
# below is an exhaustive GridSearchCV -- confirm which one was intended.
# Relies on GridSearchCV having been imported by the earlier grid-search cell.
with mlflow.start_run(run_name='rf-random-search-2') as run:

    print("Active run_id: {}".format(run.info.run_id))

    # Tag identifying this run
    mlflow.set_tag("tag",'rf-random-search-2')

    # Store the data used to train and evaluate the model.
    # The train/test CSVs are written by the baseline cell above.
    mlflow.log_artifact('titanic_dataset.csv', artifact_path="features")
    mlflow.log_artifact('train_dataset.csv', artifact_path="features")
    mlflow.log_artifact('test_dataset.csv', artifact_path="features")

    # Modelling
    grid = {"n_estimators":[20, 60, 80],
            "max_depth": [5, 10, 15],
            "min_samples_split": [2, 5, 7, 10]
           }
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                               cv=10, scoring='f1', error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    # GridSearchCV (refit=True by default) has already refit the best
    # candidate on the whole training set, so reuse it instead of rebuilding
    # and refitting an identical forest by hand.
    best_model = grid_result.best_estimator_
    y_pred = best_model.predict(X_test)

    # Model metrics on the held-out test set
    metrics = {
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1-score": f1_score(y_test, y_pred),
    }

    # Log the model hyperparameters to MLflow
    mlflow.log_params(best_model.get_params())

    # Log the model metrics to MLflow
    mlflow.log_metrics(metrics)

    # Log the fitted model to MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="sklearn-model")

    # No explicit mlflow.end_run(): leaving the `with` block ends the run.