## 1. Basic Flow

In [7]:
from sklearn.datasets import load_wine 
import pandas as pd

# Load the wine dataset once and split it into features and labels.
wine_data = load_wine()

# Features become a DataFrame with default integer column labels;
# the labels stay as the array provided by the dataset object.
X = pd.DataFrame(wine_data.data)
y = wine_data.target

In [2]:
X.memory_usage()

Index     128
0        1424
1        1424
2        1424
3        1424
4        1424
5        1424
6        1424
7        1424
8        1424
9        1424
10       1424
11       1424
12       1424
dtype: int64

In [4]:
X[0].astype('float16').memory_usage()

484

In [5]:
X[0].memory_usage()

1552

In [6]:
pd.to_numeric(X[0],downcast="float").memory_usage()

840

In [None]:
14.49857941324234235124

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. No random_state is passed, so the
# rows that land in each split (and the scores further down) can differ from
# one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [9]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
139,12.84,2.96,2.61,24.0,101.0,2.32,0.60,0.53,0.81,4.92,0.89,2.15,590.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
161,13.69,3.26,2.54,20.0,107.0,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680.0
34,13.51,1.80,2.65,19.0,110.0,2.35,2.53,0.29,1.54,4.20,1.10,2.87,1095.0
160,12.36,3.83,2.38,21.0,88.0,2.30,0.92,0.50,1.04,7.65,0.56,1.58,520.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,13.41,3.84,2.12,18.8,90.0,2.45,2.68,0.27,1.48,4.28,0.91,3.00,1035.0
76,13.03,0.90,1.71,16.0,86.0,1.95,2.03,0.24,1.46,4.60,1.19,2.48,392.0
142,13.52,3.17,2.72,23.5,97.0,1.55,0.52,0.50,0.55,4.35,0.89,2.06,520.0
89,12.08,1.33,2.30,23.6,70.0,2.20,1.59,0.42,1.38,1.74,1.07,3.21,625.0


In [10]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0
110,11.46,3.74,1.82,19.5,107.0,3.18,2.58,0.24,3.58,2.9,0.75,2.81,562.0
60,12.33,1.1,2.28,16.0,101.0,2.05,1.09,0.63,0.41,3.27,1.25,1.67,680.0
121,11.56,2.05,3.23,28.5,119.0,3.18,5.08,0.47,1.87,6.0,0.93,3.69,465.0
119,12.0,3.43,2.0,19.0,87.0,2.0,1.64,0.37,1.87,1.28,0.93,3.05,564.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0
149,13.08,3.9,2.36,21.5,113.0,1.41,1.39,0.34,1.14,9.4,0.57,1.33,550.0
100,12.08,2.08,1.7,17.5,97.0,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710.0
157,12.45,3.03,2.64,27.0,97.0,1.9,0.58,0.63,1.14,7.5,0.67,1.73,880.0
98,12.37,1.07,2.1,18.5,88.0,3.52,3.75,0.24,1.95,4.5,1.04,2.77,660.0


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only; the same fitted scaler is reused
# on the test split later. Despite the "_centered" name, StandardScaler with
# default settings both centers and scales each feature.
scaler = StandardScaler()
X_train_centered = scaler.fit_transform(X_train)

In [None]:
from sklearn.decomposition import PCA

# Reduce the standardized training data to a single principal component.
pca = PCA(n_components = 1)
Xpc = pca.fit_transform(X_train_centered)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Mean score over 10 cross-validation folds on the 1-D PCA projection.
# NOTE(review): the scaler and PCA were fitted on all of X_train before the
# folds are formed, so every validation fold has already influenced the
# preprocessing. The Pipeline-based flow in section 2 avoids this.
model = LogisticRegression()
cross_val_score(model, Xpc, y_train, cv=10).mean()

In [None]:
model.fit(Xpc,y_train)

In [None]:
# Push the test split through the already-fitted scaler and PCA
# (transform only, nothing is refitted), then predict with the fitted model.
X_test_centered = scaler.transform(X_test)

X_test_pc = pca.transform(X_test_centered)

pred = model.predict(X_test_pc)

In [None]:
from sklearn.metrics import accuracy_score

print("This model got", accuracy_score(y_test, pred)*100, "% of predictions right.")

### First New Thing: saving your model to Disk

In [None]:
import joblib

joblib.dump(model,'model.joblib')

In [None]:
good_model = joblib.load('model.joblib')

In [None]:
new_pred = good_model.predict(X_test_pc)

print("This model got", accuracy_score(y_test, new_pred)*100, "% of predictions right.")

## 2. Repeatable flow with Pipelines

In [None]:
from sklearn.datasets import load_wine 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

wine_data = load_wine()

X = wine_data.data
y = wine_data.target

# Re-create the split here so this section does not silently depend on
# X_train/X_test left over from section 1. train_test_split is imported in
# this cell but was otherwise never called.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [None]:
# Chain scaling, PCA and the classifier into a single estimator.
# memory="cache" asks scikit-learn to cache fitted transformers in a
# directory named "cache".
pipe = Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA()),('model',LogisticRegression())],memory="cache")

In [None]:
pipe.set_params(pca__n_components=1)

In [None]:
cross_val_score(pipe,X_train,y_train,cv=10).mean()

In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_test,y_test)

In [None]:
joblib.dump(pipe,'pipeline.joblib')

In [None]:
good_pipeline = joblib.load('pipeline.joblib')

In [None]:
good_pipeline

## 3. Repeatable flow with Pipelines Generalized

In [None]:
from sklearn.datasets import load_wine 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

wine_data = load_wine()

X = wine_data.data
y = wine_data.target

# Re-create the split here so this section does not silently depend on
# X_train/X_test left over from an earlier section. train_test_split is
# imported in this cell but was otherwise never called.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [None]:
def train(model,n_components):
    """Build a scaler -> PCA -> model pipeline and print its mean CV score.

    The pipeline is scored with 10-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``. The returned pipeline is the one passed to
    cross-validation and has not been fitted by this function.

    Args:
        model: estimator used as the final ``'model'`` step.
        n_components: value assigned to the PCA step's ``n_components``.

    Returns:
        The configured ``Pipeline``.
    """
    stages = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('model', model),
    ]
    candidate = Pipeline(steps=stages, memory="local_path")
    candidate.set_params(pca__n_components=n_components)

    fold_scores = cross_val_score(candidate, X_train, y_train, cv=10)
    mean_cv = fold_scores.mean()
    print(f'model got {mean_cv} Cross-Validation score')

    return candidate

In [None]:
for model in [LogisticRegression(),DecisionTreeClassifier(),SVC()]:
    train(model,n_components=1)

In [None]:
pipe = train(LogisticRegression(),1)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
pipe.score(X_test,y_test)

### Another Way of iterating through models

In [None]:
# The 'model' step is only a placeholder: the grid search below replaces it
# with each estimator listed under the 'model' key of param_grid. Use an
# explicit estimator instead of whatever `model` happened to be left bound to
# by the earlier for-loop.
other_pipe = Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA()),('model',LogisticRegression())])

In [None]:
# Search jointly over the number of PCA components (5 values) and the
# estimator used for the pipeline's 'model' step (3 estimators).
param_grid = { 'pca__n_components': [1, 2, 3, 4, 5],
    'model': [LogisticRegression(),DecisionTreeClassifier(),SVC()]}

# n_jobs=-1 lets scikit-learn run the candidate fits on all available cores.
search = GridSearchCV(other_pipe, param_grid, n_jobs=-1)
search.fit(X_train, y_train)
print(f"Best parameters (CV score={search.best_score_}):")
print(search.best_params_)

In [None]:
# Rebuild a pipeline by hand with fixed hyperparameters.
# NOTE(review): n_components=5 and LogisticRegression are hard-coded here,
# presumably copied from one run's search.best_params_. Since the split is
# random the winner can change; confirm, or reuse search.best_estimator_.
best_pipe =  Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA(n_components=5)),('model',LogisticRegression())])

In [None]:
best_pipe.fit(X_train,y_train)

In [None]:
best_pipe.score(X_test,y_test)

In [None]:
joblib.dump(best_pipe,'best_pipe.joblib')

## 4. Repeatable flow with Pipelines even more generalized

In [None]:
from sklearn.datasets import load_wine 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

class Trainer(object):
    """Configurable train/evaluate helper built around a scikit-learn Pipeline.

    The data is split once, at construction time. All remaining configuration
    arrives as keyword arguments and is looked up by name:

    * ``test_size``    -- forwarded to ``train_test_split``.
    * ``scale_data``   -- truthy to add a ``StandardScaler`` step. ``scaler``
      is accepted as an alias because callers in this notebook pass
      ``scaler=True``.
    * ``pca``          -- truthy to add a ``PCA`` step.
    * ``pca_params``   -- dict handed to ``Pipeline.set_params``,
      e.g. ``{'pca__n_components': 2}``.
    * ``estimator``    -- ``"LogisticRegression"`` or ``"SVC"``.
    * ``model_params`` -- optional dict of keyword arguments for the estimator.
    """

    def __init__(self, X, y, **kwargs):
        """Store the configuration and split ``X``/``y`` into train/validation."""
        self.pipeline = None
        self.kwargs = kwargs
        # Despite the names, these hold scores returned by scikit-learn
        # (mean cross-validation score and hold-out score), not error rates.
        self.training_error = None
        self.test_error = None

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X, y, test_size=self.kwargs.get("test_size")
        )

        self.alive()

    def alive(self):
        """Print a message showing that construction completed."""
        print("I'm alive!")

    def get_estimator(self):
        """Instantiate the estimator named by the ``estimator`` keyword.

        Returns:
            A new ``LogisticRegression`` or ``SVC`` built with ``model_params``.

        Raises:
            ValueError: if ``estimator`` is missing or not a supported name.
        """
        estimator = self.kwargs.get("estimator")
        # A missing model_params means "use the estimator's defaults".
        model_params = self.kwargs.get("model_params") or {}

        if estimator == "LogisticRegression":
            return LogisticRegression(**model_params)
        if estimator == "SVC":
            return SVC(**model_params)
        raise ValueError(
            f"Unsupported estimator {estimator!r}; "
            "expected 'LogisticRegression' or 'SVC'"
        )

    def set_pipeline(self):
        """Assemble ``self.pipeline`` from the configured steps."""
        steps = []

        # Accept both spellings of the scaling flag (see class docstring).
        if self.kwargs.get("scale_data") or self.kwargs.get("scaler"):
            # list.append takes a single argument: the (name, step) tuple.
            steps.append(('scaler', StandardScaler()))
        if self.kwargs.get("pca"):
            steps.append(('pca', PCA()))

        steps.append(('model', self.get_estimator()))

        self.pipeline = Pipeline(steps)

        pca_params = self.kwargs.get("pca_params")
        if pca_params:
            self.pipeline.set_params(**pca_params)

    def train(self):
        """Build and fit the pipeline, then record its mean 10-fold CV score."""
        self.set_pipeline()
        self.pipeline.fit(self.X_train, self.y_train)
        mean_cv = cross_val_score(
            self.pipeline, self.X_train, self.y_train, cv=10
        ).mean()
        print(f'model got {mean_cv} Cross-Validation score')
        self.training_error = mean_cv

    def evaluate(self):
        """Score the fitted pipeline on the validation split.

        Raises:
            RuntimeError: if ``train()`` has not been called yet.
        """
        if self.pipeline is None:
            raise RuntimeError("Call train() before evaluate().")
        accuracy = self.pipeline.score(self.X_val, self.y_val)
        print(f'model got {accuracy} Test Set score')
        self.test_error = accuracy

    def save_model(self, filename):
        """Serialize the pipeline to ``filename`` with joblib.

        Raises:
            RuntimeError: if there is no pipeline to save yet.
        """
        if self.pipeline is None:
            raise RuntimeError("Call train() before save_model().")
        # Imported here because the cell defining this class does not import it.
        import joblib

        joblib.dump(self.pipeline, filename)
        
    

    

In [None]:
params = dict(test_size=0.2,
              pca=True,
              scaler=True,
              estimator="SVC",
              pca_params={'pca__n_components':13},
              model_params={'kernel':'linear'})

my_trainer = Trainer(X,y,**params)

In [None]:
my_new_trainer = Trainer(X,y,test_size=0.2,pca=True,scaler=True,estimator="SVC",pca_params={'pca__n_components':13},model_params={'kernel':'linear'})

In [None]:
new_new_trainer = Trainer(X,y,**params)

In [None]:
my_new_trainer.pipeline

In [None]:
my_trainer.train()

In [None]:
my_trainer.evaluate()

In [None]:
my_trainer.training_error

In [None]:
my_trainer.save_model('SVM_pipeline.joblib')

## 5. Sharing experiments with others: MLflow

In [None]:
from  mlflow.tracking import MlflowClient

EXPERIMENT_NAME = "cool_experiment_woop_woop"
client = MlflowClient()

# create_experiment fails when the name is already taken (for example when
# this cell is run a second time), so reuse the experiment if it exists.
existing_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = client.create_experiment(EXPERIMENT_NAME)

# Log one demo run per model name; the metric value is a fixed placeholder.
for model in ["linear", "Randomforest"]:
    run = client.create_run(experiment_id)
    run_id = run.info.run_id
    client.log_metric(run_id, "rmse", 4.5)
    client.log_param(run_id, "model", model)
    # Runs made with create_run stay open until they are explicitly ended.
    client.set_terminated(run_id)

In [None]:
!mlflow ui

In [None]:
EXPERIMENT_NAME = "nice_experiment"
client = MlflowClient()

# create_experiment fails when the name is already taken (for example when
# this cell is run a second time), so reuse the experiment if it exists.
existing_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = client.create_experiment(EXPERIMENT_NAME)


In [None]:
all_params = {"LogisticRegression":{"solver":"liblinear"}, "SVC":{"kernel":"linear"}}

# Train and evaluate one Trainer per estimator and record each as its own
# MLflow run under `experiment_id`.
for model in ["LogisticRegression", "SVC"]:

    run = client.create_run(experiment_id)
    run_id = run.info.run_id

    params = dict(test_size=0.2,
              pca=True,
              scaler=True,
              estimator=model,
              pca_params={'pca__n_components':13},
              model_params=all_params.get(model))

    my_trainer = Trainer(X,y,**params)

    my_trainer.train()
    my_trainer.evaluate()

    client.log_metric(run_id, "training_error", my_trainer.training_error)
    client.log_metric(run_id, "test_error", my_trainer.test_error)
    client.log_param(run_id, "model", params.get("estimator"))
    client.log_param(run_id, "pca_components", params.get("pca_params"))
    client.log_param(run_id, "model_params", params.get("model_params"))

    # Runs made with create_run stay open until they are explicitly ended.
    client.set_terminated(run_id)
    
    

In [None]:
!mlflow ui

## Memoized property demo

Memoized properties are a trick you can use to "freeze" variables, values returned by methods, or attributes of an object that would otherwise change every time you instantiate the object or run a method.

See the example below. If you run the example with @memoized_property uncommented, you will notice that the value returned by <code>get_random_value</code> will no longer change if you call it again with the same "Car".

In [None]:
from memoized_property import memoized_property
from numpy import random

class Car:
    """Toy class used to demonstrate memoized properties."""

    @memoized_property
    def get_random_value(self):
        """Return a random integer from ``random.randint(0, 10)``."""
        value = random.randint(0, 10)
        return value

# Read the property twice on the same instance.
# NOTE(review): with @memoized_property applied as shown above, both reads on
# one car should print the same value; the "non memoized calls differ" label
# only matches the variant where the decorator is commented out.
car = Car()
print('non memoized calls differ:')
print(car.get_random_value)
print(car.get_random_value)

# A second instance gets its own value, again read twice.
car2 = Car()
print('non memoized calls differ:')
print(car2.get_random_value)
print(car2.get_random_value)

## Final Trainer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score


import mlflow
from  mlflow.tracking import MlflowClient
from memoized_property import memoized_property

class Trainer(object):
    """Train/evaluate helper that also logs its results to MLflow.

    The data is split once, at construction time. Configuration arrives as
    keyword arguments and is looked up by name:

    * ``test_size``    -- forwarded to ``train_test_split``.
    * ``scale_data``   -- truthy to add a ``StandardScaler`` step. ``scaler``
      is accepted as an alias because callers in this notebook pass
      ``scaler=True``.
    * ``pca``          -- truthy to add a ``PCA`` step.
    * ``pca_params``   -- dict handed to ``Pipeline.set_params``,
      e.g. ``{'pca__n_components': 2}``.
    * ``estimator``    -- ``"LogisticRegression"`` or ``"SVC"``.
    * ``model_params`` -- optional dict of keyword arguments for the estimator.

    ``train()`` opens an MLflow run and logs the CV score and estimator name;
    ``evaluate()`` logs the hold-out score to that same run, so it must be
    called after ``train()``.
    """

    # Uncomment this (together with the set_tracking_uri call in
    # mlflow_client) to log to a specific tracking server.
    #MLFLOW_URI = "https://mlflow.lewagon.co/"

    def __init__(self, X, y,experiment_name, **kwargs):
        """Store the configuration and split ``X``/``y`` into train/validation."""
        self.pipeline = None
        self.kwargs = kwargs
        # Despite the names, these hold scores returned by scikit-learn
        # (mean cross-validation score and hold-out score), not error rates.
        self.training_error = None
        self.test_error = None
        self.experiment_name = experiment_name

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X, y, test_size=self.kwargs.get("test_size")
        )

    @memoized_property
    def mlflow_client(self):
        """MLflow client, created on first access and then reused."""
        #mlflow.set_tracking_uri(self.MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        """Id of the experiment named ``experiment_name``, created if needed."""
        # Local import: only this method needs the exception type.
        from mlflow.exceptions import MlflowException

        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except MlflowException:
            # Creation is rejected when the name already exists; fall back to
            # looking the experiment up. Only MLflow errors are handled here,
            # so KeyboardInterrupt and unrelated bugs still propagate.
            return self.mlflow_client \
                .get_experiment_by_name(self.experiment_name).experiment_id

    def mlflow_create_run(self):
        """Start a new run in the experiment and keep it on ``mlflow_run``."""
        self.mlflow_run = self.mlflow_client \
            .create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        """Log a parameter to the current run (requires mlflow_create_run)."""
        self.mlflow_client \
            .log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log a metric to the current run (requires mlflow_create_run)."""
        self.mlflow_client \
            .log_metric(self.mlflow_run.info.run_id, key, value)

    def get_estimator(self):
        """Instantiate the estimator named by the ``estimator`` keyword.

        Returns:
            A new ``LogisticRegression`` or ``SVC`` built with ``model_params``.

        Raises:
            ValueError: if ``estimator`` is missing or not a supported name.
        """
        estimator = self.kwargs.get("estimator")
        # A missing model_params means "use the estimator's defaults".
        model_params = self.kwargs.get("model_params") or {}

        if estimator == "LogisticRegression":
            return LogisticRegression(**model_params)
        if estimator == "SVC":
            return SVC(**model_params)
        raise ValueError(
            f"Unsupported estimator {estimator!r}; "
            "expected 'LogisticRegression' or 'SVC'"
        )

    def set_pipeline(self):
        """Assemble ``self.pipeline`` from the configured steps."""
        steps = []

        # Accept both spellings of the scaling flag (see class docstring).
        if self.kwargs.get("scale_data") or self.kwargs.get("scaler"):
            # list.append takes a single argument: the (name, step) tuple.
            steps.append(('scaler', StandardScaler()))
        if self.kwargs.get("pca"):
            steps.append(('pca', PCA()))

        steps.append(('model', self.get_estimator()))

        self.pipeline = Pipeline(steps)

        pca_params = self.kwargs.get("pca_params")
        if pca_params:
            self.pipeline.set_params(**pca_params)

    def train(self):
        """Build and fit the pipeline, then record and log its mean CV score."""
        self.set_pipeline()
        self.pipeline.fit(self.X_train, self.y_train)
        mean_cv = cross_val_score(
            self.pipeline, self.X_train, self.y_train, cv=10
        ).mean()
        print(f'model got {mean_cv} Cross-Validation score')
        self.training_error = mean_cv

        self.mlflow_create_run()
        self.mlflow_log_metric("training_error", mean_cv)
        self.mlflow_log_param("model", self.kwargs.get("estimator"))

    def evaluate(self):
        """Score the fitted pipeline on the validation split and log the result.

        Raises:
            RuntimeError: if ``train()`` has not been called yet.
        """
        if self.pipeline is None:
            raise RuntimeError("Call train() before evaluate().")
        accuracy = self.pipeline.score(self.X_val, self.y_val)
        print(f'model got {accuracy} Test Set score')
        self.test_error = accuracy

        self.mlflow_log_metric("test_error", accuracy)

    def save_model(self, filename):
        """Serialize the pipeline to ``filename`` with joblib.

        Raises:
            RuntimeError: if there is no pipeline to save yet.
        """
        if self.pipeline is None:
            raise RuntimeError("Call train() before save_model().")
        # Imported here because the cell defining this class does not import it.
        import joblib

        joblib.dump(self.pipeline, filename)

In [None]:
experiment_name = 'mother_of_all_experiments'

my_trainer = Trainer(X,y,experiment_name,**params)

In [None]:
my_trainer.train()

In [None]:
my_trainer.evaluate()