In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
stop = list(fr_stop)

## Load data

In [16]:
train = pd.read_csv('data/train.csv', delimiter=',')
test = pd.read_csv('data/test.csv',  delimiter=',')
valid = pd.read_csv('data/valid.csv',  delimiter=',')

In [17]:
train.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
test.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
valid.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)

In [18]:
import mlflow
from mlflow import sklearn

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('Experiment Sentiment Analysis')
mlflow.sklearn.autolog(log_datasets=False)
from mlflow.models import infer_signature
from sklearn.metrics import accuracy_score, f1_score, log_loss


def build_model(
    dataset,
    dataset_test,
    pipeline,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,):

    
    """
    Build a sentiment analysis model, print the evaluation result and store everything to MLFlow
    @param: dataset: pandas dataframe containing the input training set
    @param: pipeline: scikit-learn pipeline that will be applied to the input data
    @param: model_name: name of the model as it will be stored in MLFlow
    @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
    @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
    @param: mlflow_run_description: textual description of the run
    @param: mlflow_model_tags: dict of tags that will be stored in the MLFlow regietered model
    @param: mlflow_model_description: textual description of the model    
    @return: the ModelInfo of the model generated by MLFlow 

    """
    with mlflow.start_run():
        
        # Log parameters
        if mlflow_run_parameters is not None:
            for key, value in mlflow_run_parameters.items():
                mlflow.log_param(key, value)
        # Log tags
        if mlflow_run_tags is not None:
            for key, value in mlflow_run_tags.items():
                mlflow.set_tag(key, value)
        # Log description
        if mlflow_run_description is not None:
            mlflow.set_tag("description", mlflow_run_description)
        
   
      
        X_train = dataset['review']
  
        y_train = dataset['polarity']
        X_test = dataset_test['review']
        y_test = dataset_test['polarity']


        pipeline.fit(X_train, y_train)
        pred = pipeline.predict(X_test)

        # signature = infer_signature(X_test, pred)

        accuracy = accuracy_score(y_test, pred)
        f1 = f1_score(y_test, pred)

        mlflow.log_metric('Test accuracy', accuracy)
        mlflow.log_metric('Test f1  ', f1)

    return pipeline

In [29]:

setp1 = [('vectorizer', TfidfVectorizer(stop_words=stop)), ('lr', LogisticRegression(penalty='l2', C=1.0))]
step2 = [('vectorizer', TfidfVectorizer(stop_words=stop)), ('nb', MultinomialNB())]
pipelines = [Pipeline(setp1), Pipeline(step2)]
descriptions = ["Logistic Regression with penalty", "Naive Bayes"]
dataset = train
dataset_test = test

for pipeline, description in zip(pipelines, descriptions):

    build_model(dataset, dataset_test, pipeline, \
            mlflow_run_tags = None,
            mlflow_run_parameters = None,\
        mlflow_run_description  = description)
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:

X_train = train['review']
y_train = train['polarity']
X_test = test['review']
y_test = test['polarity']

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
def objective(params):
    """
    Objective function for hyperopt. Returns the negative accuracy of the model.
    @param: params: dict of parameters for the model
    @return: the loss of the model on the objective function
    """
    model = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words=stop)),
        ('lr', LogisticRegression(**params))
    ])
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    loss = log_loss(y_test, pred)
    return {'loss': loss, 'status': STATUS_OK} 

space = {}
space['solver'] = 'liblinear'
space['penalty'] = hp.choice('penalty', ['l1', 'l2'])
space['C'] = hp.uniform('C', 0.1, 10)

trials = Trials()
best_params = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trials)


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

2023/11/19 17:01:35 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3448a03290e344598b9b1801e2a49701', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




 20%|██        | 1/5 [00:52<03:31, 52.89s/trial, best loss: 2.9699970392632538]

2023/11/19 17:02:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '11f6076da1534871b8559887c847096f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




 40%|████      | 2/5 [01:48<02:42, 54.23s/trial, best loss: 2.9699970392632538]

2023/11/19 17:03:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4baa3cc6a03c4f52b2bd14e1b5310915', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




 60%|██████    | 3/5 [02:44<01:50, 55.10s/trial, best loss: 2.9699970392632538]

2023/11/19 17:04:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3266ce0725334aeb93f59692edc55101', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




 80%|████████  | 4/5 [03:51<00:59, 59.78s/trial, best loss: 2.780767858970388] 

2023/11/19 17:05:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '11e20da382cd4f34aaa269cd5a2058ad', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




100%|██████████| 5/5 [04:42<00:00, 56.58s/trial, best loss: 2.780767858970388]


In [26]:
#instantiate the model with the best parameters
from hyperopt import space_eval
params = space_eval(space, best_params)

In [27]:
best_model = Pipeline([('vectorizer', TfidfVectorizer(stop_words=stop)),\
                        ('lr', LogisticRegression(**params))])


In [28]:
description = "Logistic Regression with best parameters"
dataset = train
dataset_test = test

build_model(dataset, dataset_test, best_model, \
            mlflow_run_tags = None,
            mlflow_run_parameters = None,\
        mlflow_run_description  = description)

