In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load data

In [19]:
train = pd.read_csv('data/train.csv', delimiter=',')
test = pd.read_csv('data/test.csv',  delimiter=',')
valid = pd.read_csv('data/valid.csv',  delimiter=',')

In [20]:
train.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
test.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
valid.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)

In [28]:
import mlflow
from mlflow import sklearn

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('Experiment Sentiment Analysis')
mlflow.sklearn.autolog(log_datasets=False)
from mlflow.models import infer_signature
from sklearn.metrics import accuracy_score, f1_score


def build_model(
    dataset,
    dataset_test,
    pipeline,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,):

    
    """
    Build a sentiment analysis model, print the evaluation result and store everything to MLFlow
    @param: dataset: pandas dataframe containing the input training set
    @param: pipeline: scikit-learn pipeline that will be applied to the input data
    @param: model_name: name of the model as it will be stored in MLFlow
    @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
    @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
    @param: mlflow_run_description: textual description of the run
    @param: mlflow_model_tags: dict of tags that will be stored in the MLFlow regietered model
    @param: mlflow_model_description: textual description of the model    
    @return: the ModelInfo of the model generated by MLFlow 

    """
    with mlflow.start_run():
        
        # Log parameters
        if mlflow_run_parameters is not None:
            for key, value in mlflow_run_parameters.items():
                mlflow.log_param(key, value)
        # Log tags
        if mlflow_run_tags is not None:
            for key, value in mlflow_run_tags.items():
                mlflow.set_tag(key, value)
        # Log description
        if mlflow_run_description is not None:
            mlflow.set_tag("description", mlflow_run_description)
        
   
      
        X_train = dataset['review']
  
        y_train = dataset['polarity']
        X_test = dataset_test['review']
        y_test = dataset_test['polarity']


        pipeline.fit(X_train, y_train)
        pred = pipeline.predict(X_test)

        # signature = infer_signature(X_test, pred)

        accuracy = accuracy_score(y_test, pred)
        f1 = f1_score(y_test, pred)

        mlflow.log_metric('Test accuracy', accuracy)
        mlflow.log_metric('Test f1  ', f1)

    return pipeline

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
stop = list(fr_stop)
setp1 = [('vectorizer', TfidfVectorizer(stop_words=stop)), ('lr', LogisticRegression(penalty='l2', C=1.0))]
step2 = [('vectorizer', TfidfVectorizer(stop_words=stop)), ('nb', MultinomialNB())]
pipelines = [Pipeline(setp1), Pipeline(step2)]
descriptions = ["Logistic Regression with penalty", "Naive Bayes"]
dataset = train
dataset_test = test

for pipeline, description in zip(pipelines, descriptions):

    build_model(dataset, dataset_test, pipeline, \
            mlflow_run_tags = None,
            mlflow_run_parameters = None,\
        mlflow_run_description  = description)
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
