In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load data

In [6]:
train = pd.read_csv('data/train.csv', delimiter=',')
test = pd.read_csv('data/test.csv',  delimiter=',')
valid = pd.read_csv('data/valid.csv',  delimiter=',')

In [7]:
train.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
test.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)
valid.drop(['Unnamed: 0', "film-url"], axis=1, inplace=True)

In [8]:
train.head()

Unnamed: 0,review,polarity
0,Si vous cherchez du cinéma abrutissant à tous ...,0
1,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,"Et si, dans les 5 premières minutes du film, l...",0
3,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,"Premier film de la saga Kozure Okami, ""Le Sabr...",1


In [9]:
import mlflow
from mlflow import sklearn

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('Sentiment Analysis')
mlflow.sklearn.autolog(log_datasets=False)
from mlflow.models import infer_signature

with mlflow.start_run():
    from sklearn.linear_model import LogisticRegression
    from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import accuracy_score, f1_score
    stop = list(fr_stop)
    vectorizer = TfidfVectorizer(stop_words=stop)
    step = [('vectorizer', TfidfVectorizer(stop_words=stop)), ('lr', LogisticRegression())]
    pipe = Pipeline(step)
    X_train = train['review']
    y_train = train['polarity']
    X_valid = valid['review']
    y_valid = valid['polarity']

    tags ={"model": "logistic regression", 
           "version": "0.1.0","mlflow.source.git": "commit", 
           "mlflow.source.name": "model_design_2.ipynb",
           "mlflow.source.git.commit": "a0d889471c217fb26529ef0df970a5cefee4434d",
           "mlflow.source.dataname": "data/train.csv"}

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_valid)

    signature = infer_signature(X_valid, pred)

    accuracy = accuracy_score(y_valid, pred)
    f1 = f1_score(y_valid, pred)

    mlflow.log_metric('validation accuracy', accuracy)
    mlflow.log_metric('validation f1  ', f1)
    mlflow.set_tags(tags)


    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="logistic-regression",
    )




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Successfully registered model 'logistic-regression'.
2023/11/14 16:17:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic-regression, version 1
Created version '1' of model 'logistic-regression'.
