In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def df_from_20ng(subset):
    """Load one subset of the 20 Newsgroups corpus into a DataFrame.

    Parameters
    ----------
    subset : str
        Name of the subset to load (e.g. ``'train'`` or ``'test'``); it is
        forwarded unchanged to ``fetch_20newsgroups``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``text`` (raw document),
        ``target`` (label value reported by the loader) and ``target_name``
        (entry of the loader's ``target_names`` indexed by ``target``).
    """
    # Honour the caller's choice of subset; hard-coding 'train' here would make
    # every "test" frame a copy of the training data.
    newsgroups = fetch_20newsgroups(subset=subset)
    ngdata = {"text": newsgroups.data, "target": newsgroups.target}
    df = pd.DataFrame.from_dict(ngdata)
    # Translate each label value into its human-readable newsgroup name.
    df['target_name'] = df.target.apply(lambda x: newsgroups.target_names[x])

    return df


# Request the 'train' and 'test' subsets as DataFrames.
df_train, df_test = df_from_20ng('train'), df_from_20ng('test')

# Features are whatever remains once both label columns are dropped;
# the prediction target is the readable newsgroup name.
X_train, y_train = (
    df_train.drop(['target', 'target_name'], axis='columns'),
    df_train['target_name'],
)
X_test, y_test = (
    df_test.drop(['target', 'target_name'], axis='columns'),
    df_test['target_name'],
)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Vectorise the 'text' column into at most 5000 TF-IDF features.
ct = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000), 'text')
])

# The TF-IDF features feed a small random forest. random_state is pinned so
# that refitting on the same data is repeatable across notebook re-runs.
pipe = Pipeline([
    ('ctransformer', ct),
    ('clf', RandomForestClassifier(n_estimators=10, max_depth=20, random_state=42))
])

In [1]:
import mlflow
import mlflow.sklearn
import json
import os
import tempfile

from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from mlflow.models.signature import infer_signature

mlflow.set_experiment("My NLP Model")


# Fit, score and log everything inside a single MLflow run.
with mlflow.start_run(run_name="TFIDF + Random Forest"):

    # Fit the pipeline, then predict labels for X_test.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mlflow.set_tag('client', 'That Email Company')

    # The model signature is inferred from the X_test frame and the y_test series.
    signature = infer_signature(X_test, y_test)

    micro_f1 = f1_score(y_test, y_pred, average='micro')
    mlflow.log_metric('f1', micro_f1)
    mlflow.sklearn.log_model(pipe, "model", signature=signature)

    # Write the per-class report into a throwaway directory, then log that
    # directory's contents under the "reporting" artifact path.
    with tempfile.TemporaryDirectory() as staging_dir:
        report = classification_report(y_test, y_pred, output_dict=True)
        report_path = os.path.join(staging_dir, "classification_report.json")

        with open(report_path, 'w') as report_file:
            json.dump(report, report_file, indent=2)

        mlflow.log_artifacts(staging_dir, "reporting")

NameError: name 'pipe' is not defined

In [8]:
!curl --request POST --url http://127.0.0.1:5000/invocations --header 'Content-Type: application/json; format=pandas-records' --data '[{"text":"hey, I have an old bicycle for sale in the Southampton area"}]'

This predictor only supports the following content types and formats: Types: ['text/csv', 'application/json', 'application/json-numpy-split']; Formats: ['pandas-records', 'pandas-split']. Got 'application/x-www-form-urlencoded'.


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   298  100   228  100    70   200k  63006 --:--:-- --:--:-- --:--:--  291k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0curl: (6) Could not resolve host: application
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0 

In [None]:
curl -X POST http://127.0.0.1:1234/invocations -H "Content-Type:application/json; format=pandas-split" --data '{"columns":["alcohol", "chlorides", "citric acid", "density", "fixed acidity", "free sulfur dioxide", "pH", "residual sugar", "sulphates", "total sulfur dioxide", "volatile acidity"],"data":[[12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66]]}' 