In [1]:
# Importing libraries
import pandas as pd
from datetime import datetime
import spacy
import numpy as np
# Storing docs in binary format
from spacy.tokens import DocBin

In [66]:
# Load the raw CSV, then collapse the labels into two classes:
# 'negative' is kept as is, every other value becomes 'positive_neutral'.
df = pd.read_csv("Data1.csv", encoding='latin-1')
df['Sentiment'] = np.where(df['Sentiment'] == 'negative', 'negative', 'positive_neutral')

In [67]:
# Show the binarised label column
df.loc[:, 'Sentiment']

0       positive_neutral
1       positive_neutral
2               negative
3       positive_neutral
4       positive_neutral
              ...       
4841            negative
4842    positive_neutral
4843            negative
4844            negative
4845            negative
Name: Sentiment, Length: 4846, dtype: object

In [68]:
# Split the dataset: a seeded 80% sample becomes the training set,
# and the rows that were not sampled become the test set.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 25
train = df.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
test = df.drop(index=train.index)

# Load the spaCy pipeline and list its components
nlp = spacy.load("en_core_web_md")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [69]:
# Turn each split into a list of (text, label) pairs.
# Note: this rebinds `train`/`test` from DataFrames to lists, so the cell
# cannot be re-run without first re-running the split above.
train = list(zip(train['Text'], train['Sentiment']))
test = list(zip(test['Text'], test['Sentiment']))

In [70]:
def document(data):
    """Run (text, label) pairs through the global `nlp` pipeline and label the docs.

    Each resulting doc gets two entries in `doc.cats`: 'negative' and
    'positive_neutral'. The entry matching the label is set to 1 and the
    other to 0; any label other than 'negative' counts as 'positive_neutral'.

    Parameters
    ----------
    data : iterable of (text, label) tuples

    Returns
    -------
    list
        The processed docs, in input order.
    """
    labelled_docs = []
    for doc, label in nlp.pipe(data, as_tuples=True):
        if label == 'negative':
            doc.cats.update({'negative': 1, 'positive_neutral': 0})
        else:
            doc.cats.update({'positive_neutral': 1, 'negative': 0})
        labelled_docs.append(doc)
    return labelled_docs

In [75]:
# Time the conversion of the training split into a binary spaCy document

start_time = datetime.now()

# Label the training pairs with `document`, pack the resulting docs into a
# DocBin and save it as train.spacy
train_docs = document(train)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")

end_time = datetime.now()

# Report how long the conversion took
print(f'Duration: {end_time - start_time}')

Duration: 0:00:09.989443


In [76]:
# Time the conversion of the test split into a binary spaCy document

start_time = datetime.now()

# Label the test pairs with `document`, pack the resulting docs into a
# DocBin and save it as valid.spacy
test_docs = document(test)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("valid.spacy")

end_time = datetime.now()

# Report how long the conversion took
print(f'Duration: {end_time - start_time}')

Duration: 0:00:02.511299


In [84]:
# Expand base_config.cfg into a complete training config saved as config.cfg.
# base_config.cfg must already exist in the working directory.
!python -m spacy init fill-config base_config.cfg config.cfg
 

Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE]
Try 'python -m spacy init fill-config --help' for help.

Error: Invalid value for 'BASE_PATH': File 'base-config.cfg' does not exist.


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
start_time = datetime.now()

!python -m spacy train config.cfg --verbose  --output ./output_updated --paths.train train.spacy --paths.dev valid.spacy

end_time = datetime.now()

print('Duration: {}'.format(end_time - start_time))

[i] Saving to output directory: output_updated
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'textcat_multilabel']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TEXTC...  CATS_MACRO_F  SCORE 
---  ------  ------------  -------------  ------------  ------
  0       0          0.00           0.28         45.57    0.46
  1     200          0.00          17.09         81.52    0.82
  3     400          0.00           8.80         84.95    0.85
  4     600          0.00           4.45         81.34    0.81
  6     800          0.00           2.42         82.30    0.82
  8    1000          0.00           1.97         80.04    0.80
  9    1200          0.00           1.17         80.01    0.80
 11    1400          0.00           0.78         82.21    0.82
 13    1600          0.00           0.82         81.96    0.82
 14    1800          0.00           0.54         78.44    0.78
 16    2000          0.00           0.63         80.18    0.80
[+] Sav

[2022-08-25 12:17:53,549] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2022-08-25 12:17:53,769] [INFO] Set up nlp object from config
[2022-08-25 12:17:53,779] [DEBUG] Loading corpus from path: valid.spacy
[2022-08-25 12:17:53,779] [DEBUG] Loading corpus from path: train.spacy
[2022-08-25 12:17:53,780] [DEBUG] Loading corpus from path: train.spacy
[2022-08-25 12:17:53,780] [INFO] Pipeline: ['tok2vec', 'textcat_multilabel']
[2022-08-25 12:17:53,785] [INFO] Created vocabulary
[2022-08-25 12:17:53,786] [INFO] Finished initializing nlp object
[2022-08-25 12:17:56,090] [INFO] Initialized pipeline components: ['tok2vec', 'textcat_multilabel']
[2022-08-25 12:17:56,099] [DEBUG] Loading corpus from path: valid.spacy
[2022-08-25 12:17:56,100] [DEBUG] Loading corpus from path: train.spacy
[2022-08-25 12:17:56,101] [DEBUG] Loading corpus from path: train.spacy
[2022-08-25 12:17:56,116] [DEBUG] Removed existing output directory: output_updated\model-best
[2022-08-25 12:17:56,130]

In [7]:
import mlflow
import mlflow.spacy
import spacy
from datetime import date

# Date stamp used to make the MLflow run name identifiable
today = date.today()
print("Today's date:", today)

mlflow.set_experiment(experiment_name="Spacy Model")

# MLflow Tracking
# Forward slashes keep the path portable. The previous literal
# 'output_updated\model-last' relied on the invalid escape sequence '\m'
# and only resolved to the model directory on Windows.
# NOTE(review): this loads the final checkpoint (model-last), not model-best;
# confirm that is intended.
nlp = spacy.load('output_updated/model-last')
with mlflow.start_run(run_name=f'Spacy_{today}_workin') as run:
    mlflow.set_tag('model_flavor', 'spacy')
    mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')
    # NOTE(review): this value is hard-coded, not computed in this notebook;
    # replace it with a measured score before relying on the logged metric.
    mlflow.log_metric('accuracy',0.72)
    # Keep the run id so the logged model can be addressed after the run closes
    my_run_id = run.info.run_id


# MLflow Models
# Reload the model just logged, via its runs:/ URI
model_uri = f'runs:/{my_run_id}/model'
nlp2 = mlflow.spacy.load_model(model_uri=model_uri)

Today's date: 2022-08-25




In [21]:
# Score one sample sentence with the model reloaded from MLflow and show
# its category scores
sample_text = "Barclays lost money"
doc = nlp2(sample_text)
doc.cats

{'positive_neutral': 0.005847441032528877, 'negative': 0.987448513507843}

In [117]:
# POST one sentence to the /invocations endpoint on 127.0.0.1:1234 as a
# pandas-split JSON payload with a single "text" column.
# NOTE(review): presumably a model server is already listening on that port; confirm before running.
!curl -X POST -H "Content-Type:application/json; format=pandas-split" --data "{\"columns\":[\"text\"],\"data\":[\"There is a negative new\"]}" http://127.0.0.1:1234/invocations


[{"predictions": {"positive_neutral": 0.9982155561447144, "negative": 0.0021095010451972485}}]


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   149  100    94  100    55   5875   3437 --:--:-- --:--:-- --:--:--  9312
