In [1]:
import spacy
import pandas as pd
from tqdm import tqdm
from spacy.tokens import DocBin

train_size = 100000
valid_size = 20000
nlp = spacy.load("en_core_web_sm")

In [2]:
# dataset: https://www.kaggle.com/datasets/kazanova/sentiment140
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv", 
    encoding='latin-1'
)
df.columns = ['sentiment', 'ids', 'date', 'flags', 'user', 'text']
df = df.drop(['ids', 'date', 'flags', 'user'], axis=1)
df.head()

Unnamed: 0,sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [3]:
df = df[df['sentiment'] != 2]
df['sentiment'] = df['sentiment'].map({0:'negative', 4:'positive'})
df = df[['text', 'sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,is upset that he can't update his Facebook by ...,negative
1,@Kenichan I dived many times for the ball. Man...,negative
2,my whole body feels itchy and like its on fire,negative
3,"@nationwideclass no, it's not behaving at all....",negative
4,@Kwesidei not the whole crew,negative


In [4]:
df = df.sample(frac=0.5)

In [5]:
df['sentiment'].value_counts()

sentiment
positive    400131
negative    399869
Name: count, dtype: int64

In [6]:
data = [tuple(df.iloc[i].values) for i in range(train_size + valid_size)]

In [7]:
df['sentiment'].value_counts()

sentiment
positive    400131
negative    399869
Name: count, dtype: int64

In [8]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    # a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        if label == 'negative':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        # we need to set the (text)cat(egory) for each document
        #doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

In [9]:
train_data = data[:train_size]
valid_data = data[train_size:train_size + valid_size]

# first we need to transform all the training data
train_docs = make_docs(train_data)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [05:59<00:00, 278.23it/s]


In [10]:

# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

100%|██████████| 20000/20000 [00:37<00:00, 530.18it/s]


на этом месте мы идем в https://spacy.io/usage/training#quickstart, там настраиваем под себя конфиг (textcat), копируем его руками(!) в base_config.cfg, указываем правильные пути до трейн и вэлид


In [11]:
!python -m spacy init fill-config ./base_config.cfg ./config.cfg


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
!python -m spacy train config.cfg --output ./output

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       41.29    0.41
  0     200         47.12       66.97    0.67
  0     400         43.19       69.66    0.70
  0     600         41.84       71.14    0.71
  0     800         38.49       72.86    0.73
  0    1000         38.58       73.74    0.74
  0    1200         37.17       74.89    0.75
  0    1400         36.13       75.50    0.76
  0    1600         34.62       76.10    0.76
  0    1800         34.51       76.36    0.76
  0    2000         34.37       76.49    0.76
  0    2200         33.44       77.18    0.77
  0    2400         32.96       77.46    0.77
  0    2600         31.98       77.66    0.78
  0    2800         32.36       77.65    0.78
  0    3000        

In [13]:
import spacy
# load thebest model from training
nlp = spacy.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(text)
    print(doc.cats)
    if doc.cats['positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")
    print()

type : ‘quit’ to exit
fuck you little runt
{'positive': 0.5936747193336487, 'negative': 0.4063253104686737}
the sentiment is positive

fuck
{'positive': 0.4792467951774597, 'negative': 0.5207532048225403}
the sentiment is negative

good afternoon
{'positive': 0.7237020134925842, 'negative': 0.27629798650741577}
the sentiment is positive

hello, gral to see you
{'positive': 0.8635736703872681, 'negative': 0.13642629981040955}
the sentiment is positive

hi, my name is Alexey
{'positive': 0.8005737662315369, 'negative': 0.19942623376846313}
the sentiment is positive

what a terrible condition you have
{'positive': 0.2905135154724121, 'negative': 0.7094864845275879}
the sentiment is negative

how are you doing, my friend
{'positive': 0.6532986164093018, 'negative': 0.34670135378837585}
the sentiment is positive

you dick
{'positive': 0.6553702354431152, 'negative': 0.34462976455688477}
the sentiment is positive

d
{'positive': 0.5631621479988098, 'negative': 0.4368378520011902}
the sentime