In [1]:
import spacy
import pandas as pd
from tqdm import tqdm
from spacy.tokens import DocBin

train_size = 50000
valid_size = 20000
nlp = spacy.load("en_core_web_sm")

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset: https://www.kaggle.com/datasets/kazanova/sentiment140
df = pd.read_csv("archive/training.1600000.processed.noemoticon.csv", encoding='latin-1')
df = df.drop(['ids', 'date', 'flags', 'user'], axis=1)
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [3]:
df = df[df['sentiment'] != 2]
df['sentiment'] = df['sentiment'].map({0:'negative', 4:'positive'})
df = df[['text', 'sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,is upset that he can't update his Facebook by ...,negative
2,@Kenichan I dived many times for the ball. Man...,negative
3,my whole body feels itchy and like its on fire,negative
4,"@nationwideclass no, it's not behaving at all....",negative


In [4]:
df = df.sample(frac=0.5)

In [5]:
df['sentiment'].value_counts()

negative    400650
positive    399350
Name: sentiment, dtype: int64

In [6]:
data = [tuple(df.iloc[i].values) for i in range(train_size + valid_size)]

In [7]:
df['sentiment'].value_counts()

negative    400650
positive    399350
Name: sentiment, dtype: int64

In [8]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    # a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        if label == 'negative':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        # we need to set the (text)cat(egory) for each document
        #doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

In [9]:
train_data = data[:train_size]
valid_data = data[train_size:train_size + valid_size]

# first we need to transform all the training data
train_docs = make_docs(train_data)

100%|██████████| 50000/50000 [01:20<00:00, 617.73it/s]


In [10]:

# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

100%|██████████| 20000/20000 [00:30<00:00, 658.81it/s]


на этом месте мы идем в https://spacy.io/usage/training#quickstart, там настраиваем под себя конфиг (textcat), копируем его руками(!) в base_config.cfg, указываем правильные пути до трейн и вэлид


In [11]:
!python3 -m spacy init fill-config ./base_config.cfg ./config.cfg


  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
!python3 -m spacy train config.cfg --output ./output

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       51.68    0.52
  0     200         46.01       65.83    0.66
  0     400         42.59       69.84    0.70
  0     600         40.88       70.38    0.70
  0     800         39.20       72.47    0.72
  0    1000         37.90       73.23    0.73
  0    1200         36.65       74.25    0.74
  0    1400         36.39       75.09    0.75
  0    1600         35.57       75.57    0.76
  0    1800         34.56       76.42    0.76
  0    2000         33.63       76.58    0.77
  0    2200         33.26       77.27    0.77
  1    2400         27.69       77.23    0.77
  1    2600         26.76       77.4

In [14]:
import spacy
# load thebest model from training
nlp = spacy.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(text)
    print(doc.cats)
    if doc.cats['positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")
    print()

type : ‘quit’ to exit
get out of here you little pice of crap
{'positive': 0.3781455159187317, 'negative': 0.6218544840812683}
the sentiment is negative

happy birthday Samantha
{'positive': 0.791291356086731, 'negative': 0.20870865881443024}
the sentiment is positive

He looks great today
{'positive': 0.775345504283905, 'negative': 0.22465446591377258}
the sentiment is positive

Scientists did fantastic job making him better at cooking
{'positive': 0.8339892029762268, 'negative': 0.1660108119249344}
the sentiment is positive

How about you keep your mouth shut?
{'positive': 0.6757451891899109, 'negative': 0.3242548108100891}
the sentiment is positive

Touch grass discord mod
{'positive': 0.5015324950218201, 'negative': 0.4984675347805023}
the sentiment is positive

Crap - how terrible to see him smiling over his friend's death
{'positive': 0.32396501302719116, 'negative': 0.6760349869728088}
the sentiment is negative

quit
{'positive': 0.5347508788108826, 'negative': 0.465249091386795