In [1]:
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.

For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import pandas as pd
from spacy.util import decaying
from tqdm import tqdm
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def train(model=None, output_dir=None, n_iter=200, label="Loaded_Language"):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    dropout = decaying(0.6, 0.2, 1e-4)

    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in tqdm(range(n_iter)):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=next(dropout),  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

   
    output_dir = "spacy/model"
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [33]:
from collections import defaultdict
import pathlib
import spacy
from spacy.tokens import Doc, Token
from torch import LongTensor
nlp = spacy.load('en')

from pathlib import Path

def read_data(directory, isLabels = True):
    ids = []
    texts = []
    labels = []
    for f in directory.glob('*.txt'):
        id = f.name.replace('article', '').replace('.txt','')
        ids.append(id)
        texts.append(f.read_text(encoding='utf-8'))
        if isLabels:
            labels.append(parse_label(f.as_posix().replace('.txt', '.task3.labels')))
    docs = list(nlp.pipe(texts))
    return ids, docs, labels

def parse_label(label_path):
    # idx, type, start, end
    labels = []
    f= Path(label_path)
    if not f.exists():
        return labels
    for line in open(label_path):
        parts = line.strip().split('\t')
        if ((parts[1]) == "Loaded_Language"):
            #print(parts[1])
            labels.append((int(parts[2]), int(parts[3]), parts[1]))
    return sorted(labels)

PROPAGANDA_TYPES = [
    "O",
    "Appeal_to_Authority",
    "Appeal_to_fear-prejudice",
    "Bandwagon",
    "Black-and-White_Fallacy",
    "Causal_Oversimplification",
    "Doubt",
    "Exaggeration,Minimisation",
    "Flag-Waving",
    "Loaded_Language",
    "Name_Calling,Labeling",
    "Obfuscation,Intentional_Vagueness,Confusion",
    "Red_Herring",
    "Reductio_ad_hitlerum",
    "Repetition",
    "Slogans",
    "Straw_Men",
    "Thought-terminating_Cliches",
    "Whataboutism",
    "B-Appeal_to_Authority",
    "B-Appeal_to_fear-prejudice",
    "B-Bandwagon",
    "B-Black-and-White_Fallacy",
    "B-Causal_Oversimplification",
    "B-Doubt",
    "B-Exaggeration,Minimisation",
    "B-Flag-Waving",
    "B-Loaded_Language",
    "B-Name_Calling,Labeling",
    "B-Obfuscation,Intentional_Vagueness,Confusion",
    "B-Red_Herring",
    "B-Reductio_ad_hitlerum",
    "B-Repetition",
    "B-Slogans",
    "B-Straw_Men",
    "B-Thought-terminating_Cliches",
    "B-Whataboutism"
]

PT2ID = {y: x for (x, y) in enumerate(PROPAGANDA_TYPES)}



In [41]:

directory = pathlib.Path('../task3/datasets-v5/tasks-2-3/test/')
ids, texts, labels = read_data(directory)

idnew = []
textnew = []
idn = []
TRAIN_DATA = []
for x, y, idx in zip(texts, labels, ids):
    if y:    
        idn.append(idx)
        textnew.append(y)
        new_dict = {"entities":y}
        tlist = [str(x), new_dict]
        TRAIN_DATA.append(tlist)


In [3]:
train(label = "Loaded_Language")

Created blank 'en' model


  0%|          | 1/200 [00:19<1:06:08, 19.94s/it]

Losses {'ner': 1048.464724261932}


  1%|          | 2/200 [00:38<1:04:20, 19.50s/it]

Losses {'ner': 268.042424036995}


  2%|▏         | 3/200 [00:56<1:03:00, 19.19s/it]

Losses {'ner': 211.02968073415374}


  2%|▏         | 4/200 [01:15<1:02:00, 18.98s/it]

Losses {'ner': 187.70705981722313}


  2%|▎         | 5/200 [01:33<1:01:11, 18.83s/it]

Losses {'ner': 172.0565302780451}


  3%|▎         | 6/200 [01:52<1:00:36, 18.75s/it]

Losses {'ner': 167.4834246199116}


  4%|▎         | 7/200 [02:10<1:00:06, 18.69s/it]

Losses {'ner': 164.85295867285186}


  4%|▍         | 8/200 [02:29<59:35, 18.62s/it]  

Losses {'ner': 163.50577611383142}


  4%|▍         | 9/200 [02:47<59:08, 18.58s/it]

Losses {'ner': 160.17380383903026}


  5%|▌         | 10/200 [03:06<58:41, 18.53s/it]

Losses {'ner': 159.13814214457204}


  6%|▌         | 11/200 [03:24<58:17, 18.50s/it]

Losses {'ner': 153.18158692923566}


  6%|▌         | 12/200 [03:43<57:53, 18.48s/it]

Losses {'ner': 150.82139112343532}


  6%|▋         | 13/200 [04:01<57:30, 18.45s/it]

Losses {'ner': 150.67189189560077}


  7%|▋         | 14/200 [04:19<57:06, 18.42s/it]

Losses {'ner': 144.13422651952794}


  8%|▊         | 15/200 [04:38<56:44, 18.40s/it]

Losses {'ner': 144.2051796039687}


  8%|▊         | 16/200 [04:58<58:15, 19.00s/it]

Losses {'ner': 140.58557825653037}


  8%|▊         | 17/200 [05:18<59:08, 19.39s/it]

Losses {'ner': 136.2022436467198}


  9%|▉         | 18/200 [05:37<58:26, 19.27s/it]

Losses {'ner': 134.01771939853208}


 10%|▉         | 19/200 [05:57<58:04, 19.25s/it]

Losses {'ner': 129.13807520039114}


 10%|█         | 20/200 [06:16<57:38, 19.21s/it]

Losses {'ner': 130.3436116171508}


 10%|█         | 21/200 [06:36<58:26, 19.59s/it]

Losses {'ner': 126.54670015610863}


 11%|█         | 22/200 [06:57<59:04, 19.91s/it]

Losses {'ner': 123.38691933699799}


 12%|█▏        | 23/200 [07:17<58:37, 19.87s/it]

Losses {'ner': 120.41848251481167}


 12%|█▏        | 24/200 [07:36<57:51, 19.72s/it]

Losses {'ner': 119.0087340519154}


 12%|█▎        | 25/200 [07:55<57:03, 19.56s/it]

Losses {'ner': 115.88331804497622}


 13%|█▎        | 26/200 [08:15<56:51, 19.60s/it]

Losses {'ner': 113.66890166468536}


 14%|█▎        | 27/200 [08:36<57:52, 20.07s/it]

Losses {'ner': 111.83220834035751}


 14%|█▍        | 28/200 [08:57<58:15, 20.32s/it]

Losses {'ner': 104.90650877940105}


 14%|█▍        | 29/200 [09:16<56:22, 19.78s/it]

Losses {'ner': 99.28041443015869}


 15%|█▌        | 30/200 [09:34<55:03, 19.43s/it]

Losses {'ner': 102.06861685021431}


 16%|█▌        | 31/200 [09:53<54:12, 19.25s/it]

Losses {'ner': 96.40874687990828}


 16%|█▌        | 32/200 [10:11<53:10, 18.99s/it]

Losses {'ner': 92.82458110462835}


 16%|█▋        | 33/200 [10:30<52:24, 18.83s/it]

Losses {'ner': 92.30454866379023}


 17%|█▋        | 34/200 [10:48<51:54, 18.76s/it]

Losses {'ner': 90.65881635547206}


 18%|█▊        | 35/200 [11:07<51:27, 18.71s/it]

Losses {'ner': 87.51894023510557}


 18%|█▊        | 36/200 [11:26<51:02, 18.67s/it]

Losses {'ner': 81.19708812469145}


 18%|█▊        | 37/200 [11:44<50:39, 18.65s/it]

Losses {'ner': 79.23298492994005}


 19%|█▉        | 38/200 [12:03<50:20, 18.64s/it]

Losses {'ner': 81.83838079966026}


 20%|█▉        | 39/200 [12:21<50:01, 18.64s/it]

Losses {'ner': 72.40499946517534}


 20%|██        | 40/200 [12:40<49:41, 18.63s/it]

Losses {'ner': 73.51490769587411}


 20%|██        | 41/200 [12:59<49:25, 18.65s/it]

Losses {'ner': 71.65435759768546}


 21%|██        | 42/200 [13:17<49:10, 18.67s/it]

Losses {'ner': 66.47763114139919}


 22%|██▏       | 43/200 [13:36<48:58, 18.72s/it]

Losses {'ner': 64.28415765934352}


 22%|██▏       | 44/200 [13:55<48:52, 18.80s/it]

Losses {'ner': 66.10515300671481}


 22%|██▎       | 45/200 [14:14<48:41, 18.85s/it]

Losses {'ner': 58.71135610993863}


 23%|██▎       | 46/200 [14:33<48:34, 18.92s/it]

Losses {'ner': 61.123695594838956}


 24%|██▎       | 47/200 [14:53<48:45, 19.12s/it]

Losses {'ner': 59.766344675223905}


 24%|██▍       | 48/200 [15:12<48:38, 19.20s/it]

Losses {'ner': 58.0797355346647}


 24%|██▍       | 49/200 [15:31<48:16, 19.19s/it]

Losses {'ner': 53.96083570401865}


 25%|██▌       | 50/200 [15:51<48:00, 19.20s/it]

Losses {'ner': 51.709781919640925}


 26%|██▌       | 51/200 [16:10<47:39, 19.19s/it]

Losses {'ner': 50.616987901984736}


 26%|██▌       | 52/200 [16:31<48:24, 19.63s/it]

Losses {'ner': 55.15175527745243}


 26%|██▋       | 53/200 [16:52<49:35, 20.24s/it]

Losses {'ner': 51.259390009168946}


 27%|██▋       | 54/200 [17:15<51:11, 21.04s/it]

Losses {'ner': 46.721642854963925}


 28%|██▊       | 55/200 [17:38<52:26, 21.70s/it]

Losses {'ner': 47.065971905949496}


 28%|██▊       | 56/200 [18:01<53:04, 22.11s/it]

Losses {'ner': 44.67312799696229}


 28%|██▊       | 57/200 [18:24<53:19, 22.38s/it]

Losses {'ner': 41.04682662383316}


 29%|██▉       | 58/200 [18:49<54:36, 23.07s/it]

Losses {'ner': 38.99768948245766}


 30%|██▉       | 59/200 [19:13<54:26, 23.17s/it]

Losses {'ner': 44.57568415970128}


 30%|███       | 60/200 [19:35<53:37, 22.98s/it]

Losses {'ner': 41.072643276684495}


 30%|███       | 61/200 [19:55<51:07, 22.07s/it]

Losses {'ner': 38.74952295724137}


 31%|███       | 62/200 [20:14<48:20, 21.01s/it]

Losses {'ner': 36.55581304605464}


 32%|███▏      | 63/200 [20:32<46:13, 20.24s/it]

Losses {'ner': 33.52595945905138}


 32%|███▏      | 64/200 [20:50<44:37, 19.69s/it]

Losses {'ner': 34.32006874659133}


 32%|███▎      | 65/200 [21:09<43:30, 19.34s/it]

Losses {'ner': 34.57586977319176}


 33%|███▎      | 66/200 [21:27<42:36, 19.08s/it]

Losses {'ner': 36.22120102802184}


 34%|███▎      | 67/200 [21:46<41:58, 18.93s/it]

Losses {'ner': 34.09491180137588}


 34%|███▍      | 68/200 [22:04<41:22, 18.81s/it]

Losses {'ner': 31.858540616171524}


 34%|███▍      | 69/200 [22:23<40:50, 18.71s/it]

Losses {'ner': 30.280907169077317}


 35%|███▌      | 70/200 [22:41<40:23, 18.64s/it]

Losses {'ner': 28.059329424006783}


 36%|███▌      | 71/200 [23:00<40:07, 18.66s/it]

Losses {'ner': 27.725436719745993}


 36%|███▌      | 72/200 [23:19<40:05, 18.79s/it]

Losses {'ner': 25.910079410431695}


 36%|███▋      | 73/200 [23:39<40:09, 18.97s/it]

Losses {'ner': 26.14361572033701}


 37%|███▋      | 74/200 [23:58<40:18, 19.19s/it]

Losses {'ner': 25.6130621968784}


 38%|███▊      | 75/200 [24:18<40:31, 19.45s/it]

Losses {'ner': 22.657918211299233}


 38%|███▊      | 76/200 [24:39<40:43, 19.70s/it]

Losses {'ner': 24.748576141273414}


 38%|███▊      | 77/200 [24:59<40:57, 19.98s/it]

Losses {'ner': 27.01057387957754}


 39%|███▉      | 78/200 [25:20<41:07, 20.22s/it]

Losses {'ner': 24.58286631032232}


 40%|███▉      | 79/200 [25:41<41:12, 20.43s/it]

Losses {'ner': 26.575078056307703}


 40%|████      | 80/200 [26:02<41:13, 20.61s/it]

Losses {'ner': 24.01859720996199}


 40%|████      | 81/200 [26:23<41:04, 20.71s/it]

Losses {'ner': 24.016006673451564}


 41%|████      | 82/200 [26:44<40:59, 20.85s/it]

Losses {'ner': 24.636315120209545}


 42%|████▏     | 83/200 [27:05<40:46, 20.91s/it]

Losses {'ner': 21.672845683424377}


 42%|████▏     | 84/200 [27:26<40:27, 20.93s/it]

Losses {'ner': 20.330661828954447}


 42%|████▎     | 85/200 [27:47<40:08, 20.94s/it]

Losses {'ner': 21.211604145251926}


 43%|████▎     | 86/200 [28:08<39:52, 20.98s/it]

Losses {'ner': 20.19263118880356}


 44%|████▎     | 87/200 [28:29<39:33, 21.00s/it]

Losses {'ner': 21.27376783222458}


 44%|████▍     | 88/200 [28:50<39:11, 21.00s/it]

Losses {'ner': 20.161913688068687}


 44%|████▍     | 89/200 [29:11<38:50, 20.99s/it]

Losses {'ner': 19.109434946579228}


 45%|████▌     | 90/200 [29:32<38:25, 20.96s/it]

Losses {'ner': 16.740006155695443}


 46%|████▌     | 91/200 [29:53<38:07, 20.98s/it]

Losses {'ner': 19.33611549291136}


 46%|████▌     | 92/200 [30:14<37:48, 21.00s/it]

Losses {'ner': 15.85143068035701}


 46%|████▋     | 93/200 [30:35<37:26, 20.99s/it]

Losses {'ner': 16.78197391483551}


 47%|████▋     | 94/200 [30:56<37:05, 20.99s/it]

Losses {'ner': 17.775272244500755}


 48%|████▊     | 95/200 [31:17<36:45, 21.00s/it]

Losses {'ner': 15.065164656060402}


 48%|████▊     | 96/200 [31:38<36:20, 20.97s/it]

Losses {'ner': 20.619207853025927}


 48%|████▊     | 97/200 [31:59<35:59, 20.96s/it]

Losses {'ner': 14.93192188535756}


 49%|████▉     | 98/200 [32:20<35:37, 20.96s/it]

Losses {'ner': 16.28507729272628}


 50%|████▉     | 99/200 [32:41<35:17, 20.96s/it]

Losses {'ner': 15.971288146705053}


 50%|█████     | 100/200 [33:02<34:58, 20.99s/it]

Losses {'ner': 15.373879204690267}


 50%|█████     | 101/200 [33:23<34:41, 21.02s/it]

Losses {'ner': 16.977942768883867}


 51%|█████     | 102/200 [33:44<34:21, 21.04s/it]

Losses {'ner': 14.907488318570818}


 52%|█████▏    | 103/200 [34:05<33:59, 21.03s/it]

Losses {'ner': 15.013209997765072}


 52%|█████▏    | 104/200 [34:26<33:39, 21.03s/it]

Losses {'ner': 16.633307972226877}


 52%|█████▎    | 105/200 [34:47<33:21, 21.07s/it]

Losses {'ner': 14.337989209207882}


 53%|█████▎    | 106/200 [35:09<33:04, 21.11s/it]

Losses {'ner': 14.255448174905547}


 54%|█████▎    | 107/200 [35:30<32:39, 21.07s/it]

Losses {'ner': 12.43386030304055}


 54%|█████▍    | 108/200 [35:51<32:15, 21.04s/it]

Losses {'ner': 12.646454728495979}


 55%|█████▍    | 109/200 [36:12<32:00, 21.11s/it]

Losses {'ner': 12.233337626621509}


 55%|█████▌    | 110/200 [36:33<31:36, 21.07s/it]

Losses {'ner': 13.124219836098518}


 56%|█████▌    | 111/200 [36:54<31:12, 21.04s/it]

Losses {'ner': 12.71213231200662}


 56%|█████▌    | 112/200 [37:15<30:51, 21.04s/it]

Losses {'ner': 12.939195266826163}


 56%|█████▋    | 113/200 [37:36<30:31, 21.05s/it]

Losses {'ner': 12.445314689957243}


 57%|█████▋    | 114/200 [37:57<30:07, 21.02s/it]

Losses {'ner': 11.988001447253612}


 57%|█████▊    | 115/200 [38:18<29:44, 21.00s/it]

Losses {'ner': 12.03247641977478}


 58%|█████▊    | 116/200 [38:39<29:25, 21.01s/it]

Losses {'ner': 12.903073944063086}


 58%|█████▊    | 117/200 [39:00<29:04, 21.02s/it]

Losses {'ner': 14.155239925785182}


 59%|█████▉    | 118/200 [39:21<28:47, 21.07s/it]

Losses {'ner': 11.041789620959397}


 60%|█████▉    | 119/200 [39:42<28:32, 21.15s/it]

Losses {'ner': 11.922151442157489}


 60%|██████    | 120/200 [40:03<28:10, 21.13s/it]

Losses {'ner': 11.70549233543972}


 60%|██████    | 121/200 [40:25<27:52, 21.17s/it]

Losses {'ner': 10.774850460794005}


 61%|██████    | 122/200 [40:46<27:36, 21.23s/it]

Losses {'ner': 10.833121820831481}


 62%|██████▏   | 123/200 [41:07<27:16, 21.25s/it]

Losses {'ner': 11.01442683892688}


 62%|██████▏   | 124/200 [41:28<26:50, 21.19s/it]

Losses {'ner': 10.581273430616788}


 62%|██████▎   | 125/200 [41:49<26:25, 21.14s/it]

Losses {'ner': 10.566036255056956}


 63%|██████▎   | 126/200 [42:10<25:58, 21.05s/it]

Losses {'ner': 10.03282661963102}


 64%|██████▎   | 127/200 [42:31<25:35, 21.04s/it]

Losses {'ner': 11.804257704114486}


 64%|██████▍   | 128/200 [42:52<25:11, 20.99s/it]

Losses {'ner': 10.478929582255246}


 64%|██████▍   | 129/200 [43:13<24:49, 20.97s/it]

Losses {'ner': 8.164574584145598}


 65%|██████▌   | 130/200 [43:34<24:28, 20.98s/it]

Losses {'ner': 11.161701941379356}


 66%|██████▌   | 131/200 [43:55<24:06, 20.97s/it]

Losses {'ner': 10.826890984655032}


 66%|██████▌   | 132/200 [44:16<23:46, 20.98s/it]

Losses {'ner': 8.65759845969682}


 66%|██████▋   | 133/200 [44:37<23:26, 21.00s/it]

Losses {'ner': 9.740933093421678}


 67%|██████▋   | 134/200 [44:58<23:05, 20.99s/it]

Losses {'ner': 9.654359571314739}


 68%|██████▊   | 135/200 [45:19<22:42, 20.97s/it]

Losses {'ner': 10.54388606961424}


 68%|██████▊   | 136/200 [45:40<22:20, 20.94s/it]

Losses {'ner': 10.316789188698593}


 68%|██████▊   | 137/200 [46:01<21:58, 20.93s/it]

Losses {'ner': 9.340421091990779}


 69%|██████▉   | 138/200 [46:22<21:38, 20.94s/it]

Losses {'ner': 9.896169591200993}


 70%|██████▉   | 139/200 [46:43<21:17, 20.95s/it]

Losses {'ner': 9.25399959459744}


 70%|███████   | 140/200 [47:04<20:57, 20.96s/it]

Losses {'ner': 9.379932796581048}


 70%|███████   | 141/200 [47:25<20:36, 20.97s/it]

Losses {'ner': 10.21959299303016}


 71%|███████   | 142/200 [47:46<20:15, 20.95s/it]

Losses {'ner': 9.80596370918778}


 72%|███████▏  | 143/200 [48:06<19:52, 20.92s/it]

Losses {'ner': 9.592407496277653}


 72%|███████▏  | 144/200 [48:27<19:31, 20.92s/it]

Losses {'ner': 9.539601353711639}


 72%|███████▎  | 145/200 [48:48<19:12, 20.95s/it]

Losses {'ner': 8.400271188752509}


 73%|███████▎  | 146/200 [49:09<18:50, 20.93s/it]

Losses {'ner': 9.045137805399584}


 74%|███████▎  | 147/200 [49:30<18:29, 20.93s/it]

Losses {'ner': 9.842402659590405}


 74%|███████▍  | 148/200 [49:51<18:13, 21.03s/it]

Losses {'ner': 8.266409579305677}


 74%|███████▍  | 149/200 [50:12<17:51, 21.01s/it]

Losses {'ner': 8.414301068670905}


 75%|███████▌  | 150/200 [50:33<17:30, 21.00s/it]

Losses {'ner': 8.01509803131098}


 76%|███████▌  | 151/200 [50:54<17:09, 21.00s/it]

Losses {'ner': 6.667833725886291}


 76%|███████▌  | 152/200 [51:16<16:49, 21.03s/it]

Losses {'ner': 6.901134085064466}


 76%|███████▋  | 153/200 [51:36<16:26, 20.99s/it]

Losses {'ner': 8.766218013659648}


 77%|███████▋  | 154/200 [51:58<16:07, 21.03s/it]

Losses {'ner': 7.709330199112928}


 78%|███████▊  | 155/200 [52:19<15:45, 21.02s/it]

Losses {'ner': 7.04141831514646}


 78%|███████▊  | 156/200 [52:40<15:24, 21.02s/it]

Losses {'ner': 8.938518928275002}


 78%|███████▊  | 157/200 [53:01<15:03, 21.00s/it]

Losses {'ner': 6.820578017664389}


 79%|███████▉  | 158/200 [53:22<14:42, 21.02s/it]

Losses {'ner': 6.879965945786554}


 80%|███████▉  | 159/200 [53:43<14:22, 21.05s/it]

Losses {'ner': 6.686427203426162}


 80%|████████  | 160/200 [54:04<14:02, 21.07s/it]

Losses {'ner': 6.977196755212512}


 80%|████████  | 161/200 [54:25<13:40, 21.05s/it]

Losses {'ner': 7.586260093712144}


 81%|████████  | 162/200 [54:46<13:20, 21.06s/it]

Losses {'ner': 5.975320568035454}


 82%|████████▏ | 163/200 [55:07<12:58, 21.04s/it]

Losses {'ner': 6.619361695190838}


 82%|████████▏ | 164/200 [55:28<12:37, 21.05s/it]

Losses {'ner': 7.837702067500126}


 82%|████████▎ | 165/200 [55:49<12:16, 21.03s/it]

Losses {'ner': 6.771799085918621}


 83%|████████▎ | 166/200 [56:10<11:54, 21.03s/it]

Losses {'ner': 6.798376261159272}


 84%|████████▎ | 167/200 [56:31<11:34, 21.05s/it]

Losses {'ner': 6.784056674701372}


 84%|████████▍ | 168/200 [56:52<11:13, 21.05s/it]

Losses {'ner': 7.571662164814864}


 84%|████████▍ | 169/200 [57:13<10:52, 21.06s/it]

Losses {'ner': 6.213257077459141}


 85%|████████▌ | 170/200 [57:34<10:32, 21.09s/it]

Losses {'ner': 7.0322339508983776}


 86%|████████▌ | 171/200 [57:55<10:10, 21.05s/it]

Losses {'ner': 7.918555041852124}


 86%|████████▌ | 172/200 [58:16<09:50, 21.08s/it]

Losses {'ner': 5.8100739137333335}


 86%|████████▋ | 173/200 [58:38<09:30, 21.14s/it]

Losses {'ner': 6.277828105839868}


 87%|████████▋ | 174/200 [58:59<09:10, 21.16s/it]

Losses {'ner': 5.2651647364721015}


 88%|████████▊ | 175/200 [59:20<08:50, 21.20s/it]

Losses {'ner': 5.434310976645955}


 88%|████████▊ | 176/200 [59:42<08:29, 21.24s/it]

Losses {'ner': 6.690585658705419}


 88%|████████▊ | 177/200 [1:00:03<08:10, 21.32s/it]

Losses {'ner': 6.7551557611233095}


 89%|████████▉ | 178/200 [1:00:25<07:51, 21.43s/it]

Losses {'ner': 6.765581199210885}


 90%|████████▉ | 179/200 [1:00:46<07:28, 21.37s/it]

Losses {'ner': 7.458705012662227}


 90%|█████████ | 180/200 [1:01:07<07:05, 21.30s/it]

Losses {'ner': 5.330478786368008}


 90%|█████████ | 181/200 [1:01:28<06:43, 21.22s/it]

Losses {'ner': 5.699834147256227}


 91%|█████████ | 182/200 [1:01:49<06:22, 21.24s/it]

Losses {'ner': 6.65225279103903}


 92%|█████████▏| 183/200 [1:02:11<06:01, 21.24s/it]

Losses {'ner': 6.63090064193594}


 92%|█████████▏| 184/200 [1:02:32<05:39, 21.21s/it]

Losses {'ner': 7.289410451492381}


 92%|█████████▎| 185/200 [1:02:53<05:17, 21.19s/it]

Losses {'ner': 6.722768778680419}


 93%|█████████▎| 186/200 [1:03:14<04:56, 21.17s/it]

Losses {'ner': 7.361140222349716}


 94%|█████████▎| 187/200 [1:03:35<04:34, 21.13s/it]

Losses {'ner': 6.600860188507861}


 94%|█████████▍| 188/200 [1:03:56<04:13, 21.09s/it]

Losses {'ner': 7.017180221996939}


 94%|█████████▍| 189/200 [1:04:17<03:51, 21.08s/it]

Losses {'ner': 5.5177874887489695}


 95%|█████████▌| 190/200 [1:04:38<03:30, 21.04s/it]

Losses {'ner': 6.370852583517594}


 96%|█████████▌| 191/200 [1:04:59<03:09, 21.04s/it]

Losses {'ner': 6.320042024002534}


 96%|█████████▌| 192/200 [1:05:20<02:48, 21.02s/it]

Losses {'ner': 6.464296965857849}


 96%|█████████▋| 193/200 [1:05:41<02:27, 21.09s/it]

Losses {'ner': 5.960574503401474}


 97%|█████████▋| 194/200 [1:06:03<02:07, 21.21s/it]

Losses {'ner': 6.533199276107758}


 98%|█████████▊| 195/200 [1:06:24<01:46, 21.23s/it]

Losses {'ner': 5.430544374156393}


 98%|█████████▊| 196/200 [1:06:45<01:24, 21.18s/it]

Losses {'ner': 5.484712155914632}


 98%|█████████▊| 197/200 [1:07:06<01:03, 21.18s/it]

Losses {'ner': 4.826865540936239}


 99%|█████████▉| 198/200 [1:07:28<00:42, 21.19s/it]

Losses {'ner': 5.478014829023268}


100%|█████████▉| 199/200 [1:07:49<00:21, 21.17s/it]

Losses {'ner': 5.044219627136054}


100%|██████████| 200/200 [1:08:10<00:00, 21.15s/it]

Losses {'ner': 5.666175761633694}
Saved model to spacy/model





In [42]:
## Test Model
nlp2 = spacy.load('spacy/model')
counter = 0
id_csv = []
start_csv = []
end_csv = []
label_csv = []
label = "Loaded_Language"
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    currentid = idn[counter]
    for ent in doc.ents:
        x = ent
        a = x[0]
        start = a.idx
        a = x[-1]
        end = a.idx+len(a)
        start_csv.append(start)
        end_csv.append(end)
        id_csv.append(currentid)
        label_csv.append(label)
    counter = counter + 1
    #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])



In [43]:
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    currentid = ids[counter]
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    for ent in doc.ents:
        print (ent)
        print(ent[0].idx)
    break

Entities [('This severe and unprecedented', 'Loaded_Language')]
This severe and unprecedented
2476


In [44]:
df = {"ID": id_csv, "L": label_csv, "S": start_csv, "E": end_csv}
df = pd.DataFrame(df)
df.to_csv('../task3/predictions_spacy.csv', sep='\t', index=False, header=False) 

In [45]:
df

Unnamed: 0,ID,L,S,E
0,999001323,Loaded_Language,2476,2505
1,783774960,Loaded_Language,4391,4401
2,779309765,Loaded_Language,169,178
3,779309765,Loaded_Language,2511,2516
4,779309765,Loaded_Language,2957,2974
5,999001256,Loaded_Language,571,583
6,707772906,Loaded_Language,3571,3585
7,730019938,Loaded_Language,705,726
8,730019938,Loaded_Language,1080,1088
9,730019938,Loaded_Language,1360,1365
