In [2]:
import pickle

from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load the sentence-bert model from the HuggingFace model hub

#https://github.com/UKPLab/sentence-transformers

tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
model = AutoModel.from_pretrained('deepset/sentence_bert')

sentence = 'Who are you voting for in 2020?'
labels = ['business', 'art & culture', 'politics']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

Some weights of the model checkpoint at deepset/sentence_bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


label: politics 	 similarity: 0.21561527252197266
label: business 	 similarity: 0.0045240651816129684
label: art & culture 	 similarity: -0.027396898716688156




In [4]:
data = []
# with open('/Users/shrikritisingh/Downloads/15rows.pickle', 'rb') as f:
with open('thesaurus.pickle', 'rb') as f:
    thesaurus = pickle.load(f)

In [5]:
with open('preprocessedData.pickle', 'rb') as f:
    data = pickle.load(f)

In [26]:
data.text[7]

'Event and Historical Information:The POLTALLOCH was a steel four-masted bark of 2,138 net tons and measuring 86.7 x 12.8 x 7.4m. The vessel had four masts, rigged with royal sails above double top and topgallant sails, and was built in 1893 at Belfast by Workman, Clark &amp Co. Ltd. At time of loss, the POLTALLOCH was owned by Eschen &amp Minor of the US and was on voyage from Caleta Buena in Chile to Leith via Queenstown with a cargo of nitrate soda. On 2 January 1916 the vessel ran aground on St. Patricks Causeway, 13 miles north west of Barmouth. The captain and five of the crew of twenty-five landed in their boat at Harlech the following morning. A Liverpool Salvage Association cable stated on 5 January that the POLTALLOCH had a very heavy port list, and the cargo was melting. Due to such a heavy list, tugs did not attempt to tow the vessel, and by 10 January the ship had turned over on her broadside and became a total loss. Five Germans were among the crew of the POLTALLOCH who w

In [6]:
sentence = data.text[7]
labels = ['fishing vessel', 'harbour service craft', 'high speed launch',
       'ketch', 'landing craft', 'launch', 'lighter', 'lugger',
       'maritime craft', 'motor drifter', 'motor trawler', 'motor vessel',
       'naval support vessel', 'recreational vessel', 'barge', 'barque',
       'brig', 'cargo vessel', 'craft', 'customs boat', 'dredger',
       'drifter', 'east indiaman', 'escort vessel',
       'sailing vessel by form type', 'passenger vessel', 'patrol vessel',
       'pinnace', 'yacht', 'yawl', 'sailing vessel by rig', 'schooner',
       'service vessel', 'ship of the line', 'sloop', 'smack',
       'square rigged vessel', 'steam drifter', 'steam tug', 'submarine',
       'tanker', 'transport vessel', 'trawler', 'trinity house vessel',
       'tug', 'warship', 'wherry', 'factory ship', 'rescue vessel']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

label: harbour service craft 	 similarity: 0.22816035151481628
label: trawler 	 similarity: 0.21835929155349731
label: barge 	 similarity: 0.2153700292110443
label: schooner 	 similarity: 0.21411468088626862
label: customs boat 	 similarity: 0.195928692817688
label: ship of the line 	 similarity: 0.19422584772109985
label: tanker 	 similarity: 0.185998797416687
label: rescue vessel 	 similarity: 0.18187850713729858
label: dredger 	 similarity: 0.17204241454601288
label: submarine 	 similarity: 0.16963952779769897
label: sailing vessel by rig 	 similarity: 0.1694379597902298
label: steam tug 	 similarity: 0.16848576068878174
label: tug 	 similarity: 0.16735532879829407
label: sailing vessel by form type 	 similarity: 0.1625986397266388
label: barque 	 similarity: 0.16130493581295013
label: ketch 	 similarity: 0.15837404131889343
label: sloop 	 similarity: 0.15647214651107788
label: factory ship 	 similarity: 0.15111039578914642
label: motor trawler 	 similarity: 0.14955158531665802
labe

In [19]:
data['res'] = data['text'].isin(labels)

In [21]:
data[data.res==True]

Unnamed: 0,text,res


In [35]:
thesaurus.BROAD_TERM.value_counts()

maritime craft                 43
sailing vessel by form type    27
warship                        16
barge                           9
fishing vessel                  8
service vessel                  7
dredger                         6
ship of the line                6
naval support vessel            6
cargo vessel                    6
yacht                           6
trawler                         5
square rigged vessel            5
sailing vessel by rig           4
passenger vessel                3
drifter                         3
schooner                        3
east indiaman                   3
transport vessel                2
harbour service craft           2
tug                             2
launch                          2
rescue vessel                   2
craft                           2
brig                            2
ketch                           2
landing craft                   2
escort vessel                   1
steam tug                       1
factory ship  

In [36]:
thesaurus.TERM.value_counts()

billyboy                     2
steam yacht                  2
motor yacht                  2
boat                         2
requisitioned steam yacht    2
                            ..
concrete barge               1
dumb barge                   1
dumb concrete barge          1
dumb hopper barge            1
kuff                         1
Name: TERM, Length: 199, dtype: int64

In [7]:
df = thesaurus[thesaurus.groupby('TERM')['TERM'].transform('size') > 1]

In [8]:
df = df.rename({'TERM': 'NARROW_TERM'}, axis=1)

In [9]:
df.sort_values(by=['NARROW_TERM'])

Unnamed: 0,NARROW_TERM,BROAD_TERM,Match Getty,Target Concept,Getty BT
71,billyboy,barge,broad match,sailing vessels by rig type,sailing vessels
195,billyboy,sailing vessel by form type,related match,coasters,watercraft by location or context
22,boat,maritime craft,exact match,boats,watercraft by general type
197,boat,sailing vessel by form type,exact match,boats,watercraft by general type
116,hospital ship,naval support vessel,exact match,hospital ships naval auxiliary ships naval shi...,naval auxiliary ships
120,hospital ship,passenger vessel,exact match,hospital ships naval auxiliary ships naval shi...,naval auxiliary ships
125,motor yacht,yacht,broad match,yachts,competition and recreation craft
190,motor yacht,yacht,broad match,yachts,competition and recreation craft
124,requisitioned steam yacht,yacht,broad match,yachts,competition and recreation craft
126,requisitioned steam yacht,yacht,broad match,yachts,competition and recreation craft


In [10]:
sentence = data.text[7]
labels = ['maritime', 'sailing', 'warship', 'others']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')



label: warship 	 similarity: 0.12327991425991058
label: sailing 	 similarity: 0.10645808279514313
label: maritime 	 similarity: 0.06913551688194275
label: others 	 similarity: -0.1429973542690277


In [52]:
len(sentence)

1639

In [61]:
i=0
j=0
for d in data.text:
    if len(sentence) > 1600:
        print(d)
        print(zsl(d))
        j+=1
    i+=1
    if j > 10:
        break

The Caerphilly Palace opened in 1904 initially as a skating rink. The building was sold in 1910 to George H. Pitt who converted the Palace into Pitts Picture Palace with seating for 1,500 people. The conversion work was carried out by MrW. Branch of Abersychan with electric installations by Messr Crossley of Cardiff.The Palace used to run variety turns in between films and proved successful. The business changed ownership several times over the next few years and the showing of films and plays ceased around the time talkies emerged withthe building then becomingunused for many years. A local company, Ryans, used the building to store vehicles for a while before it being demolished in 1976.Meilyr Powel, RCAHMW, November 2020Source:Entertaining South Wales - C, overthefootlights.co.uk, pp.3-4




label: warship 	 similarity: 0.06033662334084511
label: maritime 	 similarity: 0.022682547569274902
label: sailing 	 similarity: 0.0035895584151148796
label: others 	 similarity: -0.008146878331899643
None
From Entertaining South Wales:The independent Plaza Cinema opened in North Road on March 12th 1928 with 1,500 seats. The opening film was Norma Talmadge in the (silent) version of &ldquoCamille&rdquo. The film was accompanied by a large in-house orchestra - a standard feature of many larger cinemas, though this was something that would mostly disappear due to manpower shortages during the Great War. Over the course of its more than fifty-year existence it was taken over by the Cardiff Cinema Circuit and then the Jackson Withers Circuit, and ultimately bought by the Rank Organisation. The cinema closed on October 17th 1981 with a horror double-bill of &ldquoAlien&rdquo and &ldquoThe Fog&rdquo. The building remained unused for several years before it was finally demolished and replaced

RuntimeError: The expanded size of the tensor (736) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [5, 736].  Tensor sizes: [1, 512]

In [62]:
sentence="The church of St Meilig retains some 13th-14th century fabric in the tower, but was largely rebuilt by W.J. and A.H. Worthington of London in 1853-55 at the cost of &pound943 using mixed local stones, with Bath stone dressings.The nave has a south porch, the chancel has a vestry on the north side, and a west tower. The porch is in early Decorated style.The tower is of 3 stages, the top stage rebuilt in the 19th century. Two phases of early slit windows, all now blocked, and a crenellated parapet.The nave of 4 bays has an impressive open roof on orch-braced collar beam trusses springing from wall corbels. The chancel has an open trussed rafter roof with scissor braces and a deep oak cornice pierced with trefoils. The floor is paved with tiles.The limestone pulpit is octagonal, approached by 6 steps with carved oak handrail on twisted iron supports. The 19th century font is an octagonal bowl carved with ballflowers, fleurons and chequer panels, all raised on four clustered columns. The earlier, 13th century font is a simple round bowl with a horizontal flat central cordon, now bound with iron, on a tapered base. The organ of 1880, was restored by Henry James, London. At the west end is an important 11th century cross slab said originally to have stood at Croes feilig and set up in the churchyard in the 12th century (see NPRN 96531).There is an east window in memorial to Henry Beavan, the family who brought the church restoration to fruition. On the south side, a window with Christ with children, dedicated to William Elmslie, &dagger c.1853 in China Christ walking on waters, commemorating Captain R.Collinsons survival of the arctic expedition of the ship Enterprise. On the north side a window in memorial to Octavia Ramsey of Maesllwch Castle, &dagger1850.There are various wall monuments, including many from the previous church, reset after the rebuilding. In chancel, limestone Gothic aedicule, c.1870 to Hugh Beavan of Brynrhydd House, &dagger1837 white marble sarcophagus relief on black ground, by I.E.Thomas of London, to John Pugh of Porthgoley, &dagger1824 draped casket over white marble tablet set against veined marble, also by I.E.Thomas, to Ann Pugh of Porthgoley, &dagger1846 Gothic surround to marble tablet to John Pugh of Gare (Gaer), &dagger1788 white marble on grey, to Ann Gunter and William of Abergavenny &dagger1805 and 1808 (descendants of Sir Peter Gunter of Tregunter). There is also a Great War tablet. Against the west wall, a canvas Royal Arms of George III.The dedication to Meilig indicates an early origin. Meilig, abbott and confessor, was the son of Caw of Pictland and brother to the monk Gildas. He was born on Clydeside c. 650 and is mentioned in the C10 Culhwch and Olwen story, and in the Book of Llan Daf"

In [30]:
def zsl(sentence):
    labels = ['marine', 'sail', 'battleship', 'others']

    inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                        return_tensors='pt',
                                        pad_to_max_length=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    output = model(input_ids, attention_mask=attention_mask)[0]
    sentence_rep = output[:1].mean(dim=1)
    label_reps = output[1:].mean(dim=1)

    similarities = F.cosine_similarity(sentence_rep, label_reps)
    closest = similarities.argsort(descending=True)
    for ind in closest:
        print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

In [31]:
sentence = "The church was largely rebuilt in 1853-55 by W.J. and A.H. Worthington of London. The nave of 4 bays has an impressive open roof on orch-braced collar beam trusses. The chancel has an open trussed rafter roof with scissor braces and a deep oak cornice pierced with trefoils. The limestone pulpit is octagonal, approached by 6 steps with carved oak handrail."
zsl(sentence)

label: battleship 	 similarity: 0.19067125022411346
label: sail 	 similarity: 0.11386388540267944
label: marine 	 similarity: 0.06501930952072144
label: others 	 similarity: 0.05681919679045677


In [21]:
sentence = data.text[7]
print(sentence)
zsl(sentence)

Event and Historical Information:The POLTALLOCH was a steel four-masted bark of 2,138 net tons and measuring 86.7 x 12.8 x 7.4m. The vessel had four masts, rigged with royal sails above double top and topgallant sails, and was built in 1893 at Belfast by Workman, Clark &amp Co. Ltd. At time of loss, the POLTALLOCH was owned by Eschen &amp Minor of the US and was on voyage from Caleta Buena in Chile to Leith via Queenstown with a cargo of nitrate soda. On 2 January 1916 the vessel ran aground on St. Patricks Causeway, 13 miles north west of Barmouth. The captain and five of the crew of twenty-five landed in their boat at Harlech the following morning. A Liverpool Salvage Association cable stated on 5 January that the POLTALLOCH had a very heavy port list, and the cargo was melting. Due to such a heavy list, tugs did not attempt to tow the vessel, and by 10 January the ship had turned over on her broadside and became a total loss. Five Germans were among the crew of the POLTALLOCH who we



label: marine 	 similarity: 0.09208926558494568
label: sail 	 similarity: -0.006853485479950905
label: war 	 similarity: -0.04180356115102768
label: others 	 similarity: -0.1429973542690277
