In [84]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """London's largest industry remains finance, it is the largest financial exporter
in the world which makes a significant contribution to the UK's balance of payments."""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['London', "'s", 'largest', 'industry', 'remains', 'finance', ',', 'it', 'is', 'the', 'largest', 'financial', 'exporter', '\n', 'in', 'the', 'world', 'which', 'makes', 'a', 'significant', 'contribution', 'to', 'the', 'UK', "'s", 'balance', 'of', 'payments', '.']


In [85]:
# sentence tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """London's largest industry remains finance, it is the largest financial exporter
in the world which makes a significant contribution to the UK's balance of payments."""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["London's largest industry remains finance, it is the largest financial exporter\nin the world which makes a significant contribution to the UK's balance of payments."]


In [86]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First twenty stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First twenty stop words: ['herein', 'while', 'him', 'go', 'hence', 'myself', 'nobody', 'should', 'n‘t', 'of', 'show', 'whereby', 'he', 'becomes', 'sometimes', 'say', 'fifty', 'even', 'see', 'itself']


In [87]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [London, largest, industry, remains, finance, ,, largest, financial, exporter, 
, world, makes, significant, contribution, UK, balance, payments, .]


In [89]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"""London's largest industry remains finance, it is the largest financial exporter
           in the world which makes a significant contribution to the UK's balance of payments.""")

for word in docs:
    print(word.text,word.pos_)

London PROPN
's PART
largest ADJ
industry NOUN
remains VERB
finance NOUN
, PUNCT
it PRON
is VERB
the DET
largest ADJ
financial ADJ
exporter NOUN

            SPACE
in ADP
the DET
world NOUN
which DET
makes VERB
a DET
significant ADJ
contribution NOUN
to ADP
the DET
UK PROPN
's PART
balance NOUN
of ADP
payments NOUN
. PUNCT


In [90]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

nytimes= nlp(u"""London's largest industry remains finance, it is the largest financial exporter in the world which makes a significant contribution to the UK's balance of payments.
In the 2017 Global Financial Centres Index, London was ranked as having the most competitive financial center in the world. 
However, in the 2018 ranking, London has lost that title to New York City. 
It ranks alongside cities such as, Singapore, Hong Kong, Tokyo, San Francisco, Chicago, Sydney, Boston, and Toronto. 
The City of London is home to exchanges, banks, brokers, investment managers, pension funds, hedge funds, private equity firms, insurance companies and reinsurance markets. 
London is notable as a centre of international finance where foreign participants in financial markets come to deal with one another.[2][39] 
It is also home to the Bank of England, the second oldest central bank in the world, and the European Banking Authority, although the latter is moving to Paris in March 2019 following the Brexit referendum of 2016.
Other key institutions are Lloyd's of London for insurance, the Baltic Exchange for shipping.
A second financial district has developed at Canary Wharf to the east of the City, which includes the global headquarters of two of the world's largest banks, HSBC and Barclays, the rest-of-the-world headquarters of Citigroup and the headquarters of the global news service Reuters.
London handled 36.7% of global currency transactions in 2009 – an average daily turnover of US$1.85 trillion – with more US dollars traded in London than New York, and more Euros traded than in every other city in Europe combined. 
London is the leading centre for international bank lending, derivatives markets, money markets, international insurance,[52] trading in gold, silver and base metals through the London bullion market and London Metal Exchange,[53] and issuance of international debt securities.[54][55][56]
Financial services in London benefited from the UK's membership of the European Union,[57] although this may end following the decision of the United Kingdom to leave the European Union. 
The position of London as a financial centre may be further enhanced by a free trade agreement between the UK and the USA.[58]
The combination of lax regulation and London's financial institutions providing sophisticated methods to launder proceeds from criminal activity around the world, including those from drug trade, makes the City a global hub for illicit finance and London a safe haven for the world's malfeasants, according to research papers and reports published in the mid-2010s.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(London, 'GPE', 384),
 (UK, 'GPE', 384),
 (Global Financial Centres Index, 'ORG', 383),
 (London, 'GPE', 384),
 (2018, 'DATE', 391),
 (London, 'GPE', 384),
 (New York City, 'GPE', 384),
 (Singapore, 'GPE', 384),
 (Hong Kong, 'GPE', 384),
 (Tokyo, 'GPE', 384),
 (San Francisco, 'GPE', 384),
 (Chicago, 'GPE', 384),
 (Sydney, 'GPE', 384),
 (Boston, 'GPE', 384),
 (Toronto, 'GPE', 384),
 (The City of London, 'GPE', 384),
 (London, 'GPE', 384),
 (the Bank of England, 'ORG', 383),
 (second, 'ORDINAL', 396),
 (the European Banking Authority, 'ORG', 383),
 (Paris, 'GPE', 384),
 (March 2019, 'DATE', 391),
 (Brexit, 'GPE', 384),
 (2016, 'DATE', 391),
 (Lloyd's of London, 'ORG', 383),
 (the Baltic Exchange, 'ORG', 383),
 (second, 'ORDINAL', 396),
 (Canary Wharf, 'ORG', 383),
 (the City, 'GPE', 384),
 (two, 'CARDINAL', 397),
 (HSBC, 'ORG', 383),
 (Barclays, 'GPE', 384),
 (Citigroup, 'ORG', 383),
 (Reuters, 'ORG', 383),
 (London, 'GPE', 384),
 (36.7%, 'PERCENT', 393),
 (2009, 'DATE', 391),
 (daily, 

In [91]:
displacy.render(nytimes, style = "ent",jupyter = True)

In [95]:
import spacy

In [96]:
en = spacy.load(r"C:\Users\SOUMEL ZUTSHI\Anaconda3\envs\tensorflow\Lib\site-packages\en_core_web_lg\en_core_web_lg-2.1.0")

In [97]:
sents = en(open(r'C:\Users\SOUMEL ZUTSHI\Desktop\Article.txt').read())
people = [ee for ee in sents.ents if ee.label_ == 'PERSON']
places = [ee for ee in sents.ents if ee.label_ == 'GPE']
organisation = [ee for ee in sents.ents if ee.label_ == 'ORG']

In [98]:
people

[]

In [99]:
places

[London,
 UK,
 London,
 London,
 New York City,
 Singapore,
 Hong Kong,
 Tokyo,
 San Francisco,
 Chicago,
 Sydney,
 Boston,
 Toronto,
 The City of London,
 Paris,
 Brexit,
 US,
 London,
 New York,
 London,
 London,
 UK,
 the United Kingdom,
 London,
 UK,
 USA,
 London,
 London]

In [100]:
organisation

[the Bank of England,
 the European Banking Authority,
 Lloyd's of London,
 the Baltic Exchange,
 HSBC,
 Barclays,
 Citigroup,
 Reuters,
 London Metal Exchange,
 the European Union,
 the European Union]

In [102]:
output_file = open(r'C:\Users\SOUMEL ZUTSHI\Desktop\Output.txt','w+')

In [103]:
output_file.write("PEOPLE: ")
output_file.write(str(people))
output_file.write('\n')

1

In [104]:
output_file.write("PLACES: ")
output_file.write(str(places))
output_file.write('\n')

1

In [105]:
output_file.write("ORGANISATIONS: ")
output_file.write(str(organisation))
output_file.write('\n')

1

In [106]:
output_file.close()