In [1]:
import spacy

ModuleNotFoundError: No module named 'spacy'

In [5]:
nlp=spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x2a9330ce508>

In [6]:
# Parse text through the `nlp` model
my_text = """The economic situation of the country is on edge , as the stock 
market crashed causing loss of millions. Citizens who had their main investment 
in the share-market are facing a great loss. Many companies might lay off 
thousands of people to reduce labor cost"""

#The Doc Object
my_doc = nlp(my_text)
type(my_doc)

spacy.tokens.doc.Doc

In [None]:
# Tokenization with spaCy
# Printing the tokens of a doc
for token in my_doc:
  print(token.text)

In [None]:
#Text-Preprocessing with spaCy
for token in my_doc:
  print(token.text,'--',token.is_stop,'---',token.is_punct)

In [None]:
# Removing StopWords and punctuations
my_doc_cleaned = [token for token in my_doc if not token.is_stop and not token.is_punct]

for token in my_doc_cleaned:
  print(token.text)

In [9]:
# Lemmatizing the tokens of a doc
text='she played chess against rita she likes playing chess.'
doc=nlp(text)
for token in doc:
  print(token.lemma_)

she
play
chess
against
rita
she
like
play
chess
.


In [10]:
# Strings to Hashes and Back
doc = nlp("I love traveling")

# Look up the hash for the word "traveling"
word_hash = nlp.vocab.strings["traveling"]
print(word_hash)

# Look up the word_hash to get the string
word_string = nlp.vocab.strings[word_hash]
print(word_string)

4386335507830398018
traveling


In [11]:
#Interestingly, a word will have the same hash value irrespective of which document it occurs in or which spaCy model 
#is being used.
# Create two different doc with a common word
doc1 = nlp('Raymond shirts are famous')
doc2 = nlp('I washed my shirts ')

# Printing the hash value for each token in the doc

print('-------DOC 1-------')
for token in doc1:
  hash_value=nlp.vocab.strings[token.text]
  print(token.text ,' ',hash_value)

print('-------DOC 2-------')
for token in doc2:
  hash_value=nlp.vocab.strings[token.text]
  print(token.text ,' ',hash_value)

-------DOC 1-------
Raymond   5945540083247941101
shirts   9181315343169869855
are   5012629990875267006
famous   17809293829314912000
-------DOC 2-------
I   4690420944186131903
washed   5520327350569975027
my   227504873216781231
shirts   9181315343169869855


In [12]:
#Lexical attributes of spaCy
# Printing the tokens which are like numbers
text=' 2020 is far worse than 2009'
doc=nlp(text)
for token in doc:
  if token.like_num:
    print(token)

2020
2009


In [13]:
production_text=' Production in chennai is 87 %. In Kolkata, produce it as low as 43 %. In Bangalore, production ia as good as 98 %.In mysore, production is average around 78 %'

In [14]:
# Finding the tokens which are numbers followed by % 

production_doc=nlp(production_text)

for token in production_doc:
  if token.like_num:
    index_of_next_token=token.i+ 1
    next_token=production_doc[index_of_next_token]
    if next_token.text == '%':
      print(token.text)

87
43
98
78


In [15]:
#Detecting Email Addresses
# text containing employee details
employee_text=""" name : Koushiki age: 45 email : koushiki@gmail.com
                 name : Gayathri age: 34 email: gayathri1999@gmail.com
                 name : Ardra age: 60 email : ardra@gmail.com
                 name : pratham parmar age: 15 email : parmar15@yahoo.com
                 name : Shashank age: 54 email: shank@rediffmail.com
                 name : Utkarsh age: 46 email :utkarsh@gmail.com"""

# creating a spacy doc          
employee_doc=nlp(employee_text)

# Printing the tokens which are email through `like_email` attribute
for token in employee_doc:
  if token.like_email:
    print(token.text)

koushiki@gmail.com
gayathri1999@gmail.com
ardra@gmail.com
parmar15@yahoo.com
shank@rediffmail.com
utkarsh@gmail.com


In [None]:
# token.is_alpha : Returns True if the token is an alphabet
# token.is_ascii : Returns True if the token belongs to ascii characters
# token.is_digit : Returns True if the token is a number(0-9)
# token.is_upper : Returns True if the token is upper case alphabet
# token.is_lower : Returns True if the token is lower case alphabet
# token.is_space : Returns True if the token is a space ‘ ‘
# token.is_bracket : Returns True if the token is a bracket
# token.is_quote : Returns True if the token is a quotation mark
# token.like_url : Returns True if the token is similar to a URl (link to website)

In [16]:
#Part of Speech analysis with spaCy
# POS tagging using spaCy
my_text='John plays basketball,if time permits. He played in high school too.'
my_doc=nlp(my_text)
for token in my_doc:
  print(token.text,'---- ',token.pos_)

John ----  PROPN
plays ----  VERB
basketball ----  NOUN
, ----  PUNCT
if ----  SCONJ
time ----  NOUN
permits ----  VERB
. ----  PUNCT
He ----  PRON
played ----  VERB
in ----  ADP
high ----  ADJ
school ----  NOUN
too ----  ADV
. ----  PUNCT


In [17]:
spacy.explain('SCONJ')

'subordinating conjunction'

In [18]:
# How POS tagging helps you in dealing with text based problems.
# Raw text document
raw_text="""I liked the movies etc The movie had good direction  The movie was amazing i.e.
            The movie was average direction was not bad The cinematography was nice. i.e.
            The movie was a bit lengthy  otherwise fantastic  etc etc"""

# Creating a spacy object
raw_doc=nlp(raw_text)

# Checking if POS tag is X and printing them
print('The junk values are..')
for token in raw_doc:
  if token.pos_=='X':
    print(token.text)

print('After removing junk')
# Removing the tokens whose POS tag is junk.
clean_doc=[token for token in raw_doc if not token.pos_=='X']
print(clean_doc)

The junk values are..
etc
i.e.
i.e.
etc
etc
After removing junk
[I, liked, the, movies, The, movie, had, good, direction,  , The, movie, was, amazing, 
            , The, movie, was, average, direction, was, not, bad, The, cinematography, was, nice, ., 
            , The, movie, was, a, bit, lengthy,  , otherwise, fantastic,  ]


In [19]:
# creating a dictionary with parts of speeach &amp; corresponding token numbers.

all_tags = {token.pos: token.pos_ for token in raw_doc}
print(all_tags)

{95: 'PRON', 100: 'VERB', 90: 'DET', 92: 'NOUN', 101: 'X', 84: 'ADJ', 103: 'SPACE', 87: 'AUX', 94: 'PART', 97: 'PUNCT', 86: 'ADV'}


In [20]:
# Importing displacy
from spacy import displacy
my_text='She never like playing , reading was her hobby'
my_doc=nlp(my_text)

# displaying tokens with their POS tags
displacy.render(my_doc,style='dep',jupyter=True)

In [21]:
# Named Entity Recognition
# Preparing the spaCy document
text='Tony Stark owns the company StarkEnterprises . Emily Clark works at Microsoft and lives in Manchester. She loves to read the Bible and learn French'
doc=nlp(text)

# Printing the named entities
print(doc.ents)

(Tony Stark, Clark, Microsoft, Bible, French)


In [None]:
# PERSON : Denotes names of people
# GPE : Denotes places like counties, cities, states.
# ORG : Denotes organizations or companies
# WORK_OF_ART : Denotes titles of books, fimls,songs and other arts
# PRODUCT : Denotes products such as vehicles, food items ,furniture and so on.
# EVENT : Denotes historical events like wars, disasters ,etc…
# LANGUAGE : All the recognized languages across the globe.

In [22]:
# Printing labels of entities.
for entity in doc.ents:
  print(entity.text,'--- ',entity.label_)

Tony Stark ---  PERSON
Clark ---  PERSON
Microsoft ---  ORG
Bible ---  WORK_OF_ART
French ---  LANGUAGE


In [23]:
# Using displacy for visualizing NER
from spacy import displacy
displacy.render(doc,style='ent',jupyter=True)

In [24]:
# NER Application 1: Extracting brand names with Named Entity Recognition
mobile_industry_article=""" 30 Major mobile phone brands Compete in India – A Case Study of Success and Failures
Is the Indian mobile market a terrible War Zone? We have more than 30 brands competing with each other. 
Let’s find out some insights about the world second-largest mobile bazaar.
There is a massive invasion by Chinese mobile brands in India in the last four years. 
Some of the brands have been able to make a mark while others like Meizu, Coolpad, ZTE, and LeEco are a failure.
On one side, there are brands like Sony or HTC that have quit from the Indian market on the other side we have new brands like Realme or iQOO entering the marketing in recent months.
The mobile market is so competitive that some of the brands like Micromax, which had over 18% share back in 2014, now have less than 5%. 
Even the market leader Samsung with a 34% market share in 2014, now has a 21% share whereas Xiaomi has become a market leader. 
The battle is fierce and to sustain and scale-up is going to be very difficult for any new entrant.
new comers in Indian Mobile MarketiQOO –They have recently (March 2020) launched the iQOO 3 in India with its first 5G phone – iQOO 3. 
The new brand is part of the Vivo or the BBK electronics group that also owns several other brands like Oppo, Oneplus and Realme.Realme – Realme launched the first-ever phone – Realme 1 in November 2018 and has quickly became a popular brand in India. The brand is one of the highest sellers in online space and even reached a 16% market share threatening Xiaomi’s dominance.iVoomi – In 2017, we have seen the entry of some new Chinese mobile brands likeiVoomi which focuses on the sub 10k price range, and is a popular online player. They have an association with Flipkart.Techno &amp; Infinix – Transsion Group’s Tecno and Infinix brands debuted in India in mid-2017 and are focusing on the low end and mid-range phones in the price range of Rs. 5000 to Rs. 12000.10.OR &amp; Lephone – 10.OR has a partnership with Amazon India and is an exclusive online brand with phones like 10.OR D, G and E. However, the brand is not very aggressive currently.Kult – Kult is another player who launched a very aggressively priced Kult Beyond mobile in 2017 and followed up by launching 2-3 more models.However, most of these new brands are finding it difficult to strengthen their footing in India. As big brands like Xiaomi leave no stone unturned to make things difficult.Also, it is worth noting that there is less Chinese players coming to India now. As either all the big brands have already set shop or burnt their hands and retreated to the homeland China.Chinese/ Global  Brands Which failed or are at the Verge of Failing in India?
There are a lot more failures in the market than the success stories. 
Let’s first look at the failures and then we will also discuss why some brands were able to succeed in India.
HTC – The biggest surprise this year for me was the failure of HTC in India. 
The brand has been in the country for many years, in fact, they were the first brand to launch Android mobiles. 
Finally HTC decided to call it a day in July 2018.LeEco – LeEco looked promising and even threatening to Xiaomi when it came to India. 
The company launched a series of new phones and smart TVs at affordable rates. Unfortunately, poor financial planning back home caused the brand to fail in India too.
LG – The company seems to have lost focus and are doing poorly in all segments. 
While the budget and mid-range offering are uncompetitive, the high-end models are not preferred by buyers.
Sony – Absurd pricing and lack of ability to understand the Indian buyers have caused Sony to shrink mobile operations in India. 
In the last 2 years, there are far fewer launches and hardly any promotions or hype around the new products.
Meizu – Meizu is also a struggling brand in India and is going nowhere with the current strategy. 
There are hardly any popular mobiles nor a retail presence.
ZTE – The company was aggressive till last year with several new phones launching under the Nubia banner, but with recent issues in the US, they have even lost the plot in India.
Coolpad – I still remember the first meeting with Coolpad CEO in Mumbai when the brand started operations. There were big dreams and ambitions, but the company has not been able to deliver and keep up with the rivals in the last 1 year.Gionee – Gionee was doing well in the retail, but the infighting in the company and loss of focus from the Chinese parent company has made it a failure. The company is planning a comeback. However, we will have to wait and see when that happens."""

In [25]:
# creating spacy doc
mobile_doc=nlp(mobile_industry_article)

# List to store name of mobile companies
list_of_org=[]

# Appending entities which havel the label 'ORG' to the list
for entity in mobile_doc.ents:
  if entity.label_=='ORG':
    list_of_org.append(entity.text)

print(list_of_org)

['ZTE', 'Sony', 'Realme', 'Micromax', 'BBK', 'Oppo, Oneplus and Realme', 'Realme', 'Realme', 'Xiaomi’s', 'Flipkart.', 'Infinix – Transsion Group’s', 'Infinix', 'Lephone', 'Amazon India', 'Xiaomi', 'the Verge of Failing', 'Android', 'Sony', 'Sony', 'ZTE']


In [26]:
# NER Application 2: Automatically Masking Entities
news_text="""Indian man has allegedly duped nearly 50 businessmen in the UAE of USD 1.6 million and fled the country in the most unlikely way -- on a repatriation flight to Hyderabad, according to a media report on Saturday.Yogesh Ashok Yariava, the prime accused in the fraud, flew from Abu Dhabi to Hyderabad on a Vande Bharat repatriation flight on May 11 with around 170 evacuees, the Gulf News reported.Yariava, the 36-year-old owner of the fraudulent Royal Luck Foodstuff Trading, made bulk purchases worth 6 million dirhams (USD 1.6 million) against post-dated cheques from unsuspecting traders before fleeing to India, the daily said.
The bought goods included facemasks, hand sanitisers, medical gloves (worth nearly 5,00,000 dirhams), rice and nuts (3,93,000 dirhams), tuna, pistachios and saffron (3,00,725 dirhams), French fries and mozzarella cheese (2,29,000 dirhams), frozen Indian beef (2,07,000 dirhams) and halwa and tahina (52,812 dirhams).
The list of items and defrauded persons keeps getting longer as more and more victims come forward, the report said.
The aggrieved traders have filed a case with the Bur Dubai police station.
The traders said when the dud cheques started bouncing they rushed to the Royal Luck's office in Dubai but the shutters were down, even the fraudulent company's warehouses were empty."""

news_doc=nlp(news_text)

In [34]:
# Function to identify  if tokens are named entities and replace them with UNKNOWN
def remove_details(word):
  if word.ent_type_ =='PERSON' or word.ent_type_=='ORG' or word.ent_type_=='GPE':
    return ' UNKNOWN '
  return str(word)


# Function where each token of spacy doc is passed through remove_deatils()
def update_article(doc):
  # iterrating through all entities
  with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
  # Passing each token through remove_details() function.
  tokens = map(remove_details,doc)
  return ' '.join(tokens)

# Passing our news_doc to the function update_article()
update_article(news_doc)

"Indian man has allegedly duped nearly 50 businessmen in the UAE of USD 1.6 million and fled the country in the most unlikely way -- on a repatriation flight to  UNKNOWN  , according to a media report on Saturday .  UNKNOWN  , the prime accused in the fraud , flew from  UNKNOWN   UNKNOWN  repatriation flight on May 11 with around 170 evacuees ,  UNKNOWN  reported .  UNKNOWN  , the 36-year-old owner of the fraudulent  UNKNOWN  , made bulk purchases worth 6 million dirhams ( USD 1.6 million ) against post - dated cheques from unsuspecting traders before fleeing to  UNKNOWN  , the daily said . \n The bought goods included facemasks , hand sanitisers , medical gloves ( worth nearly 5,00,000 dirhams ) , rice and nuts ( 3,93,000 dirhams ) , tuna , pistachios and saffron ( 3,00,725 dirhams ) , French fries and mozzarella cheese ( 2,29,000 dirhams ) , frozen Indian beef ( 2,07,000 dirhams ) and halwa and tahina ( 52,812 dirhams ) . \n The list of items and defrauded persons keeps getting longe

In [None]:
#Rule based Matching
# There will be situations like these, where you’ll need extract specific pattern type phrases from the text. 
# This is called Rule-based matching.
# 3 Types
# Token Matcher
# Phrase Matcher
# Entity Ruler

In [None]:
# #Token Matcher
# The procedure to implement a token matcher is:

# Initialize a Matcher object
# Define the pattern you want to match
# Add the pattern to the matcher
# Pass the text to the matcher to extract the matching positions.

In [36]:
from spacy.matcher import Matcher 
# Initializing the matcher with vocab
matcher = Matcher(nlp.vocab)
matcher

<spacy.matcher.matcher.Matcher at 0x2a9388d3dc8>

In [37]:
# Define the matching pattern
my_pattern=[{"LOWER": "version"}, {"IS_PUNCT": True}, {"LIKE_NUM": True}]

In [44]:
# Define the token matcher
matcher.add('VersionFinder', [my_pattern])

In [45]:
# Run the Token Matcher
my_text = 'The version : 6 of the app was released about a year back and was not very sucessful. As a comeback, six months ago, version : 7 was released and it took the stage. After that , the app has has the limelight till now. On interviewing some sources, we get to know that they have outlined visiond till version : 12 ,the Ultimate.'
my_doc = nlp(my_text)

desired_matches = matcher(my_doc)
desired_matches

[(6950581368505071052, 1, 4),
 (6950581368505071052, 27, 30),
 (6950581368505071052, 65, 68)]

In [46]:
# Extract the matches
for match_id, start, end in desired_matches :
    string_id = nlp.vocab.strings[match_id] 
    span = my_doc[start:end] 
    print(span.text)

version : 6
version : 7
version : 12


In [48]:
# Example 2
text = """I visited Manali last time. Around same budget trips ? "
    I was visiting Ladakh this summer "
    I have planned visiting NewYork and other abroad places for next year"
    Have you ever visited Kodaikanal? """

doc = nlp(text)

# Initialize the matcher
matcher = Matcher(nlp.vocab)

# Write a pattern that matches a form of "visit" + place
my_pattern = [{"LEMMA": "visit"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("Visting_places", [my_pattern])
matches = matcher(doc)

# Counting the no of matches
print(" matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

 matches found: 4
Match found: visited Manali
Match found: visiting Ladakh
Match found: visiting NewYork
Match found: visited Kodaikanal


In [50]:
# Example 3
engineering_text = """If you study aeronautical engineering, you could specialize in aerodynamics, aeroelasticity, 
composites analysis, avionics, propulsion and structures and materials. If you choose to study chemical engineering, you may like to
specialize in chemical reaction engineering, plant design, process engineering, process design or transport phenomena. Civil engineering is the professional practice of designing and developing infrastructure projects. This can be on a huge scale, such as the development of
nationwide transport systems or water supply networks, or on a smaller scale, such as the development of single roads or buildings.
specializations of civil engineering include structural engineering, architectural engineering, transportation engineering, geotechnical engineering,
environmental engineering and hydraulic engineering. Computer engineering concerns the design and prototyping of computing hardware and software. 
This subject merges electrical engineering with computer science, oldest and broadest types of engineering, mechanical engineering is concerned with the design,
manufacturing and maintenance of mechanical systems. You’ll study statics and dynamics, thermodynamics, fluid dynamics, stress analysis, mechanical design and
technical drawing"""

doc = nlp(engineering_text)

In [51]:
# Initializing the matcher
matcher = Matcher(nlp.vocab)

# Write a pattern that matches a form of "noun/adjective"+"engineering"
my_pattern = [{"POS": {"IN": ["NOUN", "ADJ"]}}, {"LOWER": "engineering"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("identify_courses", [my_pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the matching text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 15
Match found: aeronautical engineering
Match found: chemical engineering
Match found: reaction engineering
Match found: process engineering
Match found: Civil engineering
Match found: civil engineering
Match found: structural engineering
Match found: architectural engineering
Match found: transportation engineering
Match found: geotechnical engineering
Match found: environmental engineering
Match found: hydraulic engineering
Match found: Computer engineering
Match found: electrical engineering
Match found: mechanical engineering


In [52]:
# Phrase Matcher
# The procedure to use PhraseMatcher is very similar to Matcher.

# Initialize a PhraseMatcher object with a vocab.
# Define the terms you want to match
# Add the pattern to the matcher
# Run the text through the matcher to extract the matching positions.

In [54]:
from spacy.matcher import PhraseMatcher

# PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Terms to match
terms_list = ['Bruce Wayne', 'Tony Stark', 'Batman', 'Harry Potter', 'Severus Snape']

# Make a list of docs
patterns = [nlp.make_doc(text) for text in terms_list]

matcher.add("phrase_matcher", patterns)

In [55]:
# Matcher Object
fictional_char_doc = nlp("""Superman (first appearance: 1938)  Created by Jerry Siegal and Joe Shuster for Action Comics #1 (DC Comics).Mickey Mouse (1928)  Created by Walt Disney and Ub Iworks for Steamboat Willie.Bugs Bunny (1940)  Created by Warner Bros and originally voiced by Mel Blanc.Batman (1939) Created by Bill Finger and Bob Kane for Detective Comics #27 (DC Comics).
Dorothy Gale (1900)  Created by L. Frank Baum for novel The Wonderful Wizard of Oz. Later portrayed by Judy Garland in the 1939 film adaptation.Darth Vader (1977) Created by George Lucas for Star Wars IV: A New Hope.The Tramp (1914)  Created and portrayed by Charlie Chaplin for Kid Auto Races at Venice.Peter Pan (1902)  Created by J.M. Barrie for novel The Little White Bird.
Indiana Jones (1981)  Created by George Lucas for Raiders of the Lost Ark. Portrayed by Harrison Ford.Rocky Balboa (1976)  Created and portrayed by Sylvester Stallone for Rocky.Vito Corleone (1969) Created by Mario Puzo for novel The Godfather. Later portrayed by Marlon Brando and Robert DeNiro in Coppola’s film adaptation.Han Solo (1977) Created by George Lucas for Star Wars IV: A New Hope. 
Portrayed most famously by Harrison Ford.Homer Simpson (1987)  Created by Matt Groening for The Tracey Ullman Show, later The Simpsons as voiced by Dan Castellaneta.Archie Bunker (1971) Created by Norman Lear for All in the Family. Portrayed by Carroll O’Connor.Norman Bates (1959) Created by Robert Bloch for novel Psycho.  Later portrayed by Anthony Perkins in Hitchcock’s film adaptation.King Kong (1933) 
Created by Edgar Wallace and Merian C Cooper for the film King Kong.Lucy Ricardo (1951) Portrayed by Lucille Ball for I Love Lucy.Spiderman (1962)  Created by Stan Lee and Steve Ditko for Amazing Fantasy #15 (Marvel Comics).Barbie (1959)  Created by Ruth Handler for the toy company Mattel Spock (1964)  Created by Gene Roddenberry for Star Trek. Portrayed most famously by Leonard Nimoy.
Godzilla (1954) Created by Tomoyuki Tanaka, Ishiro Honda, and Eiji Tsubaraya for the film Godzilla.The Joker (1940)  Created by Jerry Robinson, Bill Finger, and Bob Kane for Batman #1 (DC Comics)Winnie-the-Pooh (1924)  Created by A.A. Milne for verse book When We Were Young.Popeye (1929)  Created by E.C. Segar for comic strip Thimble Theater (King Features).Tarzan (1912) Created by Edgar Rice Burroughs for the novel Tarzan of the Apes.Forrest Gump (1986)  Created by Winston Groom for novel Forrest Gump.  Later portrayed by Tom Hanks in Zemeckis’ film adaptation.Hannibal Lector (1981)  Created by Thomas Harris for the novel Red Dragon. Portrayed most famously by Anthony Hopkins in the 1991 Jonathan Demme film The Silence of the Lambs.
Big Bird (1969) Created by Jim Henson and portrayed by Carroll Spinney for Sesame Street.Holden Caulfield (1945) Created by J.D. Salinger for the Collier’s story “I’m Crazy.”  Reworked into the novel The Catcher in the Rye in 1951.Tony Montana (1983)  Created by Oliver Stone for film Scarface.  Portrayed by Al Pacino.Tony Soprano (1999)  Created by David Chase for The Sopranos. Portrayed by James Gandolfini.
The Terminator (1984)  Created by James Cameron and Gale Anne Hurd for The Terminator. Portrayed by Arnold Schwarzenegger.Jon Snow (1996)  Created by George RR Martin for the novel The Game of Thrones.  Portrayed by Kit Harrington.Charles Foster Kane (1941)  Created and portrayed by Orson Welles for Citizen Kane.Scarlett O’Hara (1936)  Created by Margaret Mitchell for the novel Gone With the Wind. Portrayed most famously by Vivien Leigh 
for the 1939 Victor Fleming film adaptation.Marty McFly (1985) Created by Robert Zemeckis and Bob Gale for Back to the Future. Portrayed by Michael J. Fox.Rick Blaine (1940)  Created by Murray Burnett and Joan Alison for the unproduced stage play Everybody Comes to Rick’s. Later portrayed by Humphrey Bogart in Michael Curtiz’s film adaptation Casablanca.Man With No Name (1964)  Created by Sergio Leone for A Fistful of Dollars, which was adapted from a ronin character in Kurosawa’s Yojimbo (1961).  Portrayed by Clint Eastwood.Charlie Brown (1948)  Created by Charles M. Shultz for the comic strip L’il Folks; popularized two years later in Peanuts.E.T. (1982)  Created by Melissa Mathison for the film E.T.: the Extra-Terrestrial.Arthur Fonzarelli (1974)  Created by Bob Brunner for the show Happy Days. Portrayed by Henry Winkler.)Phillip Marlowe (1939)  Created by Raymond Chandler for the novel The Big Sleep.Jay Gatsby (1925)  Created by F. Scott Fitzgerald for the novel The Great Gatsby.Lassie (1938) Created by Eric Knight for a Saturday Evening Post story, later turned into the novel Lassie Come-Home in 1940, film adaptation in 1943, and long-running television show in 1954.  Most famously portrayed by the dog Pal.
Fred Flintstone (1959)  Created by William Hanna and Joseph Barbera for The Flintstones. Voiced most notably by Alan Reed. Rooster Cogburn (1968)  Created by Charles Portis for the novel True Grit. Most famously portrayed by John Wayne in the 1969 film adaptation. Atticus Finch (1960)  Created by Harper Lee for the novel To Kill a Mockingbird.  (Appeared in the earlier work Go Set A Watchman, though this was not published until 2015)  Portrayed most famously by Gregory Peck in the Robert Mulligan film adaptation. Kermit the Frog (1955)  Created and performed by Jim Henson for the show Sam and Friends. Later popularized in Sesame Street (1969) and The Muppet Show (1976) George Bailey (1943)  Created by Phillip Van Doren Stern (then as George Pratt) for the short story The Greatest Gift. Later adapted into Capra’s It’s A Wonderful Life, starring James Stewart as the renamed George Bailey. Yoda (1980) Created by George Lucas for The Empire Strikes Back. Sam Malone (1982)  Created by Glen and Les Charles for the show Cheers.  Portrayed by Ted Danson. Zorro (1919)  Created by Johnston McCulley for the All-Story Weekly pulp magazine story The Curse of Capistrano.Later adapted to the Douglas Fairbanks’ film The Mark of Zorro (1920).Moe, Larry, and Curly (1928)  Created by Ted Healy for the vaudeville act Ted Healy and his Stooges. Mary Poppins (1934)  Created by P.L. Travers for the children’s book Mary Poppins. Ron Burgundy (2004)  Created by Will Ferrell and Adam McKay for the film Anchorman: The Legend of Ron Burgundy.  Portrayed by Will Ferrell. Mario (1981)  Created by Shigeru Miyamoto for the video game Donkey Kong. Harry Potter (1997)  Created by J.K. Rowling for the novel Harry Potter and the Philosopher’s Stone. The Dude (1998)  Created by Ethan and Joel Coen for the film The Big Lebowski. Portrayed by Jeff Bridges.
Gandalf (1937)  Created by J.R.R. Tolkien for the novel The Hobbit. The Grinch (1957)  Created by Dr. Seuss for the story How the Grinch Stole Christmas! Willy Wonka (1964)  Created by Roald Dahl for the children’s novel Charlie and the Chocolate Factory. The Hulk (1962)  Created by Stan Lee and Jack Kirby for The Incredible Hulk #1 (Marvel Comics) Scooby-Doo (1969)  Created by Joe Ruby and Ken Spears for the show Scooby-Doo, Where Are You! George Costanza (1989)  Created by Larry David and Jerry Seinfeld for the show Seinfeld.  Portrayed by Jason Alexander.Jules Winfield (1994)  Created by Quentin Tarantino for the film Pulp Fiction. Portrayed by Samuel L. Jackson. John McClane (1988)  Based on the character Detective Joe Leland, who was created by Roderick Thorp for the novel Nothing Lasts Forever. Later adapted into the John McTernan film Die Hard, starring Bruce Willis as McClane. Ellen Ripley (1979)  Created by Don O’cannon and Ronald Shusett for the film Alien.  Portrayed by Sigourney Weaver. Ralph Kramden (1951)  Created and portrayed by Jackie Gleason for “The Honeymooners,” which became its own show in 1955.Edward Scissorhands (1990)  Created by Tim Burton for the film Edward Scissorhands.  Portrayed by Johnny Depp.Eric Cartman (1992)  Created by Trey Parker and Matt Stone for the animated short Jesus vs Frosty.  Later developed into the show South Park, which premiered in 1997.  Voiced by Trey Parker.
Walter White (2008)  Created by Vince Gilligan for Breaking Bad.  Portrayed by Bryan Cranston. Cosmo Kramer (1989)  Created by Larry David and Jerry Seinfeld for Seinfeld.  Portrayed by Michael Richards.Pikachu (1996)  Created by Atsuko Nishida and Ken Sugimori for the Pokemon video game and anime franchise.Michael Scott (2005)  Based on a character from the British series The Office, created by Ricky Gervais and Steven Merchant.  Portrayed by Steve Carell.Freddy Krueger (1984)  Created by Wes Craven for the film A Nightmare on Elm Street. Most famously portrayed by Robert Englund.
Captain America (1941)  Created by Joe Simon and Jack Kirby for Captain America Comics #1 (Marvel Comics)Goku (1984)  Created by Akira Toriyama for the manga series Dragon Ball Z.Bambi (1923)  Created by Felix Salten for the children’s book Bambi, a Life in the Woods. Later adapted into the Disney film Bambi in 1942.Ronald McDonald (1963) Created by Williard Scott for a series of television spots.Waldo/Wally (1987) Created by Martin Hanford for the children’s book Where’s Wally? (Waldo in US edition) Frasier Crane (1984)  Created by Glen and Les Charles for Cheers.  Portrayed by Kelsey Grammar.Omar Little (2002)  Created by David Simon for The Wire.Portrayed by Michael K. Williams.
Wolverine (1974)  Created by Roy Thomas, Len Wein, and John Romita Sr for The Incredible Hulk #180 (Marvel Comics) Jason Voorhees (1980)  Created by Victor Miller for the film Friday the 13th. Betty Boop (1930)  Created by Max Fleischer and the Grim Network for the cartoon Dizzy Dishes. Bilbo Baggins (1937)  Created by J.R.R. Tolkien for the novel The Hobbit. Tom Joad (1939)  Created by John Steinbeck for the novel The Grapes of Wrath. Later adapted into the 1940 John Ford film and portrayed by Henry Fonda.Tony Stark (Iron Man) (1963)  Created by Stan Lee, Larry Lieber, Don Heck and Jack Kirby for Tales of Suspense #39 (Marvel Comics)Porky Pig (1935)  Created by Friz Freleng for the animated short film I Haven’t Got a Hat. Voiced most famously by Mel Blanc.Travis Bickle (1976)  Created by Paul Schrader for the film Taxi Driver. Portrayed by Robert De Niro.
Hawkeye Pierce (1968)  Created by Richard Hooker for the novel MASH: A Novel About Three Army Doctors.  Famously portrayed by both Alan Alda and Donald Sutherland. Don Draper (2007)  Created by Matthew Weiner for the show Mad Men.  Portrayed by Jon Hamm. Cliff Huxtable (1984)  Created and portrayed by Bill Cosby for The Cosby Show. Jack Torrance (1977)  Created by Stephen King for the novel The Shining. Later adapted into the 1980 Stanley Kubrick film and portrayed by Jack Nicholson. Holly Golightly (1958)  Created by Truman Capote for the novella Breakfast at Tiffany’s.  Later adapted into the 1961 Blake Edwards films starring Audrey Hepburn as Holly. Shrek (1990)  Created by William Steig for the children’s book Shrek! Later adapted into the 2001 film starring Mike Myers as the titular character. Optimus Prime (1984)  Created by Dennis O’Neil for the Transformers toy line.Sonic the Hedgehog (1991)  Created by Naoto Ohshima and Yuji Uekawa for the Sega Genesis game of the same name.Harry Callahan (1971)  Created by Harry Julian Fink and R.M. Fink for the movie Dirty Harry.  Portrayed by Clint Eastwood.Bubble: Hercule Poirot, Tyrion Lannister, Ron Swanson, Cercei Lannister, J.R. Ewing, Tyler Durden, Spongebob Squarepants, The Genie from Aladdin, Pac-Man, Axel Foley, Terry Malloy, Patrick Bateman
Pre-20th Century: Santa Claus, Dracula, Robin Hood, Cinderella, Huckleberry Finn, Odysseus, Sherlock Holmes, Romeo and Juliet, Frankenstein, Prince Hamlet, Uncle Sam, Paul Bunyan, Tom Sawyer, Pinocchio, Oliver Twist, Snow White, Don Quixote, Rip Van Winkle, Ebenezer Scrooge, Anna Karenina, Ichabod Crane, John Henry, The Tooth Fairy,
Br’er Rabbit, Long John Silver, The Mad Hatter, Quasimodo """)


character_matches = matcher(fictional_char_doc)

In [56]:
# Matching positions
character_matches

[(520014689628841516, 56, 57),
 (520014689628841516, 449, 450),
 (520014689628841516, 1352, 1354),
 (520014689628841516, 1365, 1367),
 (520014689628841516, 2084, 2086)]

In [57]:
# Matched items
for match_id, start, end in character_matches:
    span = fictional_char_doc[start:end]
    print(span.text)

Batman
Batman
Harry Potter
Harry Potter
Tony Stark


In [58]:
# Using the attr parameter as 'LOWER'
case_insensitive_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Creating doc &amp; pattern
my_doc=nlp('I wish to visit new york city')
terms=['New York']
pattern=[nlp(term) for term in terms]

# adding pattern to the matcher
case_insensitive_matcher.add("matcher",None,*pattern)

# applying matcher to the doc
my_matches=case_insensitive_matcher(my_doc)

for match_id,start,end in my_matches:
  span=my_doc[start:end]
  print(span.text)

new york


In [59]:
my_doc = nlp('From 8 am , Mr.X will be speaking on your favorite chanel 191.1. Afterward there shall be an exclusive interview with actor Vijay on channel 194.1 . Hope you are having a great day. Call us on 666666')

In [63]:
pattern=nlp('154.6')

# Initializing the matcher and adding pattern
pincode_matcher= PhraseMatcher(nlp.vocab,attr="SHAPE")
pincode_matcher.add("pincode_matching", [pattern])

# Applying matcher on doc
matches = pincode_matcher(my_doc)

# Printing the matched phrases
for match_id, start, end in matches:
  span = my_doc[start:end]
  print(span.text)

191.1
194.1


In [75]:
# Entity Ruler
from spacy.pipeline import EntityRuler

# Initialize
ruler = EntityRuler(nlp)

pattern=[{"label": "WORK_OF_ART", "pattern": "My guide to statistics"}]

ruler.add_patterns(pattern)


# Add entity ruler to the NLP pipeline. 
# NLP pipeline is a sequence of NLP tasks that spaCy performs for a given text
nlp.add_pipe(ruler)

# Extract the custom entity type 
doc = nlp(" I recently published my work fanfiction by Dr.X . Right now I'm studying the book of my friend .You should try My guide to statistics for clear concepts.")
print([(ent.text, ent.label_) for ent in doc.ents])

[]


In [77]:
review_1=nlp(' The food was amazing')
review_2=nlp('The food was excellent')
review_3=nlp('I did not like the food')
review_4=nlp('It was very bad experience')

score_1=review_1.similarity(review_2)
print('Similarity between review 1 and 2',score_1)

score_2=review_1.similarity(review_4)
print('Similarity between review 3 and 4',score_2)

Similarity between review 1 and 2 0.8043222830391739
Similarity between review 3 and 4 0.37536336289068906


  
  if __name__ == '__main__':


In [78]:
# Compute Similarity between texts 
pizza=nlp('pizza')
burger=nlp('burger')
chair=nlp('chair')

print('Pizza and burger  ',pizza.similarity(burger))
print('Pizza and chair  ',pizza.similarity(chair))

Pizza and burger   0.6785067815137173
Pizza and chair   0.8884922946704111


  
  import sys
