In [1]:
import spacy
import indic

In [2]:
m = spacy.load('en_core_web_sm')

In [3]:
import nltk

In [4]:
sentence = "The little yellow dog barked at the cat"
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sentence)

In [5]:
tokens

['The', 'little', 'yellow', 'dog', 'barked', 'at', 'the', 'cat']

In [6]:
tags = nltk.pos_tag(tokens,tagset = "universal")
print(tags)

[('The', 'DET'), ('little', 'ADJ'), ('yellow', 'ADJ'), ('dog', 'NOUN'), ('barked', 'VERB'), ('at', 'ADP'), ('the', 'DET'), ('cat', 'NOUN')]


In [7]:
grammar = "NP: {<DET>?<ADJ>*<NOUN>}"

In [8]:
cp = nltk.RegexpParser(grammar)

In [9]:
result = cp.parse(tags)
tags

[('The', 'DET'),
 ('little', 'ADJ'),
 ('yellow', 'ADJ'),
 ('dog', 'NOUN'),
 ('barked', 'VERB'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('cat', 'NOUN')]

In [10]:
result.draw()

In [11]:
print(result)

(S
  (NP The/DET little/ADJ yellow/ADJ dog/NOUN)
  barked/VERB
  at/ADP
  (NP the/DET cat/NOUN))


In [12]:
import pandas as pd
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
passage ="Formula One (more commonly known as Formula 1 or F1) is the highest class of international racing for open-wheel single-seater formula racing cars sanctioned by the Fédération Internationale de l'Automobile (FIA). The FIA Formula One World Championship has been one of the premier forms of racing around the world since its inaugural season in 1950. The word formula in the name refers to the set of rules to which all participants' cars must conform. A Formula One season consists of a series of races, known as Grands Prix. Grands Prix take place in multiple countries and continents around the world on either purpose-built circuits or closed public roads."

sentences = nltk.sent_tokenize(passage)
tagged_sentences = []
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    tagged_sentences.append(tags)

In [14]:
for taggedSentence in tagged_sentences:
    result = cp.parse(taggedSentence)
    print(result)
    print("\n\n")

(S
  Formula/NNP
  One/CD
  (/(
  more/RBR
  commonly/RB
  known/VBN
  as/IN
  Formula/NNP
  1/CD
  or/CC
  F1/NNP
  )/)
  is/VBZ
  the/DT
  highest/JJS
  class/NN
  of/IN
  international/JJ
  racing/NN
  for/IN
  open-wheel/JJ
  single-seater/JJ
  formula/NN
  racing/VBG
  cars/NNS
  sanctioned/VBN
  by/IN
  the/DT
  Fédération/NNP
  Internationale/NNP
  de/FW
  l'Automobile/FW
  (/(
  FIA/NNP
  )/)
  ./.)



(S
  The/DT
  FIA/NNP
  Formula/NNP
  One/NNP
  World/NNP
  Championship/NNP
  has/VBZ
  been/VBN
  one/CD
  of/IN
  the/DT
  premier/JJR
  forms/NNS
  of/IN
  racing/VBG
  around/IN
  the/DT
  world/NN
  since/IN
  its/PRP$
  inaugural/JJ
  season/NN
  in/IN
  1950/CD
  ./.)



(S
  The/DT
  word/NN
  formula/NN
  in/IN
  the/DT
  name/NN
  refers/NNS
  to/TO
  the/DT
  set/NN
  of/IN
  rules/NNS
  to/TO
  which/WDT
  all/DT
  participants/NNS
  '/POS
  cars/NNS
  must/MD
  conform/VB
  ./.)



(S
  A/DT
  Formula/NNP
  One/CD
  season/NN
  consists/VBZ
  of/IN
  a/DT
  series/N