In [1]:
import spacy
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
import pandas as pd

In [2]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = en_core_web_sm.load()

text = """European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices"""
#  "nlp" Object is used to create documents with linguistic annotations.
# doc = nlp(text)
doc = nlp.tokenizer(text)

# Create list of word tokens
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)



['European', 'authorities', 'fined', 'Google', 'a', 'record', '$', '5.1', 'billion', 'on', 'Wednesday', 'for', 'abusing', 'its', 'power', 'in', 'the', 'mobile', 'phone', 'market', 'and', 'ordered', 'the', 'company', 'to', 'alter', 'its', 'practices']


In [3]:
# sentence tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices"""
# create list of sentence tokens
d=nlp(text)
sents_list = []
for sent in d.sents:
    sents_list.append(sent.text)
print(sents_list)

['European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices']


In [4]:
#Stop words
#importing stop words from English language.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['amount', 'who', 'just', 'amongst', 'by', 'though', 'twelve', 'side', 'beyond', 'above', 'bottom', 'itself', 'seems', 'other', 'up', 'another', 'whereupon', 'few', 'now', 'last']


In [5]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

# filtering stop words
for word in d:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [European, authorities, fined, Google, record, $, 5.1, billion, Wednesday, abusing, power, mobile, phone, market, ordered, company, alter, practices]


In [6]:
#part-of-speech taging 
nlp = en_core_web_sm.load()
data = []
for token in d:
    data.append([token.text,token.lemma_ ,token.pos_ ,token.tag_ ,token.dep_,
         token.shape_, token.is_alpha, token.is_stop])
    #text | lemma | pos | tag | dep | shape | alpha | stop 

pd.DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7
0,European,European,,,,Xxxxx,True,False
1,authorities,authorities,,,,xxxx,True,False
2,fined,fined,,,,xxxx,True,False
3,Google,Google,,,,Xxxxx,True,False
4,a,a,,,,x,True,True
5,record,record,,,,xxxx,True,False
6,$,$,,,,$,False,False
7,5.1,5.1,,,,d.d,False,False
8,billion,billion,,,,xxxx,True,False
9,on,on,,,,xx,True,True


In [None]:
#Visualizing the dependency parse
nlp = en_core_web_sm.load()
displacy.render(nlp(doc))   #syntactic dependies

In [8]:
#dependency parsing 
        #noun chunks - "based noun phrases -flat phrases that have a noun as their head"\
data_ = []
for chunk in doc.noun_chunks:
    data_.append([chunk.text,chunk.root.text, chunk.root.dep_,
         chunk.root.head.text])    # text | root text | root dep | root head text 

pd.DataFrame(data_)

ValueError: [E029] noun_chunks requires the dependency parse, which requires a statistical model to be installed and loaded. For more info, see the documentation:
https://spacy.io/usage/models

In [None]:
#Navigating the parse tree

#head  #child  #dep
data__ =[]
for token in doc :
    data__.append([token.text,token.dep_,token.head.text,token.head.pos,
                  [child for child in token.children]])  #text | dep | head text | head pos | children

pd.DataFrame(data__)
    

In [None]:
#Nmaed entity recoginition 
_data_ =[]
for ent in doc.ents:
    _data_.append([ent.text,ent.start_char,ent.end_char,ent.label_])

pd.DataFrame(_data_)


In [None]:
#visualizing named entity
pprint([(X.text, X.label_) for X in doc.ents])

displacy.render(doc,style='ent', jupyter=True)

In [None]:
#token
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

In [None]:
#extracting named entity from an article
from bs4 import BeautifulSoup
import requests
import re
text_file =open("corpus.txt","w",encoding="utf-8")
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
        
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
text_file.write(ny_bb)
article = nlp(ny_bb)
len(article.ents)

In [None]:
labels = [x.label_ for x in article.ents]
Counter(labels)

In [None]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

In [None]:
sentences = [x for x in article.sents]
print(sentences[10])

In [None]:
displacy.render(nlp(str(sentences[10])), jupyter=True, style='ent')

In [None]:
displacy.render(nlp(str(sentences[10])), style='dep', jupyter = True, options = {'distance': 120})

In [None]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[10])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

In [None]:
dict([(str(x), x.label_) for x in nlp(str(sentences[10])).ents])

In [None]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'europe')
print(mango.vector.shape)
print(mango.vector)

In [None]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[10]])

In [None]:
displacy.render(article,jupyter=True,style='ent')

In [None]:
import dash_dangerously_set_inner_html as insert_html
import dash
import dash_core_components as dcc
import dash_html_components as html
import spacy
from dash.dependencies import Input, Output
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
import pandas as pd
nlp = en_core_web_sm.load()

file = open('corpus.txt',encoding="utf-8").read()
doc = nlp(file)


external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)


def Calculate(doc):
    #text | lemma | pos | tag | dep | shape | alpha | stop
    speech_part_tag = [[token.text, token.lemma_, token.pos_, token.tag_,
                        token.dep_, token.shape_, token.is_alpha, token.is_stop] for token in doc]
    nouns = [[chunk.text, chunk.root.text, chunk.root.dep_,
              chunk.root.head.text] for chunk in doc.noun_chunks]
    # text | root text | root dep | root head text
    _entity_ = [[ent.text, ent.start_char, ent.end_char, ent.label_]
                for ent in doc.ents]
    html = displacy.render(doc, style='ent', jupyter=False)
    html1 = displacy.render(doc,style='dep', jupyter=False)
    return html, html1, _entity_, nouns, speech_part_tag


def generate_table(dataframe, max_rows=20):
    return html.Table(
        # Header
        [html.Tr([html.Th(col) for col in dataframe.columns])] +

        # Body
        [html.Tr([
            html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
        ]) for i in range(min(len(dataframe), max_rows))]
    )


Html, Html1, _entity_, nouns, speech_part_tag = Calculate(doc)


app.layout = html.Div(id="display", children=[
    html.H1(children='Feature Extraction'),
    html.Div(children=file),

    dcc.Tabs(id="tabs", value='tab-1', children=[
        dcc.Tab(label='Tab one', value='tab-1'),
        dcc.Tab(label='Tab two', value='tab-2'),
        dcc.Tab(label='Tab three', value='tab-3'),
        dcc.Tab(label='Color Represntion', value='tab-4'),
    ]),
    html.Div(id='tabs-content')

])


@app.callback(Output('tabs-content', 'children'),
              [Input('tabs', 'value')])
def render_content(tab):
    if tab == 'tab-1':
        return generate_table(pd.DataFrame(speech_part_tag, columns='text | lemma | pos | tag | dep | shape | alpha | stop '.split(" | ")))
    elif tab == 'tab-2':
        return html.Div([generate_table(pd.DataFrame(_entity_))])
    elif tab == 'tab-3':
        return html.Div([generate_table(pd.DataFrame(nouns, columns=' text | root text | root dep | root head text'.split(" | ")))])
    elif tab == 'tab-4':
        return html.Div(insert_html.DangerouslySetInnerHTML(Html))

if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)
    #app.run(debug=True, )
