In [20]:
import spacy
from spacy import displacy

In [22]:
# Load the small English spaCy pipeline; later cells parse text through `nlp`.
nlp = spacy.load('en_core_web_sm')

In [85]:
# Parse one sample sentence and draw its dependency tree inline in the notebook.
doc = nlp('The stripline is shrouded by fragments')
displacy.render(doc, jupyter=True, style='dep')

#### Stemming

In [21]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

In [70]:
# Three word families (two Russian, one English) used to compare the stemmers side by side.
words = ["лес", "лесной", "лесник", "лесничий", "лесничество", "пролесье", 
         "flaw", "flaws", "flawed", "flawless", "flawlessness", "flawlessly", "flawness", 
         "окно", "окошко", "подоконник", "оконный", "окнище"]
p_stemmer = PorterStemmer()
# NOTE(review): the Snowball stemmer is built for Russian only, yet `words` also
# holds English entries — confirm the mixed-language comparison is intentional.
s_stemmer = SnowballStemmer("russian")
l_stemmer = LancasterStemmer()
# Print each word followed by the stem produced by every stemmer.
for word in words:
    print(f"{word}\n\nPorter: {p_stemmer.stem(word)}\nSnowball: {s_stemmer.stem(word)}\nLancaster: {l_stemmer.stem(word)}\n")

лес

Porter: лес
Snowball: лес
Lancaster: лес

лесной

Porter: лесной
Snowball: лесн
Lancaster: лесной

лесник

Porter: лесник
Snowball: лесник
Lancaster: лесник

лесничий

Porter: лесничий
Snowball: леснич
Lancaster: лесничий

лесничество

Porter: лесничество
Snowball: лесничеств
Lancaster: лесничество

пролесье

Porter: пролесье
Snowball: пролес
Lancaster: пролесье

flaw

Porter: flaw
Snowball: flaw
Lancaster: flaw

flaws

Porter: flaw
Snowball: flaws
Lancaster: flaw

flawed

Porter: flaw
Snowball: flawed
Lancaster: flaw

flawless

Porter: flawless
Snowball: flawless
Lancaster: flawless

flawlessness

Porter: flawless
Snowball: flawlessness
Lancaster: flawless

flawlessly

Porter: flawlessli
Snowball: flawlessl
Lancaster: flawless

flawness

Porter: flaw
Snowball: flawness
Lancaster: flaw

окно

Porter: окно
Snowball: окн
Lancaster: окно

окошко

Porter: окошко
Snowball: окошк
Lancaster: окошко

подоконник

Porter: подоконник
Snowball: подоконник
Lancaster: подоконник

оконный

Porte

#### Headlines

In [23]:
from pathlib import Path
import regex as re
import spacy
from spacy.tokenizer import Tokenizer

In [205]:
# Infix patterns handed to spacy.util.compile_infix_regex for the custom tokenizer.
# None of them matches a single "-", so a word such as "gluten-free" is not split
# by these rules.
# NOTE(review): the patterns rely on \p{...} properties and the && / || set
# operators inside character classes — presumably evaluated by the `regex`
# package rather than the stdlib `re`; confirm.
infixes = (
 # Two or more consecutive dots.
 '\\.\\.+',
 # The single-character ellipsis.
 '…',
 # Any character in the Unicode "So" (Symbol, other) category.
 '[\\p{So}]',
 # One of + \ * ^ placed directly between two digits.
 # NOTE(review): non-raw string; `\*` is not a Python escape and is kept literally.
 '(?<=[0-9])[+\\\*^](?=[0-9])',
 # A period preceded by a lowercase letter and followed by an uppercase one
 # (Bengali/Hebrew/Arabic letters are accepted on both sides).
 '(?<=[[[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])\\.(?=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])',
 # A comma directly between two letters.
 '(?<=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]]),(?=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])',
 # Optional punctuation followed by a dash variant (–, —, --, ---, ——) or ~ between two letters.
 '(?<=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])[?";:=,.]*(?:–|—|--|---|——|~)(?=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])',
 # One of : < > = / preceded by a letter or a double quote and followed by a letter.
 '(?<=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]"])[:<>=/](?=[[[\\p{Lu}&&\\p{Latin}]||[ЁА-Я]||[\\p{Ll}&&\\p{Latin}]||[ёа-я]||[\\p{L}&&\\p{Bengali}]||[\\p{L}&&\\p{Hebrew}]||[\\p{L}&&\\p{Arabic}]]])')

In [207]:
# Prefix and suffix rules come from the loaded pipeline's defaults; only the
# infix rules are replaced by the custom `infixes` tuple.
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = spacy.util.compile_infix_regex(infixes)  # compiled from the custom `infixes` tuple

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` on ``nlp.vocab`` using the module-level affix regexes.

    Args:
        nlp: Loaded pipeline whose vocabulary the tokenizer shares.

    Returns:
        A ``Tokenizer`` wired to ``prefix_re``, ``suffix_re`` and ``infix_re``.
    """
    affix_rules = {
        "prefix_search": prefix_re.search,
        "suffix_search": suffix_re.search,
        "infix_finditer": infix_re.finditer,
    }
    return Tokenizer(nlp.vocab, **affix_rules)

In [208]:
# Load the pipeline by its full package name, as the other cells do: the bare
# 'en' shortcut depends on a locally created link and is not accepted by
# spaCy v3.
nlp = spacy.load('en_core_web_sm')
# Swap in the tokenizer built from the custom infix rules.
nlp.tokenizer = custom_tokenizer(nlp)

In [191]:
# Coarse POS tags whose tokens format_headline() capitalizes.
CAPS = ["NOUN", "PRON", "PROPN", "ADJ", "ADV", "VERB", "SCONJ"]

In [192]:
def format_headline(text):
    """Return ``text`` re-cased as a headline.

    The text is parsed with the module-level ``nlp`` pipeline. A token is
    capitalized when it contains a hyphen, when its coarse POS tag is listed in
    ``CAPS``, or when it is the first or last token; every other token is
    lower-cased. Each token keeps its trailing whitespace, so joining the
    pieces rebuilds a string with the original spacing.

    Args:
        text: Headline to format.

    Returns:
        The formatted headline, or an empty string when parsing yields no
        tokens.
    """
    def _capitalize(chunk):
        # Capitalize every hyphen-separated part ("gluten-free" -> "Gluten-Free").
        # str.capitalize() also lower-cases the rest of each part.
        return "-".join(part.capitalize() for part in chunk.split("-"))

    tokens = list(nlp(text))
    last = len(tokens) - 1
    output = []
    for index, token in enumerate(tokens):
        chunk = token.text_with_ws
        # The first and last tokens go through the same hyphen-aware helper, so a
        # hyphenated word at either end is no longer collapsed to "Gluten-free".
        is_edge = index == 0 or index == last
        if is_edge or "-" in token.text or token.pos_ in CAPS:
            output.append(_capitalize(chunk))
        else:
            output.append(chunk.lower())
    # Joining an empty list yields "", so an empty headline no longer raises IndexError.
    return "".join(output)

In [211]:
# Spot-check: the hyphenated token should come back with both parts capitalized.
format_headline("Back to school, gluten-free style")

'Back to School, Gluten-Free Style'

In [18]:
# NOTE(review): absolute, machine-specific location — adjust to your own checkout.
path = Path('/home/karimlulu/repos/prj-nlp') / "tasks" / "02-structural-linguistics"
filename = "examiner-headlines.txt"
# Read inside a context manager so the file handle is closed as soon as the
# lines are loaded (the bare .open().readlines() never closed it explicitly).
with (path / filename).open() as f:
    data = [line.strip() for line in f]
print(len(data))

5000


In [218]:
# A headline that is expected to come back unchanged from format_headline().
s = "Pamper Yourself like a Pharaoh at the Ritz Carlton Denver"
format_headline(s)==s

True

In [219]:
# Count the headlines that format_headline() leaves untouched, reporting
# progress every 500 lines.
i = 0
for k, line in enumerate(data):
    i += int(format_headline(line) == line)
    if not k % 500:
        print(f"Processed: {k}")

Processed: 0
Processed: 500
Processed: 1000
Processed: 1500
Processed: 2000
Processed: 2500
Processed: 3000
Processed: 3500
Processed: 4000
Processed: 4500


In [243]:
from nltk.corpus import sentiwordnet as swn

In [235]:
def prominence(doc):
    """Return the ``ents`` attribute of ``doc`` unchanged."""
    entities = doc.ents
    return entities

In [280]:
# Coarse POS tag -> the one-letter `pos` code passed to swn.senti_synsets();
# tags missing here resolve to None via .get().
mapping = {"NOUN": "n", 
           "VERB": "v", 
           "ADJ": "a",
           "ADV": "r"}

In [371]:
# Coarse POS tags that sentiment() skips entirely.
EXCLUDE = ["PUNCT", "SYM", "SPACE", "X"]

In [420]:
def sentiment(doc, top=5, threshold=0.5):
    """Tell whether the mean positive SentiWordNet score of ``doc`` reaches ``threshold``.

    Every token whose coarse POS tag is not in ``EXCLUDE`` is scored with the
    average ``pos_score()`` of its first ``top`` synsets returned by
    ``swn.senti_synsets`` (0 when no synset is found). The document score is
    the plain mean over the scored tokens only.

    The earlier running mean was weighted by the ``enumerate`` index, which
    also counts excluded tokens, so punctuation placed before a word diluted
    the result while the same punctuation placed after it did not.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.
        top: Maximum number of synsets considered per token.
        threshold: Minimum mean positive score to return ``True``.

    Returns:
        ``True`` when the mean positive score is at least ``threshold``.
    """
    token_scores = []
    for token in doc:
        if token.pos_ in EXCLUDE:
            continue
        synsets = list(swn.senti_synsets(token.text, pos=mapping.get(token.pos_)))[:top]
        if synsets:
            token_scores.append(sum(synset.pos_score() for synset in synsets) / len(synsets))
        else:
            # Words without a SentiWordNet entry still count, with a score of 0.
            token_scores.append(0)
    mean_pos_sentiment = sum(token_scores) / len(token_scores) if token_scores else 0
    return mean_pos_sentiment >= threshold

In [421]:
def superlativeness(doc):
    """Count tokens whose ``pos_`` is ADJ/ADV and whose ``tag_`` is JJS/RBS."""
    return sum(
        1
        for token in doc
        if token.pos_ in ["ADJ", "ADV"] and token.tag_ in ["JJS", "RBS"]
    )

In [433]:
def check_catchy(text, top=5, threshold=0.5):
    """Return ``True`` when ``text`` has entities, positive sentiment or superlatives.

    The text is parsed once with the module-level ``nlp``; ``top`` and
    ``threshold`` are forwarded to ``sentiment``.
    """
    doc = nlp(text)
    signals = (
        prominence(doc),
        sentiment(doc, top=top, threshold=threshold),
        superlativeness(doc),
    )
    return any(signals)

In [441]:
# Inspect the three signals and the combined verdict on one all-caps headline.
word = "BASIC BABY CHECKLIST"
doc = nlp(word)
print(list(doc))
# Displayed tuple: (sentiment flag, superlative count, entities, combined verdict).
sentiment(doc), superlativeness(doc), prominence(doc), check_catchy(word)

[BASIC, BABY, CHECKLIST]


(False, 0, (), False)

In [436]:
# Run the catchiness check on the first ten headlines, printing each verdict
# and tallying the positives in `i`.
i = 0
for k, line in enumerate(data[:10]):
    is_catchy = check_catchy(line)
    print(line, is_catchy)
    if is_catchy:
        i += 1
    if not k % 500:
        print(f"Processed: {k}")

Halep enters Rogers Cup final in straight sets win over Errani True
Processed: 0
The phantoms of St. Mary's True
Talladega turmoil could spell trouble for NASCAR's Chase field True
Burn those calories! Try the Very Steep Trail. False
It's the end of the world... and I feel fine False
2011-2012 NHL team preview: Detroit Red Wings True
Cal coach Jeff Tedford taking a different approach in 2010 -- Part 1 True
Google science fair to encourage STEM learning True
SF Beer Week 2013: what's for dinner (part 2) True
Jersey Shore Season 6 cast's salaries revealed; More than President Obama! True


In [442]:
# Coarse POS tag -> the one-letter `pos` code passed to swn.senti_synsets();
# tags missing here resolve to None via .get().
MAPPING = {"NOUN": "n", 
           "VERB": "v", 
           "ADJ": "a",
           "ADV": "r"}
# Coarse POS tags that sentiment() skips entirely.
EXCLUDE = ["PUNCT", "SYM", "SPACE", "X"]

def prominence(doc):
    """Return the ``ents`` attribute of ``doc`` unchanged."""
    entities = doc.ents
    return entities

def sentiment(doc, top=5, threshold=0.5):
    """Tell whether the mean positive SentiWordNet score of ``doc`` reaches ``threshold``.

    Every token whose coarse POS tag is not in ``EXCLUDE`` is scored with the
    average ``pos_score()`` of its first ``top`` synsets returned by
    ``swn.senti_synsets`` (0 when no synset is found). The document score is
    the plain mean over the scored tokens only.

    The earlier running mean was weighted by the ``enumerate`` index, which
    also counts excluded tokens, so punctuation placed before a word diluted
    the result while the same punctuation placed after it did not.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.
        top: Maximum number of synsets considered per token.
        threshold: Minimum mean positive score to return ``True``.

    Returns:
        ``True`` when the mean positive score is at least ``threshold``.
    """
    token_scores = []
    for token in doc:
        if token.pos_ in EXCLUDE:
            continue
        synsets = list(swn.senti_synsets(token.text, pos=MAPPING.get(token.pos_)))[:top]
        if synsets:
            token_scores.append(sum(synset.pos_score() for synset in synsets) / len(synsets))
        else:
            # Words without a SentiWordNet entry still count, with a score of 0.
            token_scores.append(0)
    mean_pos_sentiment = sum(token_scores) / len(token_scores) if token_scores else 0
    return mean_pos_sentiment >= threshold

def superlativeness(doc):
    """Count tokens whose ``pos_`` is ADJ/ADV and whose ``tag_`` is JJS/RBS."""
    matches = [
        token
        for token in doc
        if token.pos_ in ["ADJ", "ADV"] and token.tag_ in ["JJS", "RBS"]
    ]
    return len(matches)

def check_catchy(text, top=5, threshold=0.5):
    """Return ``True`` when ``text`` has entities, positive sentiment or superlatives.

    The text is parsed once with the module-level ``nlp``; ``top`` and
    ``threshold`` are forwarded to ``sentiment``.
    """
    doc = nlp(text)
    ner = prominence(doc)
    pos_sentiment = sentiment(doc, top=top, threshold=threshold)
    superlat = superlativeness(doc)
    return bool(ner or pos_sentiment or superlat)

In [445]:
# Reload the English pipeline. The stray `output.append(...)` fragment that was
# fused onto this statement made the cell a syntax error and has been removed.
# The model is named in full, like the other load calls, because the bare 'en'
# shortcut is not accepted by spaCy v3.
# NOTE(review): this cell does not reassign nlp.tokenizer, so the custom
# tokenizer is presumably no longer in effect afterwards — confirm that is intended.
nlp = spacy.load('en_core_web_sm')

In [447]:
# Re-check the sample headline held in `word` with the current `nlp`.
check_catchy(word)

False

#### Collocations

In [142]:
from collections import defaultdict, Counter
from functools import reduce
from time import time

In [15]:
# Verb lemmas whose adverb children find_verb() collects by default.
WORDS = ["say", "tell", "speak", "claim", "communicate",
         "narrate", "declare", "respond"]

In [102]:
# Rebind `nlp` to a pipeline loaded with the "ner" and "textcat" components disabled.
nlp = spacy.load('en_core_web_sm', disable=["ner", "textcat"])

In [112]:
# Load the blog corpus, one stripped line per entry; this rebinds `data`.
filename = "blog2008.txt"
with (path / filename).open() as f:
    data = [line.strip() for line in f]

In [147]:
def find_verb(doc, verbs=WORDS, patt="ly"):
    """Collect the adverb children of selected verbs in ``doc``.

    Args:
        doc: Iterable of tokens exposing ``lemma_``, ``pos_``, ``text`` and
            ``children``.
        verbs: Lemmas to look for.
        patt: Suffix an adverb's text must end with.

    Returns:
        ``defaultdict(list)`` mapping each matched verb lemma to the
        lower-cased texts of its adverb children; a matched verb without such
        children still gets an empty list.
    """
    output = defaultdict(list)
    for token in doc:
        if token.pos_ != "VERB" or token.lemma_ not in verbs:
            continue
        output[token.lemma_].extend(
            child.text.lower()
            for child in token.children
            if child.pos_ == "ADV" and child.text.endswith(patt)
        )
    return output

In [137]:
def merge_dicts(a, b):
    """Append every list in ``b`` onto the list stored under the same key in ``a``.

    ``a`` is modified in place and returned, which makes the function usable
    as the reducer in ``functools.reduce``. Keys missing from ``a`` are created
    on the fly, so ``a`` may be a plain ``dict`` as well as a ``defaultdict``
    (the former used to raise ``KeyError``).

    Args:
        a: Mapping of key -> list that receives the values.
        b: Mapping of key -> iterable of values to append.

    Returns:
        The same object as ``a``.
    """
    for key, value in b.items():
        a.setdefault(key, []).extend(value)
    return a

In [159]:
# Lazily parse the first 10,000 posts and extract verb -> adverbs per document.
# The `n_threads` argument was dropped: spaCy documents it as deprecated and
# unused since v2.1, and newer releases no longer accept it.
rez = (find_verb(doc) for doc in nlp.pipe(data[:10000], batch_size=20000))

In [160]:
t0 = time()
# Fold the per-document results into one mapping. The explicit initial value
# keeps reduce() from raising TypeError when there are no documents at all.
# `rez` is rebound from the generator to the merged mapping here, so re-running
# this cell requires re-creating the generator first.
rez = reduce(merge_dicts, rez, defaultdict(list))
print(f"Spent: {time()-t0:.2f}")

Spent: 45.36


In [163]:
# NOTE(review): `output` is only assigned in a cell further down this file, so a
# fresh top-to-bottom run raises NameError here — confirm whether `rez` was intended.
', '.join(str(el) for el in output.values())

"['continually'], ['definitively', 'recently', 'absolutely', 'really', 'Really'], [], ['initially'], [], ['overtly', 'subsequently'], []"

In [165]:
# For every verb keep its ten most frequent adverbs as (adverb, count) pairs.
frq_collocations = {
    verb: Counter(adverbs).most_common(10)
    for verb, adverbs in rez.items()
}

In [169]:
", ".join(str(el) for el in frq_collocations["claim"])

"('initially', 1), ('brazenly', 1), ('surely', 1)"

In [107]:
# Sequential variant: parse the first 2,000 posts one by one and accumulate the
# verb -> adverbs lists in `output`.
t0 = time()
output = defaultdict(list)
for k, line in enumerate(data[:2000]):
    doc = nlp(line.strip())
    # find_verb() has no `output` parameter, so the old call
    # find_verb(doc, output=output) raised TypeError; merge its per-document
    # result into the accumulator instead.
    for verb, adverbs in find_verb(doc).items():
        output[verb].extend(adverbs)
    if k % 1000 == 0:
        print(f"Processed: {k}")
print(f"Spent: {time()-t0:.2f}")

Processed: 0
Processed: 1000
Spent: 28.65


In [141]:
# Number of lines currently held in `data`.
len(data)

303994

In [96]:
# Display the verb -> adverbs mapping accumulated in `output`.
output

defaultdict(list,
            {'claim': ['initially'],
             'communicate': [],
             'declare': [],
             'respond': [],
             'say': ['definitively',
              'recently',
              'absolutely',
              'really',
              'Really'],
             'speak': ['overtly', 'subsequently'],
             'tell': ['continually']})

In [56]:
# Inspect the second token of the current `doc`: the POS tag of its last child
# and the token's own lemma.
x = doc[1]
list(x.children)[-1].pos_, x.lemma_

('ADV', 'speak')

In [26]:
# Default prefix that shmificate() puts in place of a word's leading consonants.
PREFIX = "shm"
# Letters shmificate() treats as vowels when looking for the end of that consonant run.
vowels =  ('a', 'e', 'i', 'o', 'u')

In [48]:
def shmificate(word, prefix=PREFIX):
    """Replace the leading consonants of ``word`` with ``prefix``.

    The word is lower-cased, its run of leading non-vowel letters (per the
    module-level ``vowels``) is dropped, and the prefix is put in front. A word
    containing "sh" gets "sm" instead of ``prefix``; a word that already starts
    with ``PREFIX`` is returned as is. If the input started with an uppercase
    letter, the result does too.

    Args:
        word: Word to transform.
        prefix: Replacement for the leading consonants.

    Returns:
        The transformed word; an empty input is returned unchanged (it used to
        raise ``IndexError``).
    """
    if not word:
        return word
    is_capitalized = word[0].isupper()
    word = word.lower()
    # NOTE(review): this check uses the module constant, not the `prefix`
    # argument — kept as is; confirm that is intended for custom prefixes.
    if word.startswith(PREFIX):
        # Restore the initial capital, which the early return used to drop.
        return word.capitalize() if is_capitalized else word
    if "sh" in word:
        prefix = "sm"
    # Index of the first vowel; the whole word is consonants when none is found.
    onset_end = next(
        (index for index, letter in enumerate(word) if letter in vowels),
        len(word),
    )
    if is_capitalized:
        prefix = prefix.capitalize()
    return prefix + word[onset_end:]

In [50]:
# Try shmificate() on a few sample words; note this rebinds `words`.
words = ["table", "apple", "shmaltz", "Ashmont", "truncate"]
[shmificate(word) for word in words]

['shmable', 'shmapple', 'shmaltz', 'Smashmont', 'shmuncate']