# Sentiment analysis

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy

# Load the small English pipeline; the "ner" component is excluded because
# none of the cells shown here read named entities.
nlp = spacy.load("en_core_web_sm", exclude=["ner"])

In [None]:
# Read the dataset from S3; 'anon': True asks for anonymous (unauthenticated) access.
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

In [None]:
# Load serialized spaCy docs from disk and attach them to the frame as a
# 'doc' column, rebuilt against the loaded pipeline's vocab.
# NOTE(review): assumes parsed.docbin holds exactly one doc per row of df,
# stored in row order -- confirm against whatever produced the file.
docs = DocBin().from_disk('parsed.docbin')
df['doc'] = list(docs.get_docs(nlp.vocab))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
# Hold out 20% of the rows for evaluation, stratifying on the sentiment
# label; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=619,
    stratify=df['sentiment'],
)

----

In [None]:
# Visualise the parse of an example sentence containing a negation.
displacy.render(nlp("They didn't have any clean towels."))

In [None]:
from spacy.tokens import Token

# Register the custom `token._.neg` flag (False unless explicitly set).
# force=True keeps this cell re-runnable: without it spaCy raises an error
# when an extension with the same name has already been registered.
Token.set_extension('neg', default=False, force=True)

In [None]:
# First negation scheme: flag only the head of every 'neg' dependent.
for doc in df['doc']:
    for t in doc:
        if t.dep_ != 'neg':
            continue
        t.head._.neg = True

In [None]:
def add_neg(token):
    """Return the token's norm, prefixed with 'NOT:' when its `neg` flag is set."""
    norm = token.norm_
    if token._.neg:
        return f'NOT:{norm}'
    return norm

In [None]:
def tokenize(doc):
    """Return one negation-marked feature string per token of `doc`."""
    return list(map(add_neg, doc))

In [None]:
# Model 1: negation-marked unigram counts -> tf-idf -> SGD-trained classifier.
# `identity` hands each Doc through untouched so `tokenize` receives it as-is.
m1 = make_pipeline(
    CountVectorizer(tokenizer=tokenize, preprocessor=identity),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
)
m1.fit(train['doc'], train['sentiment'])
m1.score(test['doc'], test['sentiment'])

In [None]:
def print_top_feats(M, k=0):
    """Print the k highest- and k lowest-weighted features of a fitted pipeline.

    Each output line pairs one feature from the top of the coefficient
    ranking (left column) with one from the bottom (right column).

    Parameters
    ----------
    M : pipeline exposing ``named_steps`` with a 'countvectorizer' step
        (supplies the feature names) and an 'sgdclassifier' step (supplies
        ``coef_``). Only the first row of ``coef_`` is used.
    k : int
        Number of features to show per column; nothing is printed when
        ``k <= 0``.
    """
    if k <= 0:
        return
    vectorizer = M.named_steps['countvectorizer']
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        V = vectorizer.get_feature_names_out()
    else:
        V = vectorizer.get_feature_names()
    coef = M.named_steps['sgdclassifier'].coef_[0]
    order = coef.argsort()  # feature indices, ascending by coefficient
    for w1, w2 in zip(order[-k:][::-1], order[:k]):
        print(f'{V[w1]:20s} {coef[w1]:7.3f} | {V[w2]:20s} {coef[w2]:7.3f}')

In [None]:
# Show the 25 highest- and 25 lowest-coefficient features of m1.
print_top_feats(m1, 25)

In [None]:
def negify(tok):
    """Set the `neg` flag on `tok` and on every token reachable through `children`."""
    pending = [tok]
    while pending:
        node = pending.pop()
        node._.neg = True
        pending.extend(node.children)

In [None]:
# Second negation scheme: besides the head of each 'neg' dependent, also flag
# the subtrees of selected right-hand dependents of that head.
for doc in df['doc']:
    # Clear the flags left behind by the earlier head-only pass.
    for t in doc:
        t._.neg = False
    for t in doc:
        if t.dep_ != 'neg':
            continue
        t.head._.neg = True
        for r in t.head.rights:
            if r.dep_ in ('acomp', 'advmod', 'attr', 'dobj', 'prep', 'xcomp'):
                negify(r)

In [None]:
# Model 2: same pipeline as m1, refitted now that the `neg` flags on the docs
# have been recomputed with the wider negation scope.
m2 = make_pipeline(
    CountVectorizer(tokenizer=tokenize, preprocessor=identity),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
)
m2.fit(train['doc'], train['sentiment'])
m2.score(test['doc'], test['sentiment'])

In [None]:
# Show the 25 highest- and 25 lowest-coefficient features of m2.
print_top_feats(m2, 25)

In [None]:
def mod_tokenizer(doc):
    """Return head_modifier pair features followed by the per-token features.

    A pair is emitted for every token whose dependency label is 'amod' or
    'advmod'; both halves and all single tokens go through `add_neg`.
    """
    modifier_deps = ('amod', 'advmod')
    pairs = [
        f'{add_neg(w.head)}_{add_neg(w)}'
        for w in doc
        if w.dep_ in modifier_deps
    ]
    singles = [add_neg(w) for w in doc]
    return pairs + singles

In [None]:
# Try the tokenizer on the example sentence used for the parse display
# (fixes the "The didn't" typo, which altered the parse being demonstrated).
mod_tokenizer(nlp("They didn't have any clean towels."))

In [None]:
# Model 3: adds head_modifier pair features (via `mod_tokenizer`) to the
# negation-marked unigrams.
m3 = make_pipeline(
    CountVectorizer(tokenizer=mod_tokenizer, preprocessor=identity),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
)
m3.fit(train['doc'], train['sentiment'])
m3.score(test['doc'], test['sentiment'])

In [None]:
# Show the 25 highest- and 25 lowest-coefficient features of m3.
print_top_feats(m3, 25)

In [None]:
def everything(doc):
    """Return a head_token pair feature for every token, then the per-token features."""
    arcs = [f'{add_neg(w.head)}_{add_neg(w)}' for w in doc]
    singles = [add_neg(w) for w in doc]
    return arcs + singles

In [None]:
# Try the tokenizer on the example sentence used for the parse display
# (fixes the "The didn't" typo, which altered the parse being demonstrated).
everything(nlp("They didn't have any clean towels."))

In [None]:
# Model 4: pair features for every dependency arc (via `everything`) plus the
# negation-marked unigrams.
m4 = make_pipeline(
    CountVectorizer(tokenizer=everything, preprocessor=identity),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
)
m4.fit(train['doc'], train['sentiment'])
m4.score(test['doc'], test['sentiment'])

In [None]:
# Show the 50 highest- and 50 lowest-coefficient features of m4.
print_top_feats(m4, 50)