# Sentiment analysis

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Register tqdm with pandas so progress-aware methods such as
# `progress_apply` become available on DataFrames and Series.
tqdm.pandas()

In [None]:
# Load the sentiment dataset straight from S3. `anon=True` asks for an
# unauthenticated connection, so no AWS credentials are required.
df = pd.read_parquet(
    "s3://ling583/sentiment.parquet",
    storage_options={"anon": True},
)

-----

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.dummy import DummyClassifier

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split reproducible.
train, test = train_test_split(
    df,
    stratify=df["sentiment"],
    random_state=619,
    test_size=0.2,
)

In [None]:
# Reference point: a classifier that ignores the text entirely. Any real
# model should beat the test accuracy printed here.
baseline = DummyClassifier().fit(train['text'], train['sentiment'])
print(accuracy_score(y_true=test['sentiment'], y_pred=baseline.predict(test['text'])))

-----

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# VADER needs its lexicon on disk. NLTK raises LookupError when the resource
# has not been downloaded yet, so fetch it once and retry instead of failing.
# When the lexicon is already installed this is identical to a plain
# construction.
try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()

In [None]:
import multiprocessing as mp

In [None]:
def get_compound(text):
    """Return the 'compound' entry of VADER's polarity scores for *text*.

    Kept as a module-level function so it can be handed to a process pool.
    """
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# Score every review in parallel worker processes and store the result in a
# new 'vader' column.
#
# The progress bar wraps the *results* of imap rather than the input column,
# so it advances as scores come back instead of as texts are handed to the
# pool. imap yields results in input order, so the list lines up with the
# frame's rows; `total` is given because the imap iterator has no length.
#
# NOTE(review): the workers must be able to resolve `get_compound` and
# `analyzer`; this presumably relies on the "fork" start method -- confirm
# before running on macOS/Windows.
with mp.Pool() as p:
    train['vader'] = list(
        tqdm(p.imap(get_compound, train['text'], chunksize=500), total=len(train))
    )
    test['vader'] = list(
        tqdm(p.imap(get_compound, test['text'], chunksize=500), total=len(test))
    )

---

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of the compound score within each value of `overall`
# (presumably the star rating -- confirm against the dataset). Outliers are
# hidden to keep the boxes readable.
train.boxplot(column='vader', by='overall', showfliers=False, grid=False)
plt.ylabel('VADER composite')

In [None]:
# Training accuracy of the rule "compound score above 0.75 means 'good'".
accuracy_score(y_true=train['sentiment'] == 'good', y_pred=train['vader'] > 0.75)

In [None]:
# Sweep 50 evenly spaced cutoffs across [-1, 1] and record the training
# accuracy obtained at each one.
xs = np.linspace(-1.0, 1.0, num=50)
ys = [
    accuracy_score(y_true=train['sentiment'] == 'good', y_pred=train['vader'] > cutoff)
    for cutoff in xs
]

In [None]:
# Plot training accuracy (ys) against the candidate cutoff (xs).
plt.plot(xs, ys)
plt.xlabel('VADER cutoff')
plt.ylabel('Train acc.')

In [None]:
# Cutoff with the highest training accuracy (np.argmax returns the first
# maximum when several cutoffs tie).
xs[np.argmax(ys)]

In [None]:
# Pick the best cutoff explicitly instead of reading IPython's `_` (the last
# displayed output). `_` changes whenever any cell produces output, so
# re-running this cell would otherwise set `thresh` to the accuracy it
# displayed last time, and the code would not run at all outside IPython.
thresh = xs[np.argmax(ys)]
# Training accuracy at the chosen cutoff.
accuracy_score((train['vader'] > thresh), (train['sentiment'] == 'good'))

In [None]:
# Held-out accuracy of the compound-score rule at the cutoff chosen on the
# training split.
accuracy_score(y_true=test['sentiment'] == 'good', y_pred=test['vader'] > thresh)

---

In [None]:
# First training review labelled 'bad' that VADER nevertheless scores as
# strongly positive (compound above 0.9).
train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[0]

In [None]:
# Full score breakdown for the first 'bad' review with compound above 0.9.
# The text is looked up explicitly rather than through IPython's `_` (last
# displayed output), which points at something else as soon as any other
# cell is run in between.
analyzer.polarity_scores(
    train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[0]
)

In [None]:
# Second training review labelled 'bad' that VADER scores above 0.9.
train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[1]

In [None]:
# Full score breakdown for the second 'bad' review with compound above 0.9.
# The text is looked up explicitly rather than through IPython's `_` (last
# displayed output), which points at something else as soon as any other
# cell is run in between.
analyzer.polarity_scores(
    train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[1]
)

In [None]:
# Third training review labelled 'bad' that VADER scores above 0.9.
train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[2]

In [None]:
# Full score breakdown for the third 'bad' review with compound above 0.9.
# The text is looked up explicitly rather than through IPython's `_` (last
# displayed output), which points at something else as soon as any other
# cell is run in between.
analyzer.polarity_scores(
    train.loc[(train['vader'] > .9) & (train['sentiment'] == 'bad'), 'text'].iloc[2]
)

In [None]:
def get_diff(text):
    """Return VADER's 'pos' score minus its 'neg' score for *text*.

    Kept as a module-level function so it can be handed to a process pool.
    """
    polarity = analyzer.polarity_scores(text)
    positive, negative = polarity['pos'], polarity['neg']
    return positive - negative

# Compute the pos-minus-neg score for every review in parallel worker
# processes and store it in a new 'vader_diff' column.
#
# The progress bar wraps the *results* of imap rather than the input column,
# so it advances as scores come back instead of as texts are handed to the
# pool. imap yields results in input order, so the list lines up with the
# frame's rows; `total` is given because the imap iterator has no length.
#
# NOTE(review): the workers must be able to resolve `get_diff` and
# `analyzer`; this presumably relies on the "fork" start method -- confirm
# before running on macOS/Windows.
with mp.Pool() as p:
    train['vader_diff'] = list(
        tqdm(p.imap(get_diff, train['text'], chunksize=500), total=len(train))
    )
    test['vader_diff'] = list(
        tqdm(p.imap(get_diff, test['text'], chunksize=500), total=len(test))
    )

In [None]:
# Distribution of the pos-minus-neg score within each value of `overall`
# (presumably the star rating -- confirm against the dataset). Outliers are
# hidden to keep the boxes readable.
train.boxplot(column='vader_diff', by='overall', showfliers=False, grid=False)
plt.ylabel('VADER pos - VADER neg')

In [None]:
# Sweep 100 evenly spaced cutoffs across [-1, 1] for the pos-minus-neg score
# and record the training accuracy obtained at each one.
xs = np.linspace(-1.0, 1.0, num=100)
ys = [
    accuracy_score(y_true=train['sentiment'] == 'good', y_pred=train['vader_diff'] > cutoff)
    for cutoff in xs
]

# Plot training accuracy against the candidate cutoff.
plt.plot(xs, ys)
plt.xlabel('VADER cutoff')
plt.ylabel('Train acc.')

In [None]:
# Cutoff with the highest training accuracy (np.argmax returns the first
# maximum when several cutoffs tie); the bare name displays it in the
# notebook.
thresh = xs[np.argmax(ys)]
thresh

In [None]:
# Held-out accuracy of the pos-minus-neg rule at the cutoff chosen on the
# training split.
accuracy_score(y_true=test['sentiment'] == 'good', y_pred=test['vader_diff'] > thresh)