In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# All competition files are read from the same input directory; name it once
# so the location can be changed in a single place.
INPUT_DIR = "../input/jigsaw-toxic-severity-rating"

# Comments that are scored for the submission (uses `comment_id` and `text`).
comments = pd.read_csv(f"{INPUT_DIR}/comments_to_score.csv")
# Annotated pairs (`more_toxic` / `less_toxic` text columns) used for validation.
valid = pd.read_csv(f"{INPUT_DIR}/validation_data.csv")
# Template whose columns and shape the final submission is checked against.
sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

In [None]:
# One or more consecutive newline characters.
newlines = re.compile("\n+")


def clean_text(t):
    """Flatten a text onto a single line.

    Every run of newline characters is replaced by one space, then leading
    and trailing whitespace is removed.

    Parameters
    ----------
    t : str
        Raw text.

    Returns
    -------
    str
        The flattened, trimmed text.
    """
    single_line = newlines.sub(" ", t)
    return single_line.strip()

In [None]:
class VaderToxicity:
    """Heuristic toxicity scorer built on NLTK's VADER sentiment analyzer.

    A text is split into sentences, each sentence is scored by VADER, and the
    per-sentence ``neg`` / ``neu`` / ``pos`` proportions are averaged. The
    final score is ``(neg - pos) / (neu + eps)``: higher means more negative
    sentiment relative to positive, which is used here as a toxicity proxy.
    """

    def __init__(self):
        # Build the analyzer once; it is reused for every call to `score`.
        self.sid = SentimentIntensityAnalyzer()

    def score(self, t, eps=1e-3):
        """Return the toxicity proxy for one text.

        Parameters
        ----------
        t : str
            Text to score.
        eps : float, optional
            Small constant added to the neutral proportion so the division
            is defined when no token is neutral. Defaults to ``1e-3``.

        Returns
        -------
        float
            ``(mean_neg - mean_pos) / (mean_neu + eps)``, or ``0.0`` when the
            text contains no sentences (empty or whitespace-only input).
        """
        # Nothing to score: indexing an empty score matrix would raise
        # IndexError, so treat sentence-less input as neutral.
        if not t.strip():
            return 0.0
        sentences = tokenize.sent_tokenize(t)
        if not sentences:
            return 0.0
        # Read the components by key instead of relying on the order in which
        # the analyzer happens to insert them into its result dict.
        per_sentence = np.array(
            [
                [polarity["neg"], polarity["neu"], polarity["pos"]]
                for polarity in map(self.sid.polarity_scores, sentences)
            ]
        )
        neg, neu, pos = per_sentence.mean(axis=0)
        return (neg - pos) / (neu + eps)

In [None]:
# Module-level scorer instance; `clean_score` calls `vt.score` on it.
vt = VaderToxicity()

In [None]:
def clean_score(t):
    """Clean a raw text with `clean_text`, then score it with the shared `vt` scorer.

    Parameters
    ----------
    t : str
        Raw text.

    Returns
    -------
    The value returned by ``vt.score`` for the cleaned text.
    """
    cleaned = clean_text(t)
    return vt.score(cleaned)

In [None]:
# Score both texts of every validation pair, storing each result next to its text column.
for text_column, score_column in (
    ("more_toxic", "more_toxic_score"),
    ("less_toxic", "less_toxic_score"),
):
    valid[score_column] = valid[text_column].map(clean_score)

In [None]:
# Fraction of validation pairs where the `more_toxic` text received the strictly higher score.
(valid["more_toxic_score"] > valid["less_toxic_score"]).mean()

In [None]:
# Bootstrap the pairwise accuracy: resample the per-pair outcomes with
# replacement and recompute the mean each time.
N_BOOTSTRAP = 1000
# Fixed seed so the reported quantiles are identical on every fresh run.
bootstrap_rng = np.random.RandomState(0)
# Compare the scores once; each resample then draws only from this boolean
# column instead of copying the whole frame (text columns included).
pair_is_correct = valid["more_toxic_score"] > valid["less_toxic_score"]
bootstrap_estimates = [
    pair_is_correct.sample(frac=1.0, replace=True, random_state=bootstrap_rng).mean()
    for _ in range(N_BOOTSTRAP)
]
# Min, 1%, 2.5%, 97.5%, 99% and max of the bootstrap distribution.
np.quantile(bootstrap_estimates, [0, 0.01, 0.025, 0.975, 0.99, 1.]).round(3)

In [None]:
# Score every comment's `text` with `clean_score` and store the result in a new `score` column.
comments["score"] = comments["text"].map(clean_score)

In [None]:
# Select the id and score columns, in that order, as the submission frame.
submission = comments[["comment_id", "score"]]

In [None]:
# Sanity-check the submission against the provided template before writing it.
# Column names are compared as lists so that a differing number of columns
# fails this assertion with a readable message instead of raising from the
# element-wise Index comparison.
assert list(submission.columns) == list(sample_submission.columns), (
    f"submission columns {list(submission.columns)} do not match "
    f"expected {list(sample_submission.columns)}"
)
assert submission.shape == sample_submission.shape, (
    f"submission shape {submission.shape} does not match "
    f"expected {sample_submission.shape}"
)

In [None]:
# Write the submission to `submission.csv`; `index=False` omits the DataFrame index.
submission.to_csv("submission.csv",index=False)