In [1]:
import os
import gc
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import markdown
import json
import requests
import warnings
import time

from colorama import Fore, Back, Style, init

%matplotlib inline
import pandas as pd
import pickle
import re
import time
from collections import Counter, defaultdict
import numpy as np
import csv
import random
from scipy import stats
from termcolor import colored
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

In [49]:
#create wrapper for perspective API
# code copied from https://github.com/Conway/perspective


def validate_language(language):
    # ISO 639-1 code validation
    # language source: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    codes = ["ab", "aa", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay",
             "az", "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs",
             "ca", "ce", "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de",
             "dv", "dz", "ee", "el", "en", "eo", "es", "et", "eu", "fa", "ff",
             "fi", "fj", "fo", "fr", "fy", "ga", "gd", "gl", "gn", "gu", "gv",
             "ha", "he", "hi", "ho", "hr", "ht", "hu", "hy", "hz", "ia", "id",
             "ie", "ig", "ii", "ik", "io", "is", "it", "iu", "ja", "jv", "ka",
             "kg", "ki", "kj", "kk", "kl", "km", "kn", "ko", "kr", "ks", "ku",
             "kv", "kw", "ky", "la", "lb", "lg", "li", "ln", "lo", "lt", "lu",
             "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my",
             "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv", "ny",
             "oc", "oj", "om", "or", "os", "pa", "pi", "ps", "pt", "qu", "rm",
             "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
             "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw",
             "ta", "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts",
             "tt", "tw", "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa",
             "wo", "xh", "yi", "yo", "za", "zh", "zu"]
    return language.lower() in codes


def remove_html(text, md=False):
    if md:
        text = markdown.markdown(text)
    # credit: stackoverflow
    class MLStripper(HTMLParser):
        def __init__(self):
            super().__init__()
            self.reset()
            self.strict = False
            self.convert_charrefs= True
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ''.join(self.fed)

    s = MLStripper()
    s.feed(text)
    return s.get_data()

# allowed test types
allowed = ["TOXICITY",
           "SEVERE_TOXICITY",
           "TOXICITY_FAST",
           "ATTACK_ON_AUTHOR",
           "ATTACK_ON_COMMENTER",
           "INCOHERENT",
           "INFLAMMATORY",
           "OBSCENE",
           "OFF_TOPIC",
           "UNSUBSTANTIAL",
           "LIKELY_TO_REJECT"]

class Perspective(object):

    base_url = "https://commentanalyzer.googleapis.com/v1alpha1"

    def __init__(self, key):
        self.key = key

    def score(self, text, tests=["TOXICITY"], context=None, languages=None, do_not_store=False, token=None, text_type=None):
        # data validation
        # make sure it's a valid test
        # TODO: see if an endpoint that has valid types exists
        if isinstance(tests, str):
            tests = [tests]
        if not isinstance(tests, (list, dict)) or tests is None:
            raise ValueError("Invalid list/dictionary provided for tests")
        if isinstance(tests, list):
            new_data = {}
            for test in tests:
                new_data[test] = {}
            tests = new_data
        if text_type:
            if text_type.lower() == "html":
                text = remove_html(text)
            elif text_type.lower() == "md":
                text = remove_html(text, md=True)
            else:
                raise ValueError("{0} is not a valid text_type. Valid options are 'html' or 'md'".format(str(text_type)))

        for test in tests.keys():
            if test not in allowed:
                warnings.warn("{0} might not be accepted as a valid test.".format(str(test)))
            for key in tests[test].keys():
                if key not in ["scoreType", "scoreThreshhold"]:
                    raise ValueError("{0} is not a valid sub-property for {1}".format(key, test))

        # The API will only grade text less than 3k characters long
        if len(text) > 3000:
            warnings.warn("Perspective only allows 3000 character strings. Only the first 3000 characters will be sent for processing")
            text = text[:3000]
        new_langs = []
        if languages:
            for language in languages:
                language = language.lower()
                if validate_language(language):
                    new_langs.append(language)

        # packaging data
        url = Perspective.base_url + "/comments:analyze"
        querystring = {"key": self.key}
        payload_data = {"comment": {"text": text}, "requestedAttributes": {}}
        for test in tests.keys():
            payload_data["requestedAttributes"][test] = tests[test]
        if new_langs != None:
            payload_data["languages"] = new_langs
        if do_not_store:
            payload_data["doNotStore"] = do_not_store
        payload = json.dumps(payload_data)
        headers = {'content-type': "application/json"}
        response = requests.post(url,
                            data=payload,
                            headers=headers,
                            params=querystring)
        data = response.json()
        if "error" in data.keys():
            raise PerspectiveAPIException(data["error"]["message"])
        c = Comment(text, [], token)
        base = data["attributeScores"]
        for test in tests.keys():
            score = base[test]["summaryScore"]["value"]
            score_type = base[test]["summaryScore"]["type"]
            a = Attribute(test, [], score, score_type)
            for span in base[test]["spanScores"]:
                beginning = span["begin"]
                end = span["end"]
                score = span["score"]["value"]
                score_type = span["score"]["type"]
                s = Span(beginning, end, score, score_type, c)
                a.spans.append(s)
            c.attributes.append(a)
        return c

class Comment(object):
    def __init__(self, text, attributes, token):
        self.text = text
        self.attributes = attributes
        self.token = token

    def __getitem__(self, key):
        if key.upper() not in allowed:
            raise ValueError("value {0} does not exist".format(key))
        for attr in self.attributes:
            if attr.name.lower() == key.lower():
                return attr
        raise ValueError("value {0} not found".format(key))

    def __str__(self):
        return self.text

    def __repr__(self):
        count = 0
        num = 0
        for attr in self.attributes:
            count += attr.score
            num += 1
        return "<({0}) {1}>".format(str(count/num), self.text)

    def __iter__(self):
        return iter(self.attributes)

    def __len__(self):
        return len(self.text)

class Attribute(object):
    def __init__(self, name, spans, score, score_type):
        self.name = name
        self.spans = spans
        self.score = score
        self.score_type = score_type

    def __getitem__(self, index):
        return self.spans[index]

    def __iter__(self):
        return iter(self.spans)

class Span(object):
    def __init__(self, begin, end, score, score_type, comment):
        self.begin = begin
        self.end = end
        self.score = score
        self.score_type = score_type
        self.comment = comment

    def __str__(self):
        return self.comment.text[self.begin:self.end]

    def __repr__(self):
        return "<({0}) {1}>".format(self.score, self.comment.text[self.begin:self.end])

class PerspectiveAPIException(Exception):
    pass

In [50]:
p = Perspective("AIzaSyDykqOfDYdE01O1fiUPsu_NiBIMmRm4Ib4")

In [51]:
comment = p.score("This is a comment", tests=["TOXICITY"])
print("Toxicity score: " + str(comment["TOXICITY"].score))

Toxicity score: 0.060067445


In [52]:
traindf = pd.read_csv("/project/Dataset/SBFv2.trn.csv")
devdf = pd.read_csv("/project/Dataset/SBFv2.dev.csv")
testdf = pd.read_csv("/project/Dataset/SBFv2.tst.csv")
fulldata = pd.concat([_trndf, _devdf, _tstdf], ignore_index=True)

NameError: name '_trndf' is not defined

In [None]:
fulldata

In [None]:
post2feats = dict()
papi_cnt = 0

for _post, _implication, _offense, _intent, _group, _category in zip(fulldata['post'], fulldata['targetStereotype'], fulldata['offensiveYN'], fulldata['intentYN'], fulldata['targetMinority'], fulldata['targetCategory']):
    post = _post
    post = re.sub(r'\bRT\b', ' ', post)
    post = re.sub(r'@\S+', ' ', post)
    post = re.sub(r'http\S+', ' ', post)
    post = re.sub(r'&.*?;', '.', post)
    if post not in post2feats.keys():
        post2feats[post] = defaultdict(list)
        post2feats[post]['toxicity'].append(p.score(post, tests=["TOXICITY"], languages=["en"])["TOXICITY"].score)
        papi_cnt += 1
        time.sleep(1.001)
        if papi_cnt % 1000 == 0:
            print(papi_cnt)
        
    if pd.isna(_implication):
        implication = None
    else:
        implication = _implication
    if pd.isna(_group):
        group = None
    else:
        group = _group
    if pd.isna(_category):
        category = None
    else:
        category = _category
    
    post2feats[post]['offense'].append(_offense)
    post2feats[post]['intent'].append(_intent)
    post2feats[post]['group'].append(group)
    post2feats[post]['category'].append(category)
    post2feats[post]['implication'].append(implication)

In [None]:
post2feats

In [None]:
pickle.dump(post2feats, open("post2feats.pkl", 'wb'))

In [None]:
post2feats = pickle.load(open("post2feats.pkl", 'rb'))

In [None]:
post2feats

In [None]:
Fulldata_left= fulldata[15181:]

In [None]:
len(post2feats)

In [None]:
len(_trndf) + len(_devdf) + len(_tstdf)

In [None]:
post2feats = dict()
papi_cnt = 0

for _post, _implication, _offense, _intent, _group, _category in zip(Fulldata_left['post'], Fulldata_left['targetStereotype'], Fulldata_left['offensiveYN'], Fulldata_left['intentYN'], Fulldata_left['targetMinority'], Fulldata_left['targetCategory']):
    post = _post
    post = re.sub(r'\bRT\b', ' ', post)
    post = re.sub(r'@\S+', ' ', post)
    post = re.sub(r'http\S+', ' ', post)
    post = re.sub(r'&.*?;', '.', post)
    if post not in post2feats.keys():
        post2feats[post] = defaultdict(list)
        post2feats[post]['toxicity'].append(p.score(post, tests=["TOXICITY"], languages=["en"])["TOXICITY"].score)
        papi_cnt += 1
        time.sleep(1.001)
        if papi_cnt % 1000 == 0:
            print(papi_cnt)
        
    if pd.isna(_implication):
        implication = None
    else:
        implication = _implication
    if pd.isna(_group):
        group = None
    else:
        group = _group
    if pd.isna(_category):
        category = None
    else:
        category = _category
    
    post2feats[post]['offense'].append(_offense)
    post2feats[post]['intent'].append(_intent)
    post2feats[post]['group'].append(group)
    post2feats[post]['category'].append(category)
    post2feats[post]['implication'].append(implication)

In [None]:
len(post2feats2)

In [None]:
pickle.dump(post2feats, open("post2feats2.pkl", 'wb'))

In [9]:
post2feats2 = pickle.load(open("post2feats2.pkl", 'rb'))

In [None]:
len(post2feats2) + len(post2feats)

In [None]:
features_dict={**post2feats2,**post2feats}

In [12]:
features_dict = pickle.load(open("features_dict.pkl", 'rb'))

In [None]:
pickle.dump(features_dict, open("features_dict.pkl", 'wb'))

In [14]:
#get microaggression
#changed cutoff from original code
micro_aggression = []
for k, v in features_dict.items():
    if np.mean(v['offense']) >= 0.5 and any([_g != None for _g in v['group']]):
        micro_aggression.append((k, v))
        
sorted_list = sorted([v['toxicity'][0] for k, v in micro_aggression])
micro_aggression_clean_cutoff = 0

running_sum = 0
for _i, v in enumerate(sorted_list):
    running_sum += v
    running_mean = running_sum / (_i + 1)
    if running_mean >= .20:
        micro_aggression_clean_cutoff = v
        print(f"Stop at index: {_i}/{len(sorted_list)}")
        break
        
print(f"Cutoff: {micro_aggression_clean_cutoff}")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Stop at index: 3881/15743
Cutoff: 0.34201986


In [36]:
clean_train = []
for k, v in features_dict.items():
    if np.mean(v['offense']) == 0:
        clean_train.append((k, v))
        
sorted_toxicity_list = sorted([v['toxicity'][0] for k, v in clean_train])
clean_train_clean_cutoff = 0

running_sum = 0
for _i, v in enumerate(sorted_toxicity_list):
    print(_i, v)
    running_sum += v
    running_mean = running_sum / (_i + 1)
    if running_mean >= .20:
        clean_train_clean_cutoff = v
        print(f"Stop at index: {_i}/{len(sorted_toxicity_list)}")
        break
        
print(f"Cutoff: {clean_train_clean_cutoff}")

0 5.567405e-06
1 3.1494008e-05
2 5.9146798e-05
3 6.200464e-05
4 7.502026e-05
5 0.0001259149
6 0.00018482625
7 0.00022737747
8 0.00025055165
9 0.00026385256
10 0.00028966874
11 0.00030916728
12 0.0003149282
13 0.00032616107
14 0.0003390749
15 0.000374268
16 0.00037670016
17 0.00042051842
18 0.00042105696
19 0.00063197885
20 0.0006591145
21 0.00070315436
22 0.0007218379
23 0.0008275631
24 0.0009119357
25 0.0009171256
26 0.0009301744
27 0.000932547
28 0.0009414439
29 0.0009670967
30 0.0011118202
31 0.0011445906
32 0.0012120589
33 0.0012503158
34 0.0013438818
35 0.0014515348
36 0.0014823775
37 0.0018225369
38 0.0018360306
39 0.0020151555
40 0.0020256836
41 0.0022012498
42 0.0022694597
43 0.002284881
44 0.0023430076
45 0.0023443422
46 0.0024413187
47 0.002657959
48 0.0027712467
49 0.0029522993
50 0.002955858
51 0.0030074602
52 0.0030890154
53 0.0032828203
54 0.0035483937
55 0.0035753811
56 0.0036692438
57 0.0036996417
58 0.0037024592
59 0.0037494644
60 0.0038846978
61 0.0043341415
62 0.0044

1237 0.054898046
1238 0.054907553
1239 0.05493816
1240 0.05494025
1241 0.054942027
1242 0.054946277
1243 0.05495076
1244 0.054955475
1245 0.05498477
1246 0.0549857
1247 0.05499443
1248 0.05502133
1249 0.055023573
1250 0.05502543
1251 0.05503625
1252 0.055041194
1253 0.05504931
1254 0.055055495
1255 0.05508216
1256 0.055095457
1257 0.055120345
1258 0.055125136
1259 0.055185117
1260 0.05520197
1261 0.05521735
1262 0.055266123
1263 0.05526628
1264 0.05527161
1265 0.05527184
1266 0.05530075
1267 0.055305775
1268 0.055314973
1269 0.055315822
1270 0.055317756
1271 0.055341024
1272 0.055365987
1273 0.055373795
1274 0.05538593
1275 0.05538856
1276 0.055394974
1277 0.055413216
1278 0.055413987
1279 0.05542473
1280 0.055431224
1281 0.05543996
1282 0.05546477
1283 0.05548904
1284 0.05550999
1285 0.05554531
1286 0.055545468
1287 0.0555542
1288 0.055554897
1289 0.055558376
1290 0.055586047
1291 0.055591457
1292 0.05560112
1293 0.055612173
1294 0.055613797
1295 0.055619285
1296 0.055619977
1297 0.05

2480 0.07075523
2481 0.07079666
2482 0.07083685
2483 0.07084381
2484 0.0709017
2485 0.07090518
2486 0.070915155
2487 0.070940584
2488 0.07094638
2489 0.070967786
2490 0.07097985
2491 0.07100203
2492 0.07102738
2493 0.07102893
2494 0.0710362
2495 0.071052425
2496 0.07106302
2497 0.071076855
2498 0.071079865
2499 0.071083575
2500 0.07110135
2501 0.071127094
2502 0.07114047
2503 0.07114835
2504 0.071154535
2505 0.07116883
2506 0.07116899
2507 0.07117309
2508 0.071199134
2509 0.0712384
2510 0.07123871
2511 0.07124149
2512 0.07125927
2513 0.071268156
2514 0.07131113
2515 0.07131129
2516 0.07132806
2517 0.0713497
2518 0.071362995
2519 0.071392216
2520 0.07139577
2521 0.07140319
2522 0.07142267
2523 0.07142816
2524 0.07147569
2525 0.07149007
2526 0.071492314
2527 0.07152903
2528 0.07152903
2529 0.071592174
2530 0.071599446
2531 0.07163732
2532 0.07164211
2533 0.07164729
2534 0.07164953
2535 0.071655326
2536 0.071667075
2537 0.07167651
2538 0.07167766
2539 0.07168478
2540 0.0716939
2541 0.0716

3713 0.08827897
3714 0.08830006
3715 0.08831868
3716 0.08833217
3717 0.088341825
3718 0.08836229
3719 0.08841884
3720 0.08844739
3721 0.08846834
3722 0.08849374
3723 0.088498466
3724 0.08849984
3725 0.088511616
3726 0.08851189
3727 0.08852716
3728 0.08854085
3729 0.08854414
3730 0.08856591
3731 0.08858357
3732 0.088623144
3733 0.088624924
3734 0.08862903
3735 0.08864375
3736 0.08868141
3737 0.08868682
3738 0.088702016
3739 0.08870893
3740 0.08872598
3741 0.08872735
3742 0.08876884
3743 0.08880225
3744 0.088803686
3745 0.08881717
3746 0.08884442
3747 0.088864416
3748 0.08886674
3749 0.08887831
3750 0.08888454
3751 0.08889645
3752 0.0889311
3753 0.08893863
3754 0.088955745
3755 0.088968
3756 0.088977315
3757 0.08898204
3758 0.08898964
3759 0.088990115
3760 0.08899806
3761 0.08901455
3762 0.089028046
3763 0.08902893
3764 0.08903544
3765 0.08904228
3766 0.0890657
3767 0.089094594
3768 0.08911006
3769 0.08913985
3770 0.089148335
3771 0.08914895
3772 0.08915217
3773 0.08919955
3774 0.0892160

4919 0.1061471
4920 0.10614883
4921 0.10615282
4922 0.10617873
4923 0.10620193
4924 0.106215954
4925 0.10622975
4926 0.1062376
4927 0.1062432
4928 0.10626034
4929 0.106267095
4930 0.10627252
4931 0.106287405
4932 0.106303625
4933 0.10630484
4934 0.10630986
4935 0.10634223
4936 0.10635078
4937 0.106366076
4938 0.10637063
4939 0.10640803
4940 0.10645697
4941 0.10646182
4942 0.10647988
4943 0.10648029
4944 0.10649547
4945 0.106501065
4946 0.106512494
4947 0.10652323
4948 0.10654349
4949 0.106544584
4950 0.10654649
4951 0.10657766
4952 0.10657863
4953 0.10658008
4954 0.10660345
4955 0.10661511
4956 0.10663404
4957 0.10664039
4958 0.10665926
4959 0.10666475
4960 0.10668541
4961 0.106691524
4962 0.106698565
4963 0.106700815
4964 0.106712304
4965 0.106717266
4966 0.10673118
4967 0.10674093
4968 0.10674699
4969 0.10675219
4970 0.10675749
4971 0.10676488
4972 0.10680315
4973 0.106822655
4974 0.1068402
4975 0.10686923
4976 0.10688654
4977 0.10689007
4978 0.10689924
4979 0.106906
4980 0.1069644
4

6187 0.13966441
6188 0.1397274
6189 0.1397476
6190 0.13985418
6191 0.13986082
6192 0.13991407
6193 0.13992117
6194 0.1399847
6195 0.13999325
6196 0.14000718
6197 0.14004122
6198 0.14015162
6199 0.14018475
6200 0.14031518
6201 0.14040546
6202 0.14043732
6203 0.14044906
6204 0.1404761
6205 0.140481
6206 0.14051504
6207 0.14057784
6208 0.14058056
6209 0.14062199
6210 0.14067805
6211 0.14068124
6212 0.14069088
6213 0.14070636
6214 0.14073175
6215 0.14075696
6216 0.14077052
6217 0.14080429
6218 0.14081894
6219 0.14082767
6220 0.1408345
6221 0.14083733
6222 0.14098194
6223 0.14102118
6224 0.14103383
6225 0.14109835
6226 0.14115496
6227 0.14122304
6228 0.14134355
6229 0.14135201
6230 0.14136757
6231 0.14144766
6232 0.14153168
6233 0.14153323
6234 0.14155397
6235 0.14158629
6236 0.14161186
6237 0.14161304
6238 0.14164454
6239 0.14165945
6240 0.1416723
6241 0.14185405
6242 0.14186679
6243 0.1418698
6244 0.14187098
6245 0.14193359
6246 0.14197655
6247 0.14197774
6248 0.14201696
6249 0.14205237
6

7490 0.17446391
7491 0.17448945
7492 0.1744979
7493 0.17457157
7494 0.1745776
7495 0.17459333
7496 0.17460026
7497 0.17466207
7498 0.17473911
7499 0.17474133
7500 0.17484283
7501 0.17498219
7502 0.17498584
7503 0.17510444
7504 0.17520753
7505 0.17522798
7506 0.17525317
7507 0.17526561
7508 0.17528357
7509 0.1753267
7510 0.17540467
7511 0.17544065
7512 0.175536
7513 0.17558165
7514 0.17561148
7515 0.1756971
7516 0.1757313
7517 0.17577136
7518 0.17579746
7519 0.17579968
7520 0.17584154
7521 0.17589182
7522 0.1759321
7523 0.17607753
7524 0.17607774
7525 0.17617996
7526 0.17619641
7527 0.17629714
7528 0.17634785
7529 0.1764126
7530 0.1764375
7531 0.17646797
7532 0.176588
7533 0.17667456
7534 0.1769044
7535 0.17704138
7536 0.17708266
7537 0.1770891
7538 0.17712422
7539 0.17715369
7540 0.17725956
7541 0.17729877
7542 0.1773002
7543 0.17733282
7544 0.1773364
7545 0.17736694
7546 0.17747968
7547 0.17753176
7548 0.17755723
7549 0.17776975
7550 0.17777039
7551 0.1777877
7552 0.17781568
7553 0.17

8733 0.2493636
8734 0.24938245
8735 0.24938679
8736 0.2494883
8737 0.24949779
8738 0.24950047
8739 0.24954155
8740 0.24955186
8741 0.24959396
8742 0.24963383
8743 0.2499066
8744 0.25000635
8745 0.25026098
8746 0.2503351
8747 0.2503739
8748 0.25044248
8749 0.25045148
8750 0.2505198
8751 0.2505789
8752 0.25063744
8753 0.25075722
8754 0.25086838
8755 0.2510763
8756 0.2511867
8757 0.25125566
8758 0.25125718
8759 0.25128508
8760 0.25130528
8761 0.25138196
8762 0.2515062
8763 0.25153327
8764 0.2516566
8765 0.25169072
8766 0.25169173
8767 0.25182167
8768 0.25209004
8769 0.2522401
8770 0.2522694
8771 0.2523232
8772 0.25241855
8773 0.2526828
8774 0.25275558
8775 0.25310493
8776 0.25342387
8777 0.25343427
8778 0.25368747
8779 0.25396848
8780 0.25404432
8781 0.25406018
8782 0.25414693
8783 0.25423044
8784 0.2542415
8785 0.25430596
8786 0.2543415
8787 0.25462422
8788 0.2546511
8789 0.25466472
8790 0.25473988
8791 0.2547442
8792 0.25476292
8793 0.2548667
8794 0.25497442
8795 0.2550624
8796 0.255078

10001 0.38559288
10002 0.38601673
10003 0.38623366
10004 0.3863971
10005 0.38641426
10006 0.38673386
10007 0.38681775
10008 0.3868384
10009 0.3869647
10010 0.38712913
10011 0.38719726
10012 0.38727894
10013 0.3872953
10014 0.387434
10015 0.38748637
10016 0.38767642
10017 0.38842177
10018 0.38858315
10019 0.388629
10020 0.38878608
10021 0.38910335
10022 0.38917008
10023 0.38931364
10024 0.3896144
10025 0.39049807
10026 0.39063936
10027 0.3908283
10028 0.39110407
10029 0.39134192
10030 0.39159724
10031 0.39195278
10032 0.39224344
10033 0.39226773
10034 0.3928294
10035 0.39304337
10036 0.39328122
10037 0.39369786
10038 0.39381453
10039 0.39385718
10040 0.39450568
10041 0.39475757
10042 0.39486113
10043 0.39486602
10044 0.39498967
10045 0.39507276
10046 0.39612898
10047 0.39658698
10048 0.3969222
10049 0.39711025
10050 0.39724293
10051 0.39725906
10052 0.39783818
10053 0.39806002
10054 0.3980768
10055 0.398095
10056 0.39826685
10057 0.3986303
10058 0.3986906
10059 0.3993713
10060 0.4002762

11152 0.6364068
11153 0.6364068
11154 0.6364068
11155 0.6364068
11156 0.6364068
11157 0.6364068
11158 0.6364068
11159 0.6364068
11160 0.6364068
11161 0.6364068
11162 0.6364068
11163 0.6364068
11164 0.6364068
11165 0.6364068
11166 0.6364068
11167 0.6364068
11168 0.6364068
11169 0.6364068
11170 0.6374533
11171 0.6374827
11172 0.63753724
11173 0.6379078
11174 0.6386472
11175 0.6400164
11176 0.64007014
11177 0.64259243
11178 0.6427806
11179 0.6429574
11180 0.6434142
11181 0.64450943
11182 0.644721
11183 0.6473853
11184 0.6474502
11185 0.6474633
11186 0.6481017
11187 0.6485219
11188 0.6492802
11189 0.6504485
11190 0.65053934
11191 0.65084213
11192 0.65097755
11193 0.65177774
11194 0.6538313
11195 0.6545607
11196 0.65475917
11197 0.65476966
11198 0.654776
11199 0.6548287
11200 0.6553451
11201 0.6553959
11202 0.6554943
11203 0.6555956
11204 0.6559162
11205 0.6559807
11206 0.6564186
11207 0.65649736
11208 0.6565675
11209 0.65679944
11210 0.6568523
11211 0.6570306
11212 0.65728587
11213 0.65815

# Building Dataset

In [103]:
random.seed(2020)
micro_set = []

for k, v in features_dict.items():
    if len(v['toxicity']) > 0 and v['toxicity'][0] <= micro_aggression_clean_cutoff and np.mean(v['offense']) >= 0.5 and any([_g != None for _g in v['group']]):
        micro_set.append((k, tuple(v['group'])))
        
print(len(micro_set))
        
micro_score_list = []
for k, v in micro_set:
    micro_score_list.append(features_dict[k]['toxicity'][0])
        
print(np.mean(micro_score_list))

3882
0.20000061591245363


In [25]:
random.seed(2020)
nonmicro_control_set = []

for k, v in features_dict.items():
    if len(v['toxicity']) > 0 and v['toxicity'][0] <= clean_train_clean_cutoff and np.mean(v['offense']) == 0 and v['toxicity'][0] < 0.5: # post check to make sure none is toxic
        nonmicro_control_set.append((k, tuple(v['group'])))
        
print(len(nonmicro_control_set))

nonmicro_control_score_list = []
for k, v in nonmicro_control_set:
    nonmicro_control_score_list.append(features_dict[k]['toxicity'][0])

print(np.mean(nonmicro_control_score_list))

10690
0.1543766123425988


In [117]:
true_general_toxicity_dict = pickle.load(open("/project/toxicity/Resources /true_general_matched_toxicity_dict.pkl", 'rb'))

In [118]:
true_general_toxicity_dict

{"Reddit est quand m√™me un des sites les plus utilis√©s au monde, et non pas un ptit forum sombre perdu dans les abymes de l'Internet.": 0.13116808,
 "It's a little confusing but it's not at the same time.": 0.02028659,
 "SIKE there's sausage and sour cream up in this bitch.": 0.97408956,
 'Well I‚Äôm just ok and I‚Äôm on PS4.': 0.023758141,
 'thanks uwu': 0.03016074,
 'Dude, this looks amazing!': 0.026874341,
 'We have a lot of money rolling off before those big contracts.': 0.07902599,
 'Thats perfect.': 0.051658604,
 "Sure, it's just that their role is nowhere near what you're portraying it as.": 0.06774383,
 'Thats why': 0.030505471,
 'Did you know TopMindsOfReddit has a discord?': 0.066603914,
 'I live in Indianapolis.': 0.03312206,
 'First of all that laptop runs an integrated graphics card.': 0.027267735,
 'Got a membership today.': 0.028582675,
 'No, not at all.': 0.051203173,
 'I asked today and once they are out.': 0.057128157,
 "But he didn't need help paying it, he just di

In [120]:
#toxic set
arbitrary_toxic_cutoff = 0.8

toxic_set = []

for k, v in features_dict.items():
    if len(v['toxicity']) > 0 and v['toxicity'][0] > arbitrary_toxic_cutoff and np.mean(v['offense']) > 0.5 and any([_g != None for _g in v['group']]):
        toxic_set.append((k, tuple(v['group'])))
        
print(len(toxic_set))

3779


In [32]:
#clean set
extended_true_general_toxicity_dict = pickle.load(open("/project/toxicity/Resources /large_true_general_matched_toxicity_dict.pkl", 'rb'))

In [33]:
#clean set 
random.seed(2020)
clean_set = [(k, None) for k, v in extended_true_general_toxicity_dict.items() if v < 0.5]

print(len(clean_set))

ext_general_clean_score_list = []
for k, v in clean_set:
    ext_general_clean_score_list.append(extended_true_general_toxicity_dict[k])

print(np.mean(ext_general_clean_score_list))

25014
0.117418342948237


In [34]:
clean_set

[("Reddit est quand m√™me un des sites les plus utilis√©s au monde, et non pas un ptit forum sombre perdu dans les abymes de l'Internet.",
  None),
 ("It's a little confusing but it's not at the same time.", None),
 ('Well I‚Äôm just ok and I‚Äôm on PS4.', None),
 ('thanks uwu', None),
 ('Dude, this looks amazing!', None),
 ('We have a lot of money rolling off before those big contracts.', None),
 ('Thats perfect.', None),
 ("Sure, it's just that their role is nowhere near what you're portraying it as.",
  None),
 ('Thats why', None),
 ('Did you know TopMindsOfReddit has a discord?', None),
 ('I live in Indianapolis.', None),
 ('First of all that laptop runs an integrated graphics card.', None),
 ('Got a membership today.', None),
 ('No, not at all.', None),
 ('I asked today and once they are out.', None),
 ("But he didn't need help paying it, he just didn't want to pay it and lose his fun money for a while.",
  None),
 ('When you do things right, people won‚Äôt be sure you did anythin

In [127]:
#number for sets
ma_train_size = 2500
ma_adv_size = 100
ma_test_size = 1000
clean_train_size = 8000
clean_test_size = 1000
toxic_train_size = 2500
toxic_test_size = 1000

In [125]:
random.seed(2020)
ma_train = random.sample(micro_set, ma_train_size)
ma_adv = random.sample([e for e in micro_set if e not in ma_train], ma_adv_size)
ma_test = random.sample([e for e in micro_set if e not in ma_train and e not in ma_adv], ma_test_size)
print(len(ma_train), len(ma_adv), len(ma_test))


2500 100 1000


In [126]:
random.seed(2020)
clean_train = random.sample(clean_set, clean_train_size)
clean_test = random.sample([e for e in clean_set if e not in clean_train], clean_test_size)
print(len(clean_train), len(clean_test))

8000 1000


In [129]:
random.seed(2020)
toxic_train = random.sample(toxic_set, toxic_train_size)
toxic_test = random.sample([e for e in toxic_set if e not in toxic_train], toxic_test_size)
print(len(toxic_train), len(toxic_test)) 

2500 1000


In [130]:
random.seed(2020)
nonma_control_train = random.sample(nonmicro_control_set, ma_train_size) 
nonma_control_adv = random.sample([e for e in nonmicro_control_set if e not in nonma_control_train], ma_adv_size)
nonma_control_test = random.sample([e for e in nonmicro_control_set if e not in nonma_control_train and e not in nonma_control_adv], ma_test_size)
print(len(nonma_control_train), len(nonma_control_adv), len(nonma_control_test))


2500 100 1000


In [132]:
random.seed(2020)
nonma_large_train = random.sample(nonmicro_control_set, clean_train_size)
nonma_large_test = random.sample([e for e in nonmicro_control_set if e not in nonma_large_train], clean_test_size)
print(len(nonma_large_train), len(nonma_large_test))

8000 1000


In [135]:
pickle.dump(ma_train, open("/project/my_data/ma_train.pkl", 'wb'))
pickle.dump(ma_adv, open("/project/my_data/ma_adv.pkl", 'wb'))
pickle.dump(ma_test, open("/project/my_data/ma_test.pkl", 'wb'))
pickle.dump(clean_train, open("/project/my_data/clean_train.pkl", 'wb'))
pickle.dump(clean_test, open("/project/my_data/clean_test.pkl", 'wb'))
pickle.dump(toxic_train, open("/project/my_data/toxic_train.pkl", 'wb'))
pickle.dump(toxic_test, open("/project/my_data/toxic_test.pkl", 'wb'))

In [136]:
pickle.dump(nonma_control_train, open("/project/my_data/nonma_control_train.pkl", 'wb'))
pickle.dump(nonma_control_adv, open("/project/my_data/nonma_control_adv.pkl", 'wb'))
pickle.dump(nonma_control_test, open("/project/my_data/nonma_control_test.pkl", 'wb'))
pickle.dump(nonma_large_train, open("/project/my_data/nonma_large_train.pkl", 'wb'))
pickle.dump(nonma_large_test, open("/project/my_data/nonma_large_test.pkl", 'wb'))

In [7]:
data_dir = "/project/my_data/"
ma_train = pickle.load(open(data_dir + "ma_train.pkl", 'rb'))

In [8]:
ma_train

[("Reasons I'm glad I am not Liberal--I don't have to pretend that:\n\n1.Oprah is inspiring\n2.Chelsea is smart/pretty\n3.Hillary even once by accident did a good job at anything\n4.Hollywood/media is correct about anything\n5.BJ Clinton is a Statesman (Ditto: Obama/W/Carter)\n6.There are more than 2 sexes.\n\n",
  ('trans women, trans men', 'trans women, trans men')),
 ('I always like to say my first truly heretical racial thought was riding my bike across the South Side of Chicago in 1992 and thinking "my God this looks just like Africa!"',
  ('black folks',
   'black folks',
   'black folks',
   'black folks',
   'black folks',
   'black folks')),
 ('I love oxymorons Like affordable housing, and African food.',
  ('Africans', 'Africans')),
 ('Hopefully they will get deportation orders and be on the next flight back to Cairo .',
  ('Egypt', 'Egypt')),
 ('What do a Taliban outpost and a Pakistani elementary school have in common? I dunno, I just fly the drone.',
  ('Pakistan',
   'Pak