In [27]:
import json
import re
import unicodedata

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

In [168]:
def make_array(string):
    """Parse the printed form of a 2x2 integer matrix into a NumPy array.

    The text is split on any run of whitespace (spaces, tabs, newlines).
    Inside each token every non-digit character (brackets, commas, ...) is
    dropped and the remaining digits are read as one integer; tokens without
    digits are ignored.

    Parameters
    ----------
    string : str
        Text such as the output of ``print`` on a 2x2 integer array.

    Returns
    -------
    numpy.ndarray
        Array of shape (2, 2): the first two numbers form row 0, the last
        two form row 1.

    Raises
    ------
    ValueError
        If the text does not contain exactly four numbers.
    """
    values = []
    # split() without an argument also breaks on newlines, so two rows that
    # are separated only by a line break are not glued into a single number.
    for token in string.split():
        digits = re.sub("[^0-9]", "", token)
        if digits:
            values.append(int(digits))
    if len(values) != 4:
        raise ValueError(
            "expected exactly 4 integers for a 2x2 array, found %d" % len(values)
        )
    return np.array([values[:2], values[2:]])

def array_fromgroup(readings, side, name_to_index=None):
    """Sum the per-name counts of one group into a 2x2 array.

    Parameters
    ----------
    readings : sequence
        Indexable with 0 and 1; each element has the keys 'cor' and 'inc',
        whose values map an integer index to a count.
    side : iterable of str
        Names that make up the group.
    name_to_index : mapping, optional
        Maps each name to the index used inside ``readings``.  When omitted,
        the module-level ``subs`` is looked up at call time.  Passing the
        mapping explicitly keeps the result independent of whatever ``subs``
        is currently bound to.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (2, 2) where ``out[i, 0]`` is the sum of
        ``readings[i]['cor']`` and ``out[i, 1]`` the sum of
        ``readings[i]['inc']`` over the group's indices.

    Raises
    ------
    KeyError
        If a name is missing from the mapping or an index is missing from
        ``readings``.
    """
    if name_to_index is None:
        name_to_index = subs
    # Index with [] rather than .get(): an unknown name fails here, with the
    # name in the message, instead of later as a lookup of None.
    keys = [name_to_index[name] for name in side]
    array = np.zeros((2, 2), dtype=int)
    for row in (0, 1):
        for col, outcome in enumerate(('cor', 'inc')):
            array[row, col] = sum(readings[row][outcome][key] for key in keys)
    return array

def cohend(d1, d2):
    """Return Cohen's d for two samples using the pooled standard deviation.

    Parameters
    ----------
    d1, d2 : sequence of numbers
        The two samples.

    Returns
    -------
    float
        ``(mean(d1) - mean(d2)) / s`` where ``s`` pools the two sample
        variances (``ddof=1``) weighted by their degrees of freedom.
    """
    size1 = len(d1)
    size2 = len(d2)
    # Unbiased sample variances.
    var1 = np.var(d1, ddof=1)
    var2 = np.var(d2, ddof=1)
    pooled_dof = size1 + size2 - 2
    pooled_std = np.sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / pooled_dof)
    mean1 = np.mean(d1)
    mean2 = np.mean(d2)
    return (mean1 - mean2) / pooled_std

In [159]:
def return_F1(arr):
    """Return the F1 score of a 2x2 count array.

    ``arr[0, 0]`` is the count of correct positives; the two off-diagonal
    cells are averaged and added to it in the denominator.
    """
    hits = arr[0, 0]
    mean_errors = (arr[1, 0] + arr[0, 1]) / 2
    return hits / (hits + mean_errors)

def return_precision(arr):
    """Return ``arr[0, 0]`` divided by the sum of column 0 of a 2x2 count array."""
    hits = arr[0, 0]
    column_total = hits + arr[1, 0]
    return hits / column_total

def return_recall(arr):
    """Return ``arr[0, 0]`` divided by the sum of row 0 of a 2x2 count array."""
    hits = arr[0, 0]
    row_total = hits + arr[0, 1]
    return hits / row_total

def return_accuracy(arr):
    """Return the share of counts on the main diagonal of a 2x2 count array."""
    on_diagonal = arr[0, 0] + arr[1, 1]
    everything = arr.sum()
    return on_diagonal / everything

def return_baseline(arr):
    """Print the baseline F1 score implied by the share of row-0 counts.

    With ``q`` the fraction of all counts that lie in row 0, the printed value
    is ``2q / (q + 1)``.  Despite its name, the function prints the value and
    returns ``None``.
    """
    row0_share = (arr[0, 0] + arr[0, 1]) / arr.sum()
    baseline_f1 = 2 * row0_share / (row0_share + 1)
    print(baseline_f1)

In [250]:
# 3 pre-train epochs
string = """[[313 267]
 [377 358]]"""

# Parse the matrix once, then print F1, precision and recall (in that order).
confusion = make_array(string)
for metric in (return_F1, return_precision, return_recall):
    print(metric(confusion))

0.49291338582677163
0.45362318840579713
0.5396551724137931


In [62]:
# 5 pre-train epochs
string = """[[ 73825  73831]
 [489195 599071]]"""

# Parse the matrix once, then print F1, precision and recall (in that order).
confusion = make_array(string)
for metric in (return_F1, return_precision, return_recall):
    print(metric(confusion))

0.20775993561060174
0.13112322830450074
0.4999796825052826


In [262]:
# Same matrix text as the "5 pre-train epochs" cell, parsed into `arr` so the
# following cells can inspect its row totals.
string = """[[ 73825  73831]
 [489195 599071]]"""
arr = make_array(string)

In [263]:
# Totals of the 2x2 count array `arr`:
#   P - sum of row 0 (the denominator used by return_recall)
#   T - sum of all four cells
#   N - sum of row 1
P = arr[0,0] + arr[0,1]
T = arr.sum()
N = arr[1,0] + arr[1,1]

In [265]:
# Display the row-0 total.
P

147656

In [264]:
# Display half of the grand total.
T/2

617961.0

In [266]:
# Display half of the row-0 total (the value placed in both row-0 cells of `ex`).
P/2

73828.0

In [267]:
# Count array in which each row total is split evenly across the two columns.
ex = np.array([[P/2,P/2],[N/2,N/2]])

In [268]:
# Precision and recall of the evenly split array `ex`.
for metric in (return_precision, return_recall):
    print(metric(ex))

0.11947032256080885
0.5


In [254]:
# Baseline F1 for a share q of positive examples: 2q / (q + 1), i.e. the F1
# obtained with precision q and recall 1 (same expression as return_baseline).
# Original note: ratio, get baseline F1 score (random guess) --> if majority
# choice, baseline F1 is 0.
# q can be derived from a matrix string instead of being typed in:
#q = (make_array(string)[0,0]+ make_array(string)[0,1]) / make_array(string).sum()
# NOTE(review): the origin of the hard-coded 0.18 is not shown in this notebook -- confirm.
q = 0.18
2*q/(q+1)

0.3050847457627119

In [49]:
# Baseline accuracy of always predicting the majority class (a random guess
# would score 0.5).
# The share of row-0 (positive) counts is recomputed from `arr` instead of
# being read from `q`: the cell above pins q to 0.18, while the output recorded
# for this cell (0.8805...) corresponds to the share in `arr`, i.e. it was
# produced from a stale value of q.
positive_share = (arr[0,0] + arr[0,1]) / arr.sum()
max(positive_share, 1 - positive_share)

0.8805296774391912

In [None]:
# Name groups passed as `side` to array_fromgroup; every name listed here is a
# key of the name -> index dict `subs` defined in the next cell.
left = ['Liberal', 'SocialDemocracy', 'socialism', 'alltheleft', 'neoliberal', 'democrats']
right = ['Libertarian', 'Conservative', 'Republican']
alt_right = ['The_Donald']

In [114]:
# subs: name -> integer index; the indices are the keys of the count dicts below.
# NOTE: `subs` is rebound further down to a word-replacement table read from
# "sex_subwords.txt"; array_fromgroup needs this version to be the one bound.
subs =  {'politics': 0, 'The_Donald': 1, 'news': 2, 'neoliberal': 3, 'unitedkingdom': 4, 'canada': 5, 'europe': 6, 'australia': 7, 'india': 8, 'Libertarian': 9, 'Conservative': 10, 'ireland': 11, 'newzealand': 12, 'teenagers': 13, 'democrats': 14, 'socialism': 15, 'MensRights': 16, 'TwoXChromosomes': 17, 'Republican': 18, 'Liberal': 19, 'uspolitics': 20, 'SocialDemocracy': 21, 'alltheleft': 22, 'feminisms': 23}
# readings: a 2-tuple of dicts, each with 'cor' and 'inc' maps from index to
# count.  array_fromgroup puts readings[0] into row 0 and readings[1] into
# row 1, with 'cor' in column 0 and 'inc' in column 1.
readings = {'cor': {0: 40980, 1: 14666, 2: 2877, 3: 2939, 4: 2963, 5: 2233, 6: 2222, 7: 1916, 8: 1020, 9: 357, 10: 1349, 11: 964, 12: 1205, 13: 569, 14: 265, 15: 87, 16: 253, 17: 171, 18: 63, 19: 54, 20: 27, 21: 4, 22: 7, 23: 0}, 'inc': {0: 28415, 1: 10867, 2: 3586, 3: 2030, 4: 7017, 5: 3796, 6: 4466, 7: 2502, 8: 1766, 9: 431, 10: 971, 11: 1310, 12: 2063, 13: 330, 14: 150, 15: 128, 16: 324, 17: 182, 18: 45, 19: 42, 20: 28, 21: 10, 22: 3, 23: 3}}, {'cor': {0: 311040, 1: 77122, 2: 27455, 3: 20235, 4: 23528, 5: 24733, 6: 20422, 7: 18481, 8: 10213, 9: 9946, 10: 7903, 11: 5927, 12: 4748, 13: 2734, 14: 1419, 15: 2397, 16: 1059, 17: 928, 18: 412, 19: 234, 20: 224, 21: 125, 22: 27, 23: 5}, 'inc': {0: 339272, 1: 70961, 2: 14549, 3: 22727, 4: 8840, 5: 11205, 6: 6596, 7: 9712, 8: 4937, 9: 7059, 10: 6531, 11: 3484, 12: 2222, 13: 3518, 14: 2055, 15: 1135, 16: 565, 17: 534, 18: 348, 19: 262, 20: 242, 21: 152, 22: 40, 23: 3}}

In [160]:
# Metrics for the `left` group: F1, precision, recall, accuracy, then the
# baseline F1 (printed by return_baseline itself).
left_counts = array_fromgroup(readings,left)
for metric in (return_F1, return_precision, return_recall, return_accuracy):
    print(metric(left_counts))
return_baseline(left_counts)

0.20028646455001192
0.12074982909365668
0.5868158769015562
0.5258902825198578
0.1837547794235774


In [161]:
# Metrics for the `right` group: F1, precision, recall, accuracy, then the
# baseline F1 (printed by return_baseline itself).
right_counts = array_fromgroup(readings,right)
for metric in (return_F1, return_precision, return_recall, return_accuracy):
    print(metric(right_counts))
return_baseline(right_counts)

0.15219822765206917
0.08831752371442836
0.5500621890547264
0.44351263588874773
0.16649840801428903


In [162]:
# Metrics for the `alt_right` group: F1, precision, recall, accuracy, then the
# baseline F1 (printed by return_baseline itself).
alt_right_counts = array_fromgroup(readings,alt_right)
for metric in (return_F1, return_precision, return_recall, return_accuracy):
    print(metric(alt_right_counts))
return_baseline(alt_right_counts)

0.2500149163406381
0.1597812350198283
0.5743939215916657
0.4931976315546954
0.25642107165991296


In [141]:
# Metrics keyed as results[seed][group][metric name]; starts with seed 1337 only.
results = {
    1337: {
        'LEFT': {
            'F1': 0.1691500874053109,
            'precision': 0.10052770448548813,
            'recall': 0.5329603077461095,
            'accuracy': 0.47028853468254106,
        },
        'RIGHT': {
            'F1': 0.12877215391984673,
            'precision': 0.07386884044696831,
            'recall': 0.5015547263681592,
            'accuracy': 0.38370746858675703,
        },
        'ALT': {
            'F1': 0.21937394177838487,
            'precision': 0.13864190893676853,
            'recall': 0.5252026788861474,
            'accuracy': 0.4502983595981937,
        },
    },
}

In [142]:
string = """
42
LEFT
0.20239053324987333
0.1219906575637801
0.5936352509179926
0.526615599624958
RIGHT
0.14404479131319986
0.08339882121807465
0.5279850746268657
0.43018494987999434
ALT
0.2511975457809541
0.1607761835812382
0.574041436572279
0.4966880932632937

97
LEFT
0.2079340046099721
0.12578431732286355
0.599405490470362
0.5379906947122614
RIGHT
0.1587217514124294
0.09248971193415638
0.5590796019900498
0.46180996752788367
ALT
0.27637767007718544
0.17927043673664234
0.6030235381662946
0.5356073172979449

9
LEFT
0.2172190312375219
0.13175348386615993
0.6182899108235705
0.5491535018663647
RIGHT
0.16329248801278637
0.09524549409571162
0.5718283582089553
0.4678526048284625
ALT
0.2758755001593428
0.17822053962553328
0.6102690635647985
0.528845267717261

7
LEFT
0.20028646455001192
0.12074982909365668
0.5868158769015562
0.5258902825198578
RIGHT
0.15219822765206917
0.08831752371442836
0.5500621890547264
0.44351263588874773
ALT
0.2500149163406381
0.1597812350198283
0.5743939215916657
0.4931976315546954
}"""

In [145]:
# Parse the pasted per-seed metrics held in `string` into `results`.
# Layout per seed: one seed line, then for each of three groups a name line
# followed by F1, precision, recall and accuracy (one value per line).
metric_names = ('F1', 'precision', 'recall', 'accuracy')
side_size = 1 + len(metric_names)     # group name + its metric values
group_size = 1 + 3 * side_size        # seed line + three groups

# Drop blank separator lines and the stray closing "}" wherever they occur,
# rather than removing exactly four empty strings and slicing off the last
# line; this keeps working when seeds are added to or removed from the text.
lines = [line.strip() for line in string.split('\n')]
lines = [line for line in lines if line not in ('', '}')]
if len(lines) % group_size:
    raise ValueError(
        "expected a multiple of %d non-blank lines, found %d" % (group_size, len(lines))
    )

groups = [lines[start:start + group_size] for start in range(0, len(lines), group_size)]
for group in groups:
    seed = int(group[0])
    results[seed] = {}
    body = group[1:]
    sides = [body[start:start + side_size] for start in range(0, len(body), side_size)]
    for side in sides:
        # side[0] is the group name; the remaining entries follow metric_names order.
        results[seed][side[0]] = dict(zip(metric_names, map(float, side[1:])))
    

In [164]:
# Add 'd_F1' = F1 minus a fixed per-group baseline.  The constants equal the
# values printed by return_baseline for the left, right and alt_right groups.
baseline_f1 = {
    'LEFT': 0.1837547794235774,
    'RIGHT': 0.16649840801428903,
    'ALT': 0.25642107165991296,
}
for seed in results.keys():
    for side_name, baseline in baseline_f1.items():
        side_metrics = results[seed][side_name]
        side_metrics['d_F1'] = side_metrics['F1'] - baseline

In [171]:
def print_analysis(measure):
    """Print summary statistics and pairwise comparisons for one measure.

    Reads the module-level ``results`` (seed -> group -> measure -> value) and
    prints, in order: the measure name; mean and standard deviation across
    seeds for LEFT, RIGHT and ALT; then, for each pair of groups, the result
    of ``ttest_ind`` and Cohen's d from ``cohend``.

    Note that ``np.std`` is called with its default ``ddof`` here, whereas
    ``cohend`` uses ``ddof=1`` for its variances.

    Parameters
    ----------
    measure : str
        Key to read from every ``results[seed][group]`` dict.
    """
    print(measure)
    seeds = list(results.keys())
    left_vals = [results[seed]['LEFT'][measure] for seed in seeds]
    right_vals = [results[seed]['RIGHT'][measure] for seed in seeds]
    alt_vals = [results[seed]['ALT'][measure] for seed in seeds]

    for sample in (left_vals, right_vals, alt_vals):
        print(np.mean(sample), np.std(sample))

    comparisons = (
        ("left, right", left_vals, right_vals),
        ("left, alt", left_vals, alt_vals),
        ("right, alt", right_vals, alt_vals),
    )
    for label, first, second in comparisons:
        print(label, ttest_ind(first, second), cohend(first, second))

In [172]:
# Run the analysis for every measure, separated by one blank line.
for position, measure_name in enumerate(('F1', 'precision', 'recall', 'accuracy', 'd_F1')):
    if position:
        print()
    print_analysis(measure_name)

F1
0.19939602421053806 0.01621809699314721
0.1494058824620663 0.01218123533589448
0.25456791482730107 0.020978005331480247
left, right Ttest_indResult(statistic=4.929210705744553, pvalue=0.0011505821681812183) 3.1175065794077628
left, alt Ttest_indResult(statistic=-4.161388570386083, pvalue=0.0031591908567275818) -2.6318932222823883
right, alt Ttest_indResult(statistic=-8.670233368402927, pvalue=2.4362569888832412e-05) -5.483537057869404

precision
0.12016119846638969 0.010537248587427717
0.08666407828186787 0.007545761745026351
0.16333806078000215 0.014862320951622878
left, right Ttest_indResult(statistic=5.1691466677341635, pvalue=0.0008540291054577431) 3.269255405901913
left, alt Ttest_indResult(statistic=-4.739831790975053, pvalue=0.0014642728630875246) -2.9977328371112586
right, alt Ttest_indResult(statistic=-9.200061082051114, pvalue=1.5761021524632913e-05) -5.8186295263909535

recall
0.5862213673719182 0.02861563266560769
0.5421019900497512 0.024813822488492853
0.577386127756237

In [None]:
"""
1337- 

3 epochs-56.4% accuracy
0.2073090288002827
0.13245055540503928
0.47677033103971395

5 epochs-51.322% accuracy
0.21086334898606993
0.13075624182559978
0.5443666359646746

42-

3-53.02%
0.2100344914383678
0.13141974667508927
0.5227285040905889

5-51.402%
0.21096757073457817
0.13086852355884415
0.5438112911090643

97-

3-50.298%
0.2102938302839571
0.12978254798415761
0.553922630980116

5-56.724%
0.20588244027734473
0.1318452709204106
0.46956439291325786

9-

3-50.268%
0.2126852862361724
0.13114743739613413
0.562252803814271

5-54.45%
0.20775993561060174
0.13112322830450074
0.4999796825052826


7-

3-52.474%
0.20811921336863504
0.1299205574443734
0.522775911578263

5-62.182%
0.20029189486993912
0.13399923534973593
0.3964010944357155

"""

In [261]:
# Third value of each "5 epochs" entry in the notes cell above, in the order
# seeds 1337, 42, 97, 9, 7.  The analysis cells print F1, precision, recall in
# that order, so these are the recall values.
data = [0.5443666359646746,0.5438112911090643,0.46956439291325786,0.4999796825052826,0.3964010944357155]
# Mean and standard deviation (np.std default, i.e. ddof=0) across the five seeds.
print(np.mean(data), np.std(data))

0.4908246193855989 0.05500172614386969


In [257]:
# Accuracy (%) of the "5 epochs" entries in the notes cell above, in the order
# seeds 1337, 42, 97, 9, 7.
accs = [51.322, 51.402, 56.724, 54.45, 62.182]
# Mean and standard deviation (np.std default, i.e. ddof=0) across the five seeds.
np.mean(accs), np.std(accs)

(55.21600000000001, 4.0273020249293445)

In [166]:
# First value (F1) of each "3 epochs" entry in the notes cell above, in the
# order seeds 7, 9, 97, 42, 1337.  The list is named once so that the mean and
# the standard deviation are guaranteed to be computed over the same values.
f1_3_epochs = [0.20811921336863504, 0.2126852862361724, 0.2102938302839571, 0.2100344914383678, 0.2073090288002827]
np.mean(f1_3_epochs), np.std(f1_3_epochs)

(0.20968837002548302, 0.0018757081974875326)

In [167]:
# Accuracy (%) of the "3 epochs" entries in the notes cell above, in the order
# seeds 7, 9, 97, 42, 1337.  Rebinds `accs` from the 5-epoch cell.
accs = [52.474, 50.268, 50.298, 53.02, 56.4]
# Mean and standard deviation (np.std default, i.e. ddof=0) across the five seeds.
np.mean(accs), np.std(accs)

(52.492, 2.249950399453285)

In [173]:
import json

In [180]:
# Load the dump: one JSON document per line.
data = []
# `with` closes the file handle even if a line fails to parse.
with open("Downloads/microaggressions_v1.json", "r") as handle:
    for line in handle:
        # A blank line (for example a trailing newline at the end of the
        # file) is skipped instead of being passed to json.loads.
        if line.strip():
            data.append(json.loads(line))


In [222]:
import unicodedata
def unicodeToAscii(s):
    """Remove combining marks from ``s``.

    The string is NFD-decomposed and every character whose Unicode category
    is 'Mn' is dropped; all other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lower-case, strip and simplify a piece of text.

    After lower-casing, stripping and removing combining marks, the following
    regex rewrites are applied in order:

    1. the whole word "you" becomes "name";
    2. every occurrence of "your" becomes "name s" (the pattern has no word
       boundary, so it also matches inside longer words such as "yours");
    3. newlines become spaces;
    4. ".", "!" and "?" are padded with a space on each side;
    5. every run of characters other than letters and ".!?" becomes one space;
    6. " re " and " ve " become " s ".

    A further rewrite of digit runs to "NUM" existed in the original code but
    was commented out; it is not applied.
    """
    text = unicodeToAscii(s.lower().strip())
    rewrites = (
        (r"\byou\b", "name"),
        ("your", "name s"),
        ("\n", " "),
        (r"([.!?])", r" \1 "),
        (r"[^a-zA-Z.!?]+", r" "),
        (" re ", " s "),
        (" ve ", " s "),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text

In [244]:
# Word -> replacement table read from "word:translation" lines.
# Whitespace around both columns is stripped so that a line written with a
# space after the colon does not carry that space into the replacement text
# (the unstripped table contained 'mrs': ' title').
# NOTE: this rebinds `subs`, the name array_fromgroup looks up for its
# name -> index map; run the group-metric cells before this one.
subword_series = pd.read_csv("sex_subwords.txt",sep=":",names=['word','translation'],index_col=0)['translation']
subword_series.index = subword_series.index.str.strip()
subs = subword_series.str.strip().to_dict()

In [246]:
# Display the whole word -> replacement dict for inspection.
subs

{'congresswoman': 'congressperson',
 'congressman': 'congressperson',
 'congresswomen': 'congresspeople',
 'congressmen': 'congresspeople',
 'women': 'people',
 'men': 'people',
 'woman': 'person',
 'man': 'person',
 'she': 'they',
 'her': 'them',
 'hers': 'theirs',
 'he': 'they',
 'his': 'theirs',
 'him': 'them',
 'mrs.': 'title',
 'ms': 'title',
 'mr.': 'title',
 'mr': 'title',
 'mrs': ' title',
 'miss': 'title',
 'madam': 'title',
 'mam': 'title',
 "ma'am": 'title',
 'sir': 'title',
 'girl': 'child',
 'boy': 'child',
 'girls': 'children',
 'boys': 'children',
 'male': 'gender',
 'female': 'gender',
 'ladies': 'people',
 'lady': 'person',
 'gentleman': 'person',
 'gentlemen': 'people',
 'guy': 'person',
 'lad': 'person',
 'fella': 'person',
 'fellow': 'person',
 'bloke': 'person',
 'fellows': 'people',
 'blokes': 'people',
 'fellas': 'people',
 'lads': 'people',
 'gal': 'person',
 'gurl': 'person',
 'gurls': 'people',
 'guys': 'people',
 'gals': 'people',
 'actor': 'actor',
 'actress

In [245]:
# Build parallel lists from the records that have a quote: the normalised
# quote text, and a label that is 'female' when the record's tags contain
# 'gender' and 'male' otherwise.
# NOTE(review): the label names presumably match classes expected downstream -- confirm.
labels = []
texts = []
for pt in data:
    quote = pt['quote']
    if quote is None:
        continue
    texts.append(normalizeString(quote))
    labels.append('female' if 'gender' in pt['tags'] else 'male')

In [247]:
# Replace every space-separated word that has an entry in `subs` by its
# replacement; all other words are kept as they are.
new_texts = [
    ' '.join(subs.get(word, word) for word in text.split(' '))
    for text in texts
]

In [249]:
import pandas as pd
# Write one row per quote: row index, normalised text, label (tab-separated,
# no header row).
# NOTE(review): this saves `texts`, not the word-substituted `new_texts` -- confirm intended.
text_column = pd.Series(texts)
label_column = pd.Series(labels)
pd.concat([text_column, label_column], axis=1).to_csv("microaggressions.tsv", sep="\t", header=False)

In [237]:
# Print every record that has a quote (produces one line per record).
for pt in data:
    if pt['quote'] is None:
        continue
    print(pt)

{'id': 168667240562, 'type': 'quote', 'quote': '"I\'m probably such a racist, but a black man dressed as Santa is just wrong."', 'text': '(via microaggressions)', 'transcript': None, 'media_url': None, 'permalink': 'http://www.microaggressions.com/post/168667240562/im-probably-such-a-racist-but-a-black-man', 'tags': [], 'n_hearts': 37, 'n_comments': 0, 'time_ago': '1 month ago'}
{'id': 164858087247, 'type': 'quote', 'quote': '"Yeah, but you\'re not that kind of Native."', 'text': 'This has happened more than once.  A person complains about the lazy "Natives" downtown, suggesting they are all a bunch of \'huffers\' exploiting the system.  I respond by pointing out that I\'m Metis (I have pale skin).', 'transcript': None, 'media_url': None, 'permalink': 'http://www.microaggressions.com/post/164858087247/yeah-but-youre-not-that-kind-of-native', 'tags': ['race', 'ethnicity'], 'n_hearts': 55, 'n_comments': 0, 'time_ago': '5 months ago'}
{'id': 164821946327, 'type': 'quote', 'quote': '"CAN Y

In [255]:
# Number of whitespace-separated tokens in each normalised text.
lens = [len(text.split()) for text in texts]

# Average token count per text.
np.mean(lens)

18.665399239543728