In [1]:
import os
import pandas as pd
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
try:
    from google.colab import drive
except:
    print("Could not import google colab")

  from .autonotebook import tqdm as notebook_tqdm


Could not import google colab


In [2]:
try:
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/data')
except:
    pass
nta = pd.read_json("NTA_Dataset.jsonl", lines=True)
yta = pd.read_json("YTA_Dataset.jsonl", lines=True)
    

In [3]:
def preprocess(data, remove_judgement = True):
    # in: 2d table with prompts and resolution
    # out: tidied data
    # ends with 5 hashtags
    # remove capitalisation?
    # 
    if remove_judgement:
        data['completion'] = data['completion'].str.replace(r'[N,n,Y,y][T,t][A,a]', '', regex=True)
    data['completion'] = data['completion'].str.replace(r'#', '', regex=True)
    data['completion'] = data['completion'].str.replace(r'\s+', ' ', regex=True)
    return data

In [4]:
data = pd.concat([nta,yta])
data.drop(["prompt"], axis=1, inplace=True)
data = preprocess(data)
data

Unnamed: 0,completion
0,you just pointed out a fact Protip :You can a...
1,If she can't see other people drink how the f...
2,—your brother chose to isolate himself and sc...
3,—You’re only the asshole if you don’t tell yo...
4,—They abused your favor. They should get and ...
...,...
2457,I hope this is fake because if not then you'r...
2458,"for devaluing her work, her technical skills ..."
2459,You had already thrown away a bunch of her st...
2460,I didn't even need to read. You always share ...


In [5]:
nta["completion"] = nta["completion"].str.lower()
print("nta shape:",nta.shape)
true_nta = nta.loc[nta["completion"].str.startswith((' nta',' asdfasfddefinitely nta'))]
print("true nta shape:",true_nta.shape)
false_nta = nta.loc[nta["completion"].str.startswith((' yta', ' definitely yta'))]
print("false nta shape:",false_nta.shape)
yta["completion"] = yta["completion"].str.lower()
print("yta shape:",yta.shape)
true_yta = yta.loc[yta["completion"].str.startswith((' yta',' definitely yta'))]
print("True yta shape:",true_yta.shape)
false_yta = yta.loc[yta["completion"].str.startswith((' nta', ' definitely nta'))]
print("False yta shape:",false_yta.shape)
unidentified_yta = yta.loc[~yta["completion"].str.startswith(' yta')]
print(unidentified_yta["completion"].shape)

nta shape: (5314, 2)
true nta shape: (5314, 2)
false nta shape: (0, 2)
yta shape: (2462, 2)
True yta shape: (2416, 2)
False yta shape: (26, 2)
(47,)


In [6]:
test = data.sample(200)
test

Unnamed: 0,completion
551,"Culture aside, that she can somehow bifurcate..."
1056,Your brother is going to have to talk to them...
2632,. But PLEASE back up all of your documents to...
914,"it sounds like you barely know this woman, wh..."
4701,And I cannot believe they would take a good b...
...,...
2208,". I’m sorry to tell you this, but your boyfri..."
3360,", obviously. Let him know if he'd like to for..."
3254,", this is a common problem though. Whenever I..."
3326,", she scared the shit out of your sister on p..."


In [None]:
test['reason'] = -1
causes = []
for completion in test['completion']:
    print(completion)
    try:
        causes += [int(input())]
    except:
        causes += [-1]
    
    

 Culture aside, that she can somehow bifurcate "Your homosexuality" from you is...weird. You are who you are, and coincidelly you are gay. They are one in the same. That she apparently approves of you and your friendship with her yet disapproves of an essential part of your being is..weird. And , would reasonably make it intolerable.. 


In [None]:
test['reason'] = causes

In [None]:
test

In [None]:
#test.to_json("tagged.json")
# commented out to make sure i don't save over tagged data

In [None]:
with open("tagged.json") as user_file:
    file_contents = json.load(user_file)
test = pd.DataFrame(file_contents)
print(test)
#print(file_contents)

In [39]:
from collections import Counter
counts = Counter(test["reason"])
print(counts)

Counter({1: 39, 11: 23, 6: 22, 0: 19, 3: 19, 7: 18, 2: 12, 9: 12, 4: 12, 8: 8, -1: 6, 5: 6, 10: 4})


In [84]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
categories = ["Irresponsible","Entitled","Stupid","Bigot","Miscommunication","Trauma","Betrayal","Danger","Obsession","Pride","Insecurity","Rude"]
opposites = ["Responsible", "Undemanding","Intelligent","Tolerant","Clear","Healthy Mindset","Trust","Safety","Restraint","Humility","Confidence","Polite"]
accusations_sec_person = ["You were irresponsible","You were entitled","You were stupid","You were bigoted","You miscommunicated","Your trauma caused this","You betrayed this person","You acted dangerously","You were obsessive","You were proud","You were insecure","You were rude"]
accusations_third_person = ["They were irresponsible","They were entitled","They were stupid","They were bigoted","They miscommunicated","Their trauma caused this","They betrayed you","They acted dangerously","They were obsessive","They were proud","They were insecure","They were rude"]
neutral = ["The issue was caused by irresponsiblity","The issue was caused by entitlement","The issue was caused by stupidity","The issue was caused by bigotry","The issue was caused by miscommunication","The issue was caused by trauma","The issue was caused by betrayal","The issue was caused by unsafe actions","The issue was caused by obsession","The issue was caused by pride","The issue was caused by insecurity","The issue was caused by rudeness"]
#["The issue here is irresponsibility","The issue here is entitlement","The issue here is stupidity","The issue here is bigotry","The issue here is miscommunication","The issue here is trauma","The issue here is betrayal","The issue here is a lack of safety","The issue here is pride","The issue here is obsession", "You were irresponsible", "You acted entitled", "You acted stupid", "Your behaviour was bigoted", "You miscommunicated", "You acted this way because of trauma", "You betrayed someone", "Your behaviour was dangerous", "You were obsessive", "Your pride caused this issue"]
sentences = categories + opposites + accusations_sec_person + neutral#accusations_third_person + 
embeddings = np.array(model.encode(sentences))
def generate_reason(string):
    input_embedding = np.array(model.encode(string))
    distances = np.linalg.norm(embeddings-input_embedding, axis=1)
    index = np.argmin(distances)
    listt = int(index/12)
    index = index % 12
    return [index,listt]

In [119]:
test["generated_reason"] = -1
i = 0
matrix = np.zeros((12,12))
lists_used = np.zeros((2,5))
matches=0
gen_reasons = []
for index, row in test.iterrows():
    i+=1
    gen_reas = generate_reason(row["completion"])
    generated_reason = gen_reas[0]
    gen_reasons += [gen_reas[0]]
    my_reason = row["reason"]
    matrix[my_reason,generated_reason]+=1
    
    match = generated_reason==my_reason
    matches+=match
    lists_used[int(match),gen_reas[1]]+=1

print(matches)
print(lists_used)

53
[[41. 20. 67. 19.  0.]
 [16.  9. 20.  8.  0.]]


In [110]:
for i in range(12):
    for j in range(12):
        if i!=j:
            if matrix[i,j]>3:
                print(categories[i]+" -> "+categories[j]+"  =  "+str(matrix[i,j]))
        else:
            if matrix[i,j]<4:
                print("True " + categories[i] + "  =  " + str(matrix[i,j]))

Irresponsible -> Insecurity  =  5.0
Entitled -> Irresponsible  =  5.0
Entitled -> Insecurity  =  9.0
Entitled -> Rude  =  7.0
True Stupid  =  1.0
True Bigot  =  3.0
Bigot -> Insecurity  =  6.0
True Miscommunication  =  0.0
True Trauma  =  1.0
Betrayal -> Entitled  =  4.0
Betrayal -> Insecurity  =  4.0
Danger -> Trauma  =  4.0
True Obsession  =  3.0
True Pride  =  2.0
Rude -> Insecurity  =  8.0


In [167]:
print(test)

                                             completion  reason   
4571   because it seems like a reasonable request, b...       1  \
1509   . You lied to her so she has the right to be ...       6   
5028   - Sounds like the owner just wanted to get ri...       0   
1417   . This isn't everyday shit we're talking abou...       7   
2247   . I would have been fascinated too. At 6 week...       8   
...                                                 ...     ...   
2083   . Is your cousin an actual child? I’d probabl...      11   
255    The school had a requirement and he failed to...       1   
1593   . Tell her she needs to pay a company to box ...       0   
649    . Thank you for standing up for your daughter...       3   
1965   . I'm sure it's annoying, yeah. But you offer...       6   

      generated_reason  
4571                11  
1509                10  
5028                10  
1417                 5  
2247                 8  
...                ...  
2083                

In [115]:
print(matrix)

[[ 6.  0.  0.  1.  0.  1.  1.  2.  1.  0.  5.  3.]
 [ 5. 11.  1.  1.  3.  1.  2.  1.  0.  0.  9.  7.]
 [ 0.  1.  1.  1.  3.  0.  1.  0.  1.  1.  1.  3.]
 [ 0.  0.  0.  3.  1.  1.  0.  1.  0.  0.  6.  1.]
 [ 3.  2.  0.  0.  0.  0.  1.  1.  1.  0.  2.  2.]
 [ 2.  0.  0.  0.  1.  1.  1.  0.  1.  0.  0.  0.]
 [ 1.  4.  0.  0.  0.  1. 11.  0.  0.  0.  4.  1.]
 [ 3.  2.  0.  0.  0.  4.  2.  5.  1.  0.  2.  0.]
 [ 0.  1.  0.  0.  1.  1.  1.  1.  3.  0.  0.  0.]
 [ 2.  3.  0.  0.  0.  1.  0.  0.  2.  2.  1.  1.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  4.  0.]
 [ 2.  1.  0.  1.  2.  3.  3.  2.  1.  0.  8.  6.]]


In [179]:
gen_reasons_unproc = []
data = pd.concat([nta,yta])
data.drop(["prompt"], axis=1, inplace=True)
data = preprocess(data, remove_judgement=False)
data["processed"] = data['completion'].str.replace(r'[N,n,Y,y][T,t][A,a]', '', 1, regex=True) 
data['processed'] = data['processed'].str.replace(r'\s+', ' ', regex=True)
unprocessed_list = []
improved = 0
worse = 0

same = 0
for index, row in test.iterrows():
    processed = row["completion"]
    try:
        unprocessed = data.loc[data["processed"]==processed, "completion"].values[0]
    except:
        print(processed)
    unprocessed_list += [unprocessed]
    new_reason = generate_reason(unprocessed)[0]
    if new_reason == row["generated_reason"]:
        same+=1
    else:
        if new_reason==row["reason"]:
            improved+=1
        elif row["generated_reason"]==row["reason"]:
            worse+=1
print(improved)
print(worse)
print(same)
    
    #gen_reasons

 (but you’re parents and brother are). It’s your apartme your parents can’t force your boyfriend out. Also your brother should be respectful of you guys, especially since you’re cohabitating 
 You're being insecure and putting far too much stock into a piece of art. She's told you that theres no romantic attachme she has given valid reasons why she wants it, yet you are still bent out of shape. 
 . Do not go ahead with this. You aren't a murderer if you don't donate your kidney. You can coct the doctor and tell them that you don't want to do it. They'll tell your dad that you aren't a good candidate. They will take the blame. Please have them take the blame!! One of his other relatives can donate their kidney. If they're not a match, they can be tested to match someone else and the kidney donations will be exchanged. 
 . Wedding dress shopping is stressful. I know from experience. The more opinions, the more stressful it gets. I also do think it is a special mother-daughter mome that m

In [174]:
test["unprocessed"]=unprocessed_list
test["more_processed"] = test["completion"].str.replace(punctuation,' ')
test["more_processed"]=test["more_processed"].str.replace(r'\s+', ' ', regex=True)
test

Unnamed: 0,completion,reason,generated_reason,unprocessed,more_processed
4571,"because it seems like a reasonable request, b...",1,11,NTA because it seems like a reasonable reques...,"because it seems like a reasonable request, b..."
1509,. You lied to her so she has the right to be ...,6,10,YTA. You lied to her so she has the right to ...,. You lied to her so she has the right to be ...
5028,- Sounds like the owner just wanted to get ri...,0,10,NTA - Sounds like the owner just wanted to ge...,- Sounds like the owner just wanted to get ri...
1417,. This isn't everyday shit we're talking abou...,7,5,NTA. This isn't everyday shit we're talking a...,. This isn't everyday shit we're talking abou...
2247,. I would have been fascinated too. At 6 week...,8,8,NTA. I would have been fascinated too. At 6 w...,. I would have been fascinated too. At 6 week...
...,...,...,...,...,...
2083,. Is your cousin an actual child? I’d probabl...,11,6,NTA. Is your cousin an actual child? I’d prob...,. Is your cousin an actual child? I’d probabl...
255,The school had a requirement and he failed to...,1,1,NTA The school had a requirement and he faile...,The school had a requirement and he failed to...
1593,. Tell her she needs to pay a company to box ...,0,0,NTA. Tell her she needs to pay a company to b...,. Tell her she needs to pay a company to box ...
649,. Thank you for standing up for your daughter...,3,5,NTA. Thank you for standing up for your daugh...,. Thank you for standing up for your daughter...


In [113]:
examine = test.loc[test["reason"].isin([11])]
original_prompts = 
for index, row in examine.iterrows():
    print(row["completion"])
    print(row["reason"])

 . You're not obligated to give her a place to stay, but she's well within her right to leave her husband for any reason, up to and including "I didn't want to be married to him any more." Blaming her choices on "pregnancy hormones" is really insulting. What's more, I bet dollars to doughnuts this was a "straw that broke the camel's back" situation, and your sister has been putting up with bullshit from her husband for a while. 
11
 , your sister is being hypocritical. She’s teasing you for “doing drugs” when the pen wasn’t in your possession (and it seems like you didn’t use it either) while she’s out here ACTUALLY doing drugs. 
11
 . He is being astonishingly insensitive. He can perfectly well find someone else to dump his insecurities on who he won't be implicitly insulting with every word. 
11
 for repeatedly saying your SIL is "not actually family" when she clearly is. What's your issue with her? 
11
 . you’re basically telling your girlfriend she’d look better if she was thinner,