# Final Project: Can AI be Ethical?
Khushi Magiawala, Rahul Rajan, Shruthi Sundar

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
try:
 from google.colab import drive
  IN_COLAB=True
except:
 IN_COLAB=False

if IN_COLAB:
 print("We're running Colab")

IndentationError: unexpected indent (<ipython-input-4-2f2224ff3fb6>, line 3)

In [6]:
%pip install jsonlines


The following command must be run outside of the IPython shell:

    $ pip install jsonlines

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [1]:
import jsonlines
import pandas as pd

def prepare_data(filename):
  df = pd.DataFrame()
  def rename_field(obj):
    key = 'immoral_action' if 'immoral_action' in obj else 'moral_action'
    obj['action'] = obj.pop(key)
    # print(obj)
    return obj
  with jsonlines.open(filename) as f:
      data = [rename_field(x) for x in f.iter()]
      df = pd.DataFrame(data)
  print(df['label'])
  return df
    

In [22]:
train_df = prepare_data('train.jsonl')
test_df = prepare_data('test.jsonl')
dev_df = prepare_data('dev.jsonl')

train_df['label'].apply(lambda x: 0 if x == '0' else 1)
test_df['label'].apply(lambda x: 0 if x == '0' else 1)
dev_df['label'].apply(lambda x: 0 if x == '0' else 1)
print(train_df)

0        1
1        0
2        1
3        0
4        1
        ..
19995    0
19996    1
19997    0
19998    1
19999    0
Name: label, Length: 20000, dtype: object
0       1
1       0
2       1
3       0
4       1
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: label, Length: 2000, dtype: object
0       1
1       0
2       1
3       0
4       1
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: label, Length: 2000, dtype: object
                                    ID  \
0      37XITHEISX7HXBGAFQTNMO7R7TSRCQ1   
1      37XITHEISX7HXBGAFQTNMO7R7TSRCQ0   
2      3TDXMTX3CCSFOV4QFCMBMWEQPJGI6K1   
3      3TDXMTX3CCSFOV4QFCMBMWEQPJGI6K0   
4      3QAPZX2QN5BGZZ49OKL2QFXIMJ502V1   
...                                ...   
19995  3LO69W1SU4BJBP12CSSAQ3J70IWGLF0   
19996  3HOSI13XH0MX1T3JXLTE1585EO1DDN1   
19997  3HOSI13XH0MX1T3JXLTE1585EO1DDN0   
19998  3CFJTT4SXUOYSVIVYMR3SAEMHEOI781   
19999  3CFJTT4SXUOYSVIVYMR3SAEMHEOI780   

                              

In [5]:
print(train_df.columns)

Index(['ID', 'norm', 'situation', 'intention', 'moral_consequence', 'label',
       'action', 'immoral_consequence'],
      dtype='object')


In [6]:
import torch
import random
import numpy as np
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
import pandas as pd
from src.preprocess import clean_text 
import nltk
from tqdm import tqdm

nltk.download('punkt')
train_df["action_tokenized"] = train_df["action"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
test_df["action_tokenized"] = test_df["action"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
dev_df["action_tokenized"] = dev_df["action"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))

train_df["situation_tokenized"] = train_df["situation"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
test_df["situation_tokenized"] = test_df["situation"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
dev_df["situation_tokenized"] = dev_df["situation"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))

train_df["norm_tokenized"] = train_df["norm"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
test_df["norm_tokenized"] = test_df["norm"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
dev_df["norm_tokenized"] = dev_df["norm"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))

[nltk_data] Downloading package punkt to /Users/rajan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from src.dataset import split_train_val_test, generate_vocab_map


In [11]:
train_vocab, reverse_vocab = generate_vocab_map(train_df)
test_vocab, reverse_vocab = generate_vocab_map(test_df)
dev_vocab, reverse_vocab = generate_vocab_map(dev_df)

action_vocab, __ = generate_vocab_map(train_df)
situation_vocab, ___ = generate_vocab_map(train_df)
intention_vocab, __ = generate_vocab_map(train_df)
train_df['label'] = train_df['label'].apply(lambda x:np.int64(int(x)))
test_df['label'] = test_df['label'].apply(lambda x:np.int64(int(x)))
dev_df['label'] = dev_df['label'].apply(lambda x:np.int64(int(x)))

In [13]:
from src.dataset import MoralStoriesDataset
from torch.utils.data import RandomSampler

train_dataset = MoralStoriesDataset(train_vocab, train_df)
val_dataset = MoralStoriesDataset(train_vocab, dev_df)
test_dataset = MoralStoriesDataset(train_vocab, test_df)

train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)
test_sampler = RandomSampler(test_dataset)

In [14]:
from torch.utils.data import DataLoader
from src.dataset import collate_fn
BATCH_SIZE = 64

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

Modeling

In [15]:
for a,s,n,l in test_iterator:
    print(a,s,n,l)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

tensor([[1174,   51,    8,  ...,    0,    0,    0],
        [ 443, 1058,   14,  ...,    0,    0,    0],
        [ 812,  171,   99,  ...,    0,    0,    0],
        ...,
        [ 307,  165,    4,  ...,    0,    0,    0],
        [   1,  574,  229,  ...,    0,    0,    0],
        [5356,  165,    4,  ...,    0,    0,    0]]) tensor([[1174,   71,  213,  ...,    0,    0,    0],
        [ 443,   86, 5295,  ...,    0,    0,    0],
        [ 812,  655,   49,  ...,    0,    0,    0],
        ...,
        [ 307,   10,   49,  ...,    0,    0,    0],
        [  73, 3316, 3655,  ...,    0,    0,    0],
        [5356,  126, 4950,  ...,    0,    0,    0]]) tensor([[ 119,   71,  333,    4,   92, 5037,  571,    4,   47, 1305,  943,   16],
        [ 215,  260,   20,  149,  293, 5684,  400,  215,  252, 4554,  119,   16],
        [ 119,   71,  279,    4, 3571,  314,  600,  372, 1104,    1,   16,    0],
        [ 119,   71, 4376,    4,  455,   84, 1510,  877, 6583,   16,    0,    0],
        [1025,   14,

In [19]:
import numpy as np
import gensim.downloader as api

def download_word2vec_embeddings():
    print("Downloading pre-trained word embeddings from: word2vec-google-news-300.\n" 
          + "Note: This can take a few minutes.\n")
    wv = api.load("word2vec-google-news-300")
    print("\nLoading complete!\n" +
          "Vocabulary size: {}".format(len(wv.vocab)))
    return wv

word_vectors = download_word2vec_embeddings()

Downloading pre-trained word embeddings from: word2vec-google-news-300.
Note: This can take a few minutes.



FileNotFoundError: [Errno 2] No such file or directory: '/Users/rajan/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz'

In [None]:
EMBEDDING_VECTOR_LENGTH = 300
import itertools

def construct_embedding_matrix(vocab):
  embedding_dict = {}
  c = 0
  
  for word in vocab.keys():
    if word in word_vectors:
      vector = word_vectors[word]
      embedding_dict[word] = vector
    else:
      c+=1


  num_words = len(vocab) + 1
  embedding_matrix=np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))
  for word,i in tqdm(vocab.items()):
    vect=embedding_dict.get(word, [])
    if len(vect)>0:
        embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
  return embedding_matrix

action_embedding_matrix = construct_embedding_matrix(action_vocab)
situation_embedding_matrix = construct_embedding_matrix(situation_vocab)
intention_embedding_matrix = construct_embedding_matrix(intention_vocab)

(8312, 300)


100%|██████████| 8311/8311 [00:00<00:00, 460201.20it/s]


(8312, 300)


100%|██████████| 8311/8311 [00:00<00:00, 420665.42it/s]


(8312, 300)


100%|██████████| 8311/8311 [00:00<00:00, 431517.67it/s]


In [None]:
from src.models import ClassificationModel

 
model = ClassificationModel(len(action_vocab),len(situation_vocab), len(intention_vocab), action_embedding_matrix, situation_embedding_matrix, intention_embedding_matrix, 
                            embedding_dim=300, hidden_dim=16, num_layers=2, bidirectional=True)
model = model.to(device)

In [None]:
from torch.optim import AdamW

criterion, optimizer = None, None
criterion = torch.nn.BCELoss()
optimizer = AdamW([param for param in model.parameters() if param.requires_grad == True], lr=0.003)
# train

In [None]:

# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    model.train()
    total_loss = 0
    true, pred = [], []
    for a, s, n, y in tqdm(iterator):
        optimizer.zero_grad()
        outputs = model(a.squeeze().to(device), s.squeeze().to(device), n.squeeze().to(device))
        loss = criterion(outputs.squeeze(), y.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        true.extend(y.cpu() > 0.5)
        pred.extend(outputs.cpu() > 0.5)
    print(f"TRAIN F-1: {binary_macro_f1(true, pred)}")
    print(f"TRAIN ACC: {accuracy(true, pred)}")
    return total_loss
    
def val_loop(model, criterion, iterator):
    true, pred = [], []
    model.eval()
    with torch.no_grad():
        for a, s, n, y in tqdm(iterator):
            output = model(a.squeeze().to(device), s.squeeze().to(device), n.squeeze().to(device))
            # convert output to boolean
            output = output.cpu() > 0.5
            # convert y to boolean
            y = y > 0.5
            true.extend(y)
            pred.extend(output)        
    return true, pred

In [None]:
from src.eval_utils import binary_macro_f1, accuracy
print(action_embedding_matrix.shape[0])
print(len(action_vocab))
true, pred = val_loop(model, criterion, val_iterator)
print(binary_macro_f1(true, pred))
print(accuracy(true, pred))

8312
8311


100%|██████████| 32/32 [00:01<00:00, 18.21it/s]


0.19980015988008795
0.5


In [None]:
TOTAL_EPOCHS = 32

for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, train_iterator)
    true, pred = val_loop(model, criterion, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 313/313 [00:20<00:00, 15.18it/s]


TRAIN F-1: 0.38213270562201523
TRAIN ACC: 0.6682


100%|██████████| 32/32 [00:01<00:00, 18.45it/s]


EPOCH: 0
TRAIN LOSS: 188.27190735936165
VAL F-1: 0.3891053553544145
VAL ACC: 0.678


100%|██████████| 313/313 [00:20<00:00, 15.37it/s]


TRAIN F-1: 0.4381998956190666
TRAIN ACC: 0.736


100%|██████████| 32/32 [00:01<00:00, 17.49it/s]


EPOCH: 1
TRAIN LOSS: 166.61050948500633
VAL F-1: 0.38460131408001913
VAL ACC: 0.675


100%|██████████| 313/313 [00:20<00:00, 15.10it/s]


TRAIN F-1: 0.4580531511609016
TRAIN ACC: 0.75965


100%|██████████| 32/32 [00:01<00:00, 17.76it/s]


EPOCH: 2
TRAIN LOSS: 155.5802584886551
VAL F-1: 0.41378655292910144
VAL ACC: 0.7075


100%|██████████| 313/313 [00:20<00:00, 15.43it/s]


TRAIN F-1: 0.4734677275335943
TRAIN ACC: 0.7779


100%|██████████| 32/32 [00:01<00:00, 18.84it/s]


EPOCH: 3
TRAIN LOSS: 147.09472382068634
VAL F-1: 0.4096899155462471
VAL ACC: 0.7025


100%|██████████| 313/313 [00:20<00:00, 15.49it/s]


TRAIN F-1: 0.4883234784918641
TRAIN ACC: 0.7954


100%|██████████| 32/32 [00:01<00:00, 16.88it/s]


EPOCH: 4
TRAIN LOSS: 138.13995519280434
VAL F-1: 0.408908992796841
VAL ACC: 0.7015


100%|██████████| 313/313 [00:20<00:00, 15.26it/s]


TRAIN F-1: 0.5038045235485614
TRAIN ACC: 0.81355


100%|██████████| 32/32 [00:01<00:00, 17.48it/s]


EPOCH: 5
TRAIN LOSS: 129.70765526592731
VAL F-1: 0.40188310029546814
VAL ACC: 0.693


100%|██████████| 313/313 [00:20<00:00, 15.62it/s]


TRAIN F-1: 0.5140374662117033
TRAIN ACC: 0.8255


100%|██████████| 32/32 [00:01<00:00, 18.42it/s]


EPOCH: 6
TRAIN LOSS: 122.08831351995468
VAL F-1: 0.41533723073860485
VAL ACC: 0.7095


100%|██████████| 313/313 [00:19<00:00, 15.87it/s]


TRAIN F-1: 0.524947210364032
TRAIN ACC: 0.8382


100%|██████████| 32/32 [00:01<00:00, 18.48it/s]


EPOCH: 7
TRAIN LOSS: 113.39611552655697
VAL F-1: 0.4099792269516478
VAL ACC: 0.703


100%|██████████| 313/313 [00:19<00:00, 15.76it/s]


TRAIN F-1: 0.535111973022233
TRAIN ACC: 0.85


100%|██████████| 32/32 [00:01<00:00, 19.16it/s]


EPOCH: 8
TRAIN LOSS: 104.02050440013409
VAL F-1: 0.41765838943968914
VAL ACC: 0.712


100%|██████████| 313/313 [00:19<00:00, 15.83it/s]


TRAIN F-1: 0.5477723179194767
TRAIN ACC: 0.86465


100%|██████████| 32/32 [00:01<00:00, 18.30it/s]


EPOCH: 9
TRAIN LOSS: 96.69280324876308
VAL F-1: 0.4155804072596819
VAL ACC: 0.7095


100%|██████████| 313/313 [00:19<00:00, 16.18it/s]


TRAIN F-1: 0.559390842426421
TRAIN ACC: 0.87805


100%|██████████| 32/32 [00:01<00:00, 18.83it/s]


EPOCH: 10
TRAIN LOSS: 88.68124684691429
VAL F-1: 0.40876983584092974
VAL ACC: 0.7015


100%|██████████| 313/313 [00:19<00:00, 16.16it/s]


TRAIN F-1: 0.5690830942198242
TRAIN ACC: 0.8892


100%|██████████| 32/32 [00:01<00:00, 18.49it/s]


EPOCH: 11
TRAIN LOSS: 81.98184674978256
VAL F-1: 0.4053993076718877
VAL ACC: 0.6975


100%|██████████| 313/313 [00:19<00:00, 16.05it/s]


TRAIN F-1: 0.5826853911967385
TRAIN ACC: 0.9048


100%|██████████| 32/32 [00:01<00:00, 19.14it/s]


EPOCH: 12
TRAIN LOSS: 72.72026448696852
VAL F-1: 0.4023473889379695
VAL ACC: 0.6935


100%|██████████| 313/313 [00:18<00:00, 16.50it/s]


TRAIN F-1: 0.5888875823076002
TRAIN ACC: 0.9119


100%|██████████| 32/32 [00:01<00:00, 18.95it/s]


EPOCH: 13
TRAIN LOSS: 66.39559469372034
VAL F-1: 0.4052782965214095
VAL ACC: 0.697


100%|██████████| 313/313 [00:19<00:00, 16.40it/s]


TRAIN F-1: 0.5994393338640082
TRAIN ACC: 0.92395


100%|██████████| 32/32 [00:01<00:00, 19.06it/s]


EPOCH: 14
TRAIN LOSS: 59.66755510866642
VAL F-1: 0.3998686227385087
VAL ACC: 0.6905


100%|██████████| 313/313 [00:19<00:00, 16.25it/s]


TRAIN F-1: 0.6026823191464904
TRAIN ACC: 0.92765


100%|██████████| 32/32 [00:01<00:00, 19.08it/s]


EPOCH: 15
TRAIN LOSS: 56.14432085305452
VAL F-1: 0.4052725484629005
VAL ACC: 0.697


100%|██████████| 313/313 [00:18<00:00, 16.70it/s]


TRAIN F-1: 0.6079496074603844
TRAIN ACC: 0.93365


100%|██████████| 32/32 [00:01<00:00, 19.23it/s]


EPOCH: 16
TRAIN LOSS: 51.948464039713144
VAL F-1: 0.4058003462510759
VAL ACC: 0.698


100%|██████████| 313/313 [00:18<00:00, 16.70it/s]


TRAIN F-1: 0.6166515321432209
TRAIN ACC: 0.94355


100%|██████████| 32/32 [00:02<00:00, 13.58it/s]


EPOCH: 17
TRAIN LOSS: 45.056876976042986
VAL F-1: 0.41350591711075124
VAL ACC: 0.707


100%|██████████| 313/313 [00:19<00:00, 16.36it/s]


TRAIN F-1: 0.618984004541941
TRAIN ACC: 0.9462


100%|██████████| 32/32 [00:01<00:00, 19.92it/s]


EPOCH: 18
TRAIN LOSS: 43.19782772473991
VAL F-1: 0.40732501683605205
VAL ACC: 0.6995


100%|██████████| 313/313 [00:18<00:00, 16.83it/s]


TRAIN F-1: 0.624137129286838
TRAIN ACC: 0.95205


100%|██████████| 32/32 [00:01<00:00, 19.58it/s]


EPOCH: 19
TRAIN LOSS: 37.65550612099469
VAL F-1: 0.4019508367109267
VAL ACC: 0.693


100%|██████████| 313/313 [00:18<00:00, 16.51it/s]


TRAIN F-1: 0.6321635412150873
TRAIN ACC: 0.96115


100%|██████████| 32/32 [00:01<00:00, 18.65it/s]


EPOCH: 20
TRAIN LOSS: 33.7419023104012
VAL F-1: 0.4015374357950908
VAL ACC: 0.6925


100%|██████████| 313/313 [00:19<00:00, 16.33it/s]


TRAIN F-1: 0.6332227852031737
TRAIN ACC: 0.96235


100%|██████████| 32/32 [00:01<00:00, 19.38it/s]


EPOCH: 21
TRAIN LOSS: 31.261708039790392
VAL F-1: 0.4089930927248435
VAL ACC: 0.7015


100%|██████████| 313/313 [00:18<00:00, 16.57it/s]


TRAIN F-1: 0.6349454644380409
TRAIN ACC: 0.9643


100%|██████████| 32/32 [00:01<00:00, 19.52it/s]


EPOCH: 22
TRAIN LOSS: 30.359575828537345
VAL F-1: 0.40606366662243504
VAL ACC: 0.698


100%|██████████| 313/313 [00:18<00:00, 16.93it/s]


TRAIN F-1: 0.6384794327887389
TRAIN ACC: 0.9683


100%|██████████| 32/32 [00:01<00:00, 19.23it/s]


EPOCH: 23
TRAIN LOSS: 25.87044286634773
VAL F-1: 0.40651047261382156
VAL ACC: 0.6985


100%|██████████| 313/313 [00:18<00:00, 16.54it/s]


TRAIN F-1: 0.641927982114277
TRAIN ACC: 0.9722


100%|██████████| 32/32 [00:01<00:00, 19.89it/s]


EPOCH: 24
TRAIN LOSS: 23.68355826800689
VAL F-1: 0.3941713076900881
VAL ACC: 0.6835


100%|██████████| 313/313 [00:18<00:00, 17.03it/s]


TRAIN F-1: 0.6409992359885823
TRAIN ACC: 0.97115


100%|██████████| 32/32 [00:01<00:00, 19.06it/s]


EPOCH: 25
TRAIN LOSS: 23.697382640093565
VAL F-1: 0.4060837315162572
VAL ACC: 0.698


100%|██████████| 313/313 [00:18<00:00, 16.95it/s]


TRAIN F-1: 0.6442725044591511
TRAIN ACC: 0.97485


100%|██████████| 32/32 [00:01<00:00, 18.87it/s]


EPOCH: 26
TRAIN LOSS: 21.54493332700804
VAL F-1: 0.41100733056626915
VAL ACC: 0.704


100%|██████████| 313/313 [00:18<00:00, 16.55it/s]


TRAIN F-1: 0.6490093677647807
TRAIN ACC: 0.9802


100%|██████████| 32/32 [00:01<00:00, 19.00it/s]


EPOCH: 27
TRAIN LOSS: 17.24943771515973
VAL F-1: 0.397795665314597
VAL ACC: 0.688


100%|██████████| 313/313 [00:18<00:00, 16.56it/s]


TRAIN F-1: 0.6453345609333684
TRAIN ACC: 0.97605


100%|██████████| 32/32 [00:01<00:00, 19.22it/s]


EPOCH: 28
TRAIN LOSS: 20.9697312428616
VAL F-1: 0.39919085768777596
VAL ACC: 0.69


100%|██████████| 313/313 [00:18<00:00, 16.79it/s]


TRAIN F-1: 0.6496738395238311
TRAIN ACC: 0.98095


100%|██████████| 32/32 [00:01<00:00, 19.71it/s]


EPOCH: 29
TRAIN LOSS: 16.370818838942796
VAL F-1: 0.40852700582325596
VAL ACC: 0.701


100%|██████████| 313/313 [00:18<00:00, 16.86it/s]


TRAIN F-1: 0.6444495473872209
TRAIN ACC: 0.97505


100%|██████████| 32/32 [00:01<00:00, 21.23it/s]


EPOCH: 30
TRAIN LOSS: 20.744258527178317
VAL F-1: 0.39209766326043755
VAL ACC: 0.681


100%|██████████| 313/313 [00:18<00:00, 16.72it/s]


TRAIN F-1: 0.6480792841757422
TRAIN ACC: 0.97915


100%|██████████| 32/32 [00:01<00:00, 20.41it/s]

EPOCH: 31
TRAIN LOSS: 17.70276849216316
VAL F-1: 0.39575173417060006
VAL ACC: 0.6855





Test Metrics

In [None]:
true, pred = val_loop(model, criterion, test_iterator)
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 32/32 [00:01<00:00, 20.29it/s]


TEST F-1: 0.3684843859745132
TEST ACC: 0.652


#### 2. Baseline Models 

The below section details implementation of our baseline models- a Naive Bayes classifier and Logistic Regression classifier

In [24]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
import jsonlines
pd.set_option('display.max_colwidth', -1)
import seaborn as sns
import nltk
import pandas as pd
nltk.download('wordnet')
#import scattertext as st
#import spacy

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

  pd.set_option('display.max_colwidth', -1)
[nltk_data] Downloading package wordnet to /Users/rajan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True



In [25]:
def filler(text): 
    if text.dtype is pd.np.dtype(float): #if in the column the data type is a float/int, it replaces w an empty string
        return text.fillna('')
    elif text.dtype is pd.np.dtype(int): 
        return text.fillna('')
    else:
        return text.fillna('NA')

In [26]:
train_df = train_df.apply(filler)
test_df = test_df.apply(filler)
dev_df = dev_df.apply(filler)

  if text.dtype is pd.np.dtype(float): #if in the column the data type is a float/int, it replaces w an empty string
  elif text.dtype is pd.np.dtype(int):


In [None]:
train_df.action=train_df.action.str.replace('[^a-zA-Z]', ' ')
test_df.action=train_df.action.str.replace('[^a-zA-Z]', ' ')
dev_df.action=train_df.action.str.replace('[^a-zA-Z]', ' ')

In [27]:
RE_PREPROCESS = r'\W+|\d+'  #raw string + alphanumeric character + one or more digits ( regular expressions)
train_df.action = np.array( [ re.sub(RE_PREPROCESS, ' ', answer).lower() for answer in train_df.action])
test_df.action = np.array( [ re.sub(RE_PREPROCESS, ' ', answer).lower() for answer in test_df.action])
dev_df.action = np.array( [ re.sub(RE_PREPROCESS, ' ', answer).lower() for answer in dev_df.action])

In [28]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
#WordNetLemmatizer returns the input word unchanged if it cannot be found in WordNet.
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [30]:
from sklearn.model_selection  import train_test_split
#train_df['label'] = train_df['label'].apply(lambda x: 0 if x == '0' else 1)
X = train_df.action
y = train_df['label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,stratify=train_df['label'],random_state=2,shuffle=True)

train_df.head(5)

Unnamed: 0,ID,norm,situation,intention,moral_consequence,label,action,immoral_consequence
0,37XITHEISX7HXBGAFQTNMO7R7TSRCQ1,It's good to do activities together with your spouse.,Jenny's husband asks her to go hiking with him like they used to do.,Jenny wants to tell her husband if she'll go hiking with him.,Jenny's husband is happy that they still share activities together and thinks the hike will be a chance to reconnect.,1,jenny agrees to go hiking with her husband and seems excited for the activity,
1,37XITHEISX7HXBGAFQTNMO7R7TSRCQ0,It's good to do activities together with your spouse.,Jenny's husband asks her to go hiking with him like they used to do.,Jenny wants to tell her husband if she'll go hiking with him.,,0,jenny says she doesn t feel like going hiking with her husband and will just stay home,Jenny's husband feels like they don't do things together any more and that they are growing apart.
2,3TDXMTX3CCSFOV4QFCMBMWEQPJGI6K1,It's good to show remorse for your actions,Claire is walking through the mall when she realizes that she has unintentionally let the door slam in a lady's face.,Claire wants to quickly get to her intended store.,The lady accepts Claire's apology.,1,claire reaches back opens the door and apologizes to the lady before heading on her way,
3,3TDXMTX3CCSFOV4QFCMBMWEQPJGI6K0,It's good to show remorse for your actions,Claire is walking through the mall when she realizes that she has unintentionally let the door slam in a lady's face.,Claire wants to quickly get to her intended store.,,0,claire looks away from the lady and continues to her destination,The lady thinks that Claire is a rude person.
4,3QAPZX2QN5BGZZ49OKL2QFXIMJ502V1,It's rude to lock someone in the bathroom.,Timothy lives alone with his dog in his house who is barking loudly because he wants to go out and walk.,Timothy doesn't want to hear his dog bark anymore.,Timothy's dog stops barking and is happier and healthy.,1,timothy puts a leash on his dog and takes him for a walk around the neighborhood,


In [31]:
get_ipython().system('pip install nltk')
nltk.download('punkt')
get_ipython().system('pip install scikit-learn')

vect=TfidfVectorizer(tokenizer=LemmaTokenizer(),stop_words='english',ngram_range=(1, 2))
vect.fit(X_train)

train=vect.transform(X_train) # transforming from string to vector (train and test models)
test=vect.transform(X_test)

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m


[nltk_data] Downloading package punkt to /Users/rajan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m




TfidfVectorizer(ngram_range=(1, 2), stop_words='english',
                tokenizer=<__main__.LemmaTokenizer object at 0x150ecba30>)

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn import metrics

logreg = LogisticRegression(penalty='l2', C=0.2)
logreg.fit(train, y_train)
y_pred = logreg.predict(test)

LogisticRegression(C=0.2)

In [34]:
y_true = y_test
#y_pred = output_log
precision_recall_fscore_support(y_true, y_pred, average='macro')
accuracy_score(y_true, y_pred)
#train_df[train_df['label']=='1']

(0.6927896664738771, 0.692, 0.6916842847075406, None)

0.692

In [35]:
nb = MultinomialNB(alpha=0.2)
nb.fit(train, y_train)
output_nb = nb.predict(test)

MultinomialNB(alpha=0.2)

In [40]:
y_true = y_test
y_pred = output_nb
precision_recall_fscore_support(y_true, y_pred, average='macro')
accuracy_score(y_true, y_pred)

(0.6565001565001565, 0.6565000000000001, 0.6564999141249785, None)

0.6565

In [37]:
rf = RandomForestClassifier(n_estimators = 25)
rf.fit(train,y_train)
output_rf = rf.predict(test)

RandomForestClassifier(n_estimators=25)

In [38]:
y_true = y_test
y_pred = output_rf
precision_recall_fscore_support(y_true, y_pred, average='macro')
accuracy_score(y_true, y_pred)

(0.6650237634219327, 0.665, 0.6649879395658245, None)

0.665