In [None]:
# !git clone https://github.com/NLPrinceton/SARC.git
# !git clone https://github.com/NLPrinceton/text_embedding.git

### Currently working on the balanced section for political comments

### 1. Implementing the Baseline (Amazon GloVe, 1600d)

In [None]:
!mkdir SARC/pol
%cd SARC/pol

In [None]:
# Download the balanced train/test splits and the comment store of the
# SARC 2.0 "pol" subset into the current directory.
!wget http://nlp.cs.princeton.edu/SARC/2.0/pol/test-balanced.csv.bz2
!wget http://nlp.cs.princeton.edu/SARC/2.0/pol/train-balanced.csv.bz2
!wget http://nlp.cs.princeton.edu/SARC/2.0/pol/comments.json.bz2
# Decompress every downloaded archive in place.
!bzip2 -d *.bz2

In [None]:
# Go two directory levels up (undoes the earlier `%cd SARC/pol`).
%cd ../../

In [6]:
# List the contents of the current working directory.
!ls

SARC			    sarc-build.ipynb
Untitled.ipynb		    sarc-implement-Copy1.ipynb
__init__.py		    sarc-implement.ipynb
__pycache__		    sarutils.py
bert			    temp
core			    test
flair-benchmark-prepare.py  test-balanced.csv
model-test.py		    test-unbalanced.csv
models			    text_embedding
r.py			    train-balanced-sarc.csv.gz
rhp.py			    train-balanced-sarcasm.csv
run-flair.py		    twitter-multi
sar.csv


In [7]:
# !mkdir test
# Switch the kernel's working directory to `test`; the relative paths used by
# later cells (e.g. SARC/pol, SARC/amazon_glove1600.txt) resolve from here.
# NOTE(review): the download cells above ran relative to the parent
# directory — confirm SARC/ is available under test/ on a fresh run.
%cd test

/data/nlp/reddit/sarcasm/test


In [None]:
!pip install nltk --user

In [17]:
import argparse
import csv
import json

import nltk
# numpy is used below as `np`; import it explicitly instead of relying on the
# star imports to leak it into the namespace.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV as LogitCV
from sklearn.preprocessing import normalize

from text_embedding.features import *
from text_embedding.vectors import *
from SARC.utils import *

In [19]:
def _load_sarc_split(split_file, comments, lower):
    """Read one SARC split file and resolve its comment ids to text.

    Each non-empty row of `split_file` is '|'-separated with three fields:
    space-separated ancestor ids, space-separated response ids and
    space-separated labels.

    Args:
        split_file: path of the '|'-delimited split file.
        comments: mapping from comment id to a record with a 'text' entry.
        lower: when true, comment texts are lower-cased.

    Returns:
        (docs, labels, ids) where `docs` and `ids` are dicts with the keys
        'ancestors' and 'responses' (one list per row) and `labels` holds one
        list of label strings per row.

    Raises:
        KeyError: if a row references an id that is missing from `comments`.
    """
    docs = {'ancestors': [], 'responses': []}
    ids = {'ancestors': [], 'responses': []}
    labels = []
    # newline='' is the form of open() recommended by the csv module.
    with open(split_file, 'r', newline='') as f:
        for row in csv.reader(f, delimiter='|'):
            if not row:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError, so such lines are skipped.
                continue
            row_ids = {
                'ancestors': row[0].split(' '),
                'responses': row[1].split(' '),
            }
            for role, id_list in row_ids.items():
                texts = [comments[comment_id]['text'] for comment_id in id_list]
                if lower:
                    texts = [text.lower() for text in texts]
                ids[role].append(id_list)
                docs[role].append(texts)
            # Labels stay as strings; callers convert them as needed.
            labels.append(row[2].split(' '))
    return docs, labels, ids


def load_sarc_responses(train_file, test_file, comment_file, lower=True):
    """Load the SARC train and test splits together with their comment texts.

    Args:
        train_file: path of the '|'-delimited training split.
        test_file: path of the '|'-delimited test split.
        comment_file: path of the JSON file mapping comment id to a record
            with a 'text' entry.
        lower: when true (default), comment texts are lower-cased.

    Returns:
        A 6-tuple (train_docs, test_docs, train_labels, test_labels,
        train_ids, test_ids). The `*_docs` and `*_ids` values are dicts with
        the keys 'ancestors' and 'responses', each holding one list per row
        of the split file; the `*_labels` values hold one list of label
        strings per row.
    """
    with open(comment_file, 'r') as f:
        comments = json.load(f)

    train_docs, train_labels, train_ids = _load_sarc_split(train_file, comments, lower)
    test_docs, test_labels, test_ids = _load_sarc_split(test_file, comments, lower)

    return train_docs, test_docs, train_labels, test_labels, train_ids, test_ids


In [11]:
def parse():
    """Return the fixed run configuration used by this notebook.

    Returns:
        A new dict with the keys 'dataset', 'embedding', 'lower' and 'embed'.
    """
    return {
        'dataset': 'pol',
        'embedding': 'SARC/amazon_glove1600.txt',
        'lower': True,
        'embed': True,
    }

In [12]:
# Check which files are present in the pol data directory.
!ls SARC/pol

comments.json  test-balanced.csv  train-balanced.csv


In [13]:
# Long listing of SARC/ with human-readable file sizes.
!ls -lh SARC/

total 8.1G
-rwxr-xr-x 1 ragarwal users 1.1K Jan 29 17:12 LICENSE
-rwxr-xr-x 1 ragarwal users 1.5K Jan 29 17:12 README.md
-rwxr-xr-x 1 ragarwal users   33 Jan 29 17:12 __init__.py
drwxr-xr-x 2 ragarwal users 4.0K Jan 29 17:17 __pycache__
-rw-r--r-- 1 ragarwal users 8.1G Feb 14  2018 amazon_glove1600.txt
-rwxr-xr-x 1 ragarwal users 5.1K Jan 29 17:12 eval.py
drwxr-xr-x 2 ragarwal users 4.0K Jan 29 17:16 pol
-rwxr-xr-x 1 ragarwal users 2.2K Jan 29 17:12 utils.py


In [20]:
# Fixed configuration for this run.
args = parse()

# Data directories, relative to the current working directory.
SARC = 'SARC/'
SARC_POL = SARC + 'pol/'
SARC_MAIN = SARC + 'main/'

# Re-point SARC at the directory of the selected dataset; any value other
# than 'pol' or 'main' leaves it at the base directory.
SARC = {'pol': SARC_POL, 'main': SARC_MAIN}.get(args['dataset'].lower(), SARC)

train_file, test_file, comment_file = (
    SARC + file_name
    for file_name in ('train-balanced.csv', 'test-balanced.csv', 'comments.json')
)

# Load SARC pol/main sequences with labels.
print('Load SARC data')
(train_seqs, test_seqs,
 train_labels, test_labels,
 train_ids, test_ids) = load_sarc_responses(
    train_file, test_file, comment_file, lower=args['lower'])

Load SARC data


In [21]:
# Only use responses for this method. Ignore ancestors.
train_resp = train_seqs['responses']
test_resp = test_seqs['responses']

In [23]:
# Split into first and second responses and their labels.
# {0: list_of_first_responses, 1: list_of_second_responses}
train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
test_docs = {i: [l[i] for l in test_resp] for i in range(2)}
train_labels = {i: [2*int(l[i])-1 for l in train_labels] for i in range(2)}
test_labels = {i: [2*int(l[i])-1 for l in test_labels] for i in range(2)}

In [28]:
# Train a classifier on all responses in training data. We will later use this
# classifier to determine for every sequence which of the 2 responses is more sarcastic.
train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
test_all_docs_tok = tokenize(test_docs[0] + test_docs[1])
train_all_labels = np.array(train_labels[0] + train_labels[1])
test_all_labels = np.array(test_labels[0] + test_labels[1])

In [None]:
# BonGs (presumably bag-of-n-grams) or embeddings. Only the embedding path is
# implemented in this notebook.
if args['embed']:
    print('Create embeddings')
    # No per-word weighting; the alternatives below are kept for reference.
    weights = None
#     if args.weights == 'sif':
#         weights = sif_weights(train_all_docs_tok, 1E-3)
#     if args.weights == 'snif':
#         weights = sif_weights(train_all_docs_tok, 1E-3)
#         weights = {f: 1-w for f, w in weights.items()}
    # Restrict the vector lookup to the words that occur in the train or test
    # documents.
    w2v = vocab2vecs(
        {word for doc in train_all_docs_tok+test_all_docs_tok for word in doc},
        vectorfile=args['embedding'])
    train_all_vecs = docs2vecs(train_all_docs_tok, f2v=w2v, weights=weights)
    test_all_vecs = docs2vecs(test_all_docs_tok, f2v=w2v, weights=weights)
else:
    # Without this branch the print below fails with a NameError on
    # train_all_vecs; fail with an explicit message instead.
    raise NotImplementedError(
        "Only the embedding representation is implemented; set args['embed'] to True.")

print('Dimension of representation: %d'%train_all_vecs.shape[1])

In [None]:
# Fit a cross-validated logistic regression on the embedding features and
# report its accuracy over all individual responses.
print('Evaluate the classifier on all responses')
clf = LogitCV(
    solver='liblinear',
    Cs=[10**i for i in range(-2, 3)],        # candidate C values 10^-2 .. 10^2
    cv=2,
    fit_intercept=False,
    dual=np.less(*train_all_vecs.shape),     # true when rows < columns
    random_state=0,
    n_jobs=-1,
)
clf.fit(train_all_vecs, train_all_labels)
print('\tTrain acc: ', clf.score(train_all_vecs, train_all_labels))
print('\tTest acc: ', clf.score(test_all_vecs, test_all_labels))

### 2. Implementing Subword Features with Byte-Pair Encoding

In [None]:
from bpemb import BPEmb

In [None]:
# English BPEmb model; `vs` is presumably the subword vocabulary size and
# `dim` the embedding dimension — confirm against the bpemb documentation.
# Only its `encode` method is used below (as the TF-IDF tokenizer).
bpemb_en = BPEmb(lang="en", dim=100, vs=100000)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [54]:
# TF-IDF over unigrams and bigrams of the tokens produced by bpemb_en.encode.
vectorizer = TfidfVectorizer(tokenizer=bpemb_en.encode,ngram_range=(1,2))

In [55]:
# Learn the vocabulary and IDF weights from the training responses only
# (first responses followed by second responses, matching train_all_labels).
X_train = vectorizer.fit_transform(train_docs[0]+train_docs[1])

In [56]:
# Project the test responses with the vectorizer fitted on the training data
# (same ordering as test_all_labels).
X_test = vectorizer.transform(test_docs[0]+test_docs[1])

In [57]:
# Inspect the first row of the training matrix.
X_train[0]

<1x98084 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [58]:
# Fit the same cross-validated logistic regression on the BPE TF-IDF features
# and report its accuracy over all individual responses.
print('Evaluate the classifier on all responses')
clf = LogitCV(
    solver='liblinear',
    Cs=[10**i for i in range(-2, 3)],   # candidate C values 10^-2 .. 10^2
    cv=2,
    fit_intercept=False,
    dual=np.less(*X_train.shape),       # true when rows < columns
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, train_all_labels)
print('\tTrain acc: ', clf.score(X_train, train_all_labels))
print('\tTest acc: ', clf.score(X_test, test_all_labels))

Evaluate the classifier on all responses
	Train acc:  0.8938396254023998
	Test acc:  0.7025836758661186
