In [1]:
import json
import spacy
import string
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from classifier_utils import ClassifierUtils
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import importlib

In [2]:
nlp = spacy.load('en')
nlp_light = spacy.load('en', disable=['parser', 'tagger', 'ner'])
stop_words = spacy.lang.en.stop_words.STOP_WORDS
alphabet = string.ascii_lowercase
clf_utils = ClassifierUtils()

In [3]:
with open('../data/review_decisions.json') as f:
    data = json.load(f)

## Group by Revision

In [4]:
data_by_revision = dict()
for doc_id in data:
    if('revisions') in data[doc_id]:
        for revision in data[doc_id]['revisions']:
            if(revision not in data_by_revision):
                data_by_revision[revision] = dict()
            if('reviews' in data[doc_id]['revisions'][revision]):
                for review in data[doc_id]['revisions'][revision]['reviews']:
                    review_obj = data[doc_id]['revisions'][revision]['reviews'][review]
                    if('text' in review_obj and 'decision' in review_obj):
                        if(len(review_obj['text'].strip()) > 0 and len(review_obj['decision']) > 0):
                            if(review_obj['decision'] not in data_by_revision[revision]):
                                data_by_revision[revision][review_obj['decision']] = list()
                            data_by_revision[revision][review_obj['decision']].append(review_obj['text'])

In [5]:
documents = data_by_revision['0']['Reject'] + data_by_revision['0']['Accept'] + data_by_revision['0']['Major Revision'] + data_by_revision['0']['Minor Revision']
labels = [0]*len(data_by_revision['0']['Reject']) + [1]*len(data_by_revision['0']['Accept']) + [2]*len(data_by_revision['0']['Major Revision']) + [3]*len(data_by_revision['0']['Minor Revision'])

## Naive Bayes Classification

In [None]:
all_metrics_nb = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels)
    all_metrics_nb.append(metrics)

In [None]:
print("Reject")
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['0']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['0']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['0']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Accept")
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['1']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['1']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['1']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Major Revision")
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['2']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['2']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['2']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Minor Revision")
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['3']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['3']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_nb[idx2][idx]['3']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))

In [None]:
ordered_features = clf_utils.get_nb_features(documents, labels)

In [None]:
for feature in ordered_features[0][:20]:
    print(feature)

In [None]:
for feature in ordered_features[1][:20]:
    print(feature)

In [None]:
for feature in ordered_features[2][:20]:
    print(feature)

In [None]:
for feature in ordered_features[3][:20]:
    print(feature)

## Logistic Regression

In [None]:
all_metrics_lr_ovr = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='LR')
    all_metrics_lr_ovr.append(metrics)

In [None]:
print("Reject")
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['0']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['0']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['0']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Accept")
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['1']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['1']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['1']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Major Revision")
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['2']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['2']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['2']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Minor Revision")
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['3']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['3']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_ovr[idx2][idx]['3']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))

In [None]:
"""
These are features from One vs Rest
"""
ordered_features = clf_utils.get_lr_features(documents, labels, multi_class='ovr')

In [None]:
for feature in ordered_features[0][:20]:
    print(feature)

In [None]:
for feature in ordered_features[1][:20]:
    print(feature)

In [None]:
for feature in ordered_features[2][:20]:
    print(feature)

In [None]:
for feature in ordered_features[3][:20]:
    print(feature)

In [None]:
all_metrics_lr_mn = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='LR', multi_class='multinomial')
    all_metrics_lr_mn.append(metrics)

In [None]:
print("Reject")
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['0']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['0']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['0']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Accept")
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['1']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['1']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['1']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Major Revision")
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['2']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['2']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['2']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))
print("Minor Revision")
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['3']['precision'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['3']['recall'] for idx in range(5)]) for idx2 in range(10)]))
print(np.mean([np.mean([all_metrics_lr_mn[idx2][idx]['3']['f1-score'] for idx in range(5)]) for idx2 in range(10)]))

In [None]:
"""
These are features from Multinomial Logistic Regression
"""
ordered_features = clf_utils.get_lr_features(documents, labels, multi_class='multinomial')

In [None]:
for feature in ordered_features[0][:20]:
    print(feature)

In [None]:
for feature in ordered_features[1][:20]:
    print(feature)

In [None]:
for feature in ordered_features[2][:20]:
    print(feature)

In [None]:
for feature in ordered_features[3][:20]:
    print(feature)

## Generating WordClouds

In [None]:
wordcloud = dict()
wordcloud['Reject'] = WordCloud().generate(' '.join(data_by_revision['0']['Reject']))
wordcloud['Accept'] = WordCloud().generate(' '.join(data_by_revision['0']['Accept']))
wordcloud['Major Revision'] = WordCloud().generate(' '.join(data_by_revision['0']['Major Revision']))
wordcloud['Minor Revision'] = WordCloud().generate(' '.join(data_by_revision['0']['Minor Revision']))

In [None]:
plt.imshow(wordcloud['Reject'], interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
plt.imshow(wordcloud['Accept'], interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
plt.imshow(wordcloud['Major Revision'], interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
plt.imshow(wordcloud['Minor Revision'], interpolation='bilinear')
plt.axis('off')
plt.show()

## Univariate Correlation

In [None]:
count_vectorizer = CountVectorizer(tokenizer=clf_utils.tokenizer)
count_vectorizer.fit(documents)

In [None]:
document_features = count_vectorizer.transform(documents).toarray()

In [None]:
boolean_reject = np.array(labels) == np.array([0]*len(labels))
int_reject = np.array([int(value) for value in boolean_reject])
boolean_accept = np.array(labels) == np.array([1]*len(labels))
int_accept = np.array([int(value) for value in boolean_accept])
boolean_major = np.array(labels) == np.array([2]*len(labels))
int_major = np.array([int(value) for value in boolean_major])
boolean_minor = np.array(labels) == np.array([3]*len(labels))
int_minor = np.array([int(value) for value in boolean_minor])

In [None]:
document_features.shape

In [None]:
feature_names = count_vectorizer.get_feature_names()
correlations_reject = [np.corrcoef(document_features[:,idx], int_reject)[0][1] for idx in range(len(feature_names))]
correlations_accept = [np.corrcoef(document_features[:,idx], int_accept)[0][1] for idx in range(len(feature_names))]
correlations_major = [np.corrcoef(document_features[:,idx], int_major)[0][1] for idx in range(len(feature_names))]
correlations_minor = [np.corrcoef(document_features[:,idx], int_minor)[0][1] for idx in range(len(feature_names))]

In [None]:
sorted_feature_names_reject = [feature_names[idx] for idx in np.argsort(correlations_reject)][::-1]
sorted_feature_names_accept = [feature_names[idx] for idx in np.argsort(correlations_accept)][::-1]
sorted_feature_names_major = [feature_names[idx] for idx in np.argsort(correlations_major)][::-1]
sorted_feature_names_minor = [feature_names[idx] for idx in np.argsort(correlations_minor)][::-1]
sorted_correlations_reject = np.sort(correlations_reject)[::-1]
sorted_correlations_accept = np.sort(correlations_accept)[::-1]
sorted_correlations_major = np.sort(correlations_major)[::-1]
sorted_correlations_minor = np.sort(correlations_minor)[::-1]

In [None]:
for idx, feature in enumerate(sorted_feature_names_reject[:20]):
    print(feature, sorted_correlations_reject[idx])

In [None]:
for idx, feature in enumerate(sorted_feature_names_accept[:20]):
    print(feature, sorted_correlations_accept[idx])

In [None]:
for idx, feature in enumerate(sorted_feature_names_major[:20]):
    print(feature, sorted_correlations_major[idx])

In [None]:
for idx, feature in enumerate(sorted_feature_names_minor[:20]):
    print(feature, sorted_correlations_minor[idx])

## Sanity Check

In [None]:
marked_ids = list()
for idx, document in tqdm(enumerate(documents)):
    flag = 0
    for token in nlp_light(document):
        if(token.text == 'co2-sensing'):
            flag = 1
            break
    if(flag == 1):
        marked_ids.append(idx)

In [None]:
marked_ids

In [None]:
counter = Counter()
for idx in marked_ids:
    for token in nlp_light(documents[idx]):
        if(token.text == 'co2-sensing'):
            counter[idx] += 1

In [None]:
counter

In [None]:
labels[3638]

In [None]:
sum(boolean_accept)

In [None]:
sum(int_accept)

In [None]:
max(correlations_accept)

## Sampling Reviews

In [None]:
selected_decisions = list()
selected_reviews = list()

In [None]:
decisions = ['Reject', 'Accept', 'Minor Revision', 'Major Revision']
decision = random.choice(decisions)
selected_decisions.append(decision)
review = np.random.choice(data_by_revision['0'][decision])
selected_reviews.append(review)

In [None]:
review

In [None]:
ind=9
print(selected_decisions[ind])
selected_reviews[ind]

In [None]:
selected_decisions[10:]

## Better Feature Selection

In [6]:
all_metrics = dict()
all_metrics['NB'] = dict()
all_metrics['LR'] = dict()
all_metrics['RF'] = dict()

In [None]:
all_metrics['NB']['WC_TT'] = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='NB', features='count', 
                                      normalize=True, binary=True)
    all_metrics['NB']['WC_TT'].append(metrics)

In [None]:
all_metrics['NB']['WC_TF'] = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='NB', features='count', 
                                      normalize=True, binary=False)
    all_metrics['NB']['WC_TF'].append(metrics)

In [None]:
all_metrics['NB']['WC_FF'] = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='NB', features='count', 
                                      normalize=False, binary=False)
    all_metrics['NB']['WC_TT'].append(metrics)

In [None]:
all_metrics['LR']['WC_TT'] = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='LR', features='count', 
                                      normalize=True, binary=True)
    all_metrics['LR']['WC_TT'].append(metrics)

In [7]:
all_metrics['LR']['WC_TF'] = list()
for _ in range(10):
    metrics = clf_utils.cross_validate(documents, labels, clf_type='LR', features='count', 
                                      normalize=True, binary=False)
    all_metrics['LR']['WC_TF'].append(metrics)

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
vectorizer = CountVectorizer(tokenizer = clf_utils.tokenizer)

In [9]:
vectorizer.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method ClassifierUtils.tokenizer of <classifier_utils.ClassifierUtils object at 0x1a2e9860b8>>,
        vocabulary=None)

In [10]:
x_train = vectorizer.transform(documents)

In [19]:
sum_ = np.sum(x_train, axis=1)
x_train = x_train/sum_

  return np.true_divide(self.todense(), other)


In [20]:
clf = MultinomialNB()

In [27]:
nan_positions = np.argwhere(np.isnan(x_train))

In [31]:
np.shape(nan_positions)

(206330, 2)

In [37]:
nan_positions

array([[  4210,      0],
       [  4210,      1],
       [  4210,      2],
       ...,
       [ 10313, 103162],
       [ 10313, 103163],
       [ 10313, 103164]])

In [39]:
documents[10313]

' '