In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pickle
import numpy as np
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from torch.utils import data
import copy
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [None]:
import numpy as np
import re
import string
import os.path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
import nltk
from nltk.corpus import stopwords
from itertools import chain
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import conll2000
import pickle
import nltk
import nltk.data

from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
init_notebook_mode(connected=True)
import nltk
nltk.download('content')

[nltk_data] Error loading content: Package 'content' not found in
[nltk_data]     index


False

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH = '/content/drive/MyDrive/minor-project'

In [None]:
chunker_instance = None
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tagger = nltk.data.load(PATH+"/pos_tagger/treebank_brill_aubt.pickle")

grammar = r"""
  NP: 
      {<DT|WDT|PP\$|PRP\$>?<\#|CD>*(<JJ|JJS|JJR><VBG|VBN>?)*(<NN|NNS|NNP|NNPS>(<''><POS>)?)+}
      {<DT|WDT|PP\$|PRP\$><JJ|JJS|JJR>*<CD>}
      {<DT|WDT|PP\$|PRP\$>?<CD>?(<JJ|JJS|JJR><VBG>?)}
      {<DT>?<PRP|PRP\$>}
      {<WP|WP\$>}
      {<DT|WDT>}
      {<JJR>}
      {<EX>}
      {<CD>+}
  VP: {<VBZ><VBG>}
      {(<MD|TO|RB.*|VB|VBD|VBN|VBP|VBZ>)+}
      

"""

def get_nltk_pos_tag_based_chunker():
    global chunker_instance
    if chunker_instance is not None:
        return chunker_instance
    chunker_instance = nltk.RegexpParser(grammar)
    return chunker_instance

    

def chunk_to_str(chunk):
    if type(chunk) is nltk.tree.Tree:
        return chunk.label()
    else:
        return chunk[1]

def extract_subtree_expansions(t, res):
    if type(t) is nltk.tree.Tree:
        expansion = t.label() + "[" + " ".join([chunk_to_str(child) for child in t]) + "]"
        res.append(expansion)
        for child in t:
            extract_subtree_expansions(child, res)
            
def nltk_pos_tag_chunk(pos_tags):
    chunker = get_nltk_pos_tag_based_chunker()
    parse_tree = chunker.parse(pos_tags)
    subtree_expansions = []
    for subt in parse_tree:
        extract_subtree_expansions(subt, subtree_expansions)
    return list(map(chunk_to_str, parse_tree)), subtree_expansions

def prepare_entry(text):
    tokens = []
    # Workaround because there re some docuemtns that are repitions of the same word which causes the regex chunker to hang
    prev_token = ''
    for t in tokenizer.tokenize(text):
        if t != prev_token:
            tokens.append(t)
    tagger_output = tagger.tag(tokens)
    pos_tags = [t[1] for t in tagger_output]
    pos_chunks, subtree_expansions = nltk_pos_tag_chunk(tagger_output)
    entry = {
        'preprocessed': text,
        'pos_tags': pos_tags,
        'pos_tag_chunks': pos_chunks,
        'pos_tag_chunk_subtrees': subtree_expansions,
        'tokens': tokens
    }
    return entry

def word_count(entry):
    return len(entry['tokens'])

def avg_chars_per_word(entry):
    r = np.mean([len(t) for t in entry['tokens']])
    return r

def distr_chars_per_word(entry, max_chars=10):
    counts = [0] * max_chars
    for t in entry['tokens']:
        l = len(t)
        if l <= max_chars:
            counts[l - 1] += 1
    r = [c/len(entry['tokens']) for c in counts]
#     fnames = ['distr_chars_per_word_' + str(i + 1)  for i in range(max_chars)]
    return r
    
def character_count(entry):
    r = len(re.sub('\s+', '', entry['preprocessed']))
    return r


#https://github.com/ashenoy95/writeprints-static/blob/master/whiteprints-static.py
def hapax_legomena(entry):
    freq = nltk.FreqDist(word for word in entry['tokens'])
    hapax = [key for key, val in freq.items() if val == 1]
    dis = [key for key, val in freq.items() if val == 2]
    if len(dis) == 0 or len(entry['tokens']) == 0:
        return 0
    return (len(hapax) / len(dis)) / len(entry['tokens'])


def pass_fn(x):
    return x

class CustomTfIdfTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, key, analyzer, n=1, vocab=None):
        self.key = key
        if self.key == 'pos_tags' or self.key == 'tokens' or self.key == 'pos_tag_chunks' or self.key == 'pos_tag_chunk_subtrees':
            self.vectorizer = TfidfVectorizer(analyzer=analyzer, min_df=0.1, tokenizer=pass_fn, preprocessor=pass_fn, vocabulary=vocab, norm='l1', ngram_range=(1, n))
        else:
            self.vectorizer = TfidfVectorizer(analyzer=analyzer, min_df=0.1, vocabulary=vocab, norm='l1', ngram_range=(1, n))

    def fit(self, x, y=None):
        self.vectorizer.fit([entry[self.key] for entry in x], y)
        return self

    def transform(self, x):
        return self.vectorizer.transform([entry[self.key] for entry in x])
    
    def get_feature_names(self):
        return self.vectorizer.get_feature_names()
    
    
class CustomFreqTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, analyzer, n=1, vocab=None):
        self.vectorizer = TfidfVectorizer(tokenizer=pass_fn, preprocessor=pass_fn, vocabulary=vocab, norm=None, ngram_range=(1, n))

    def fit(self, x, y=None):
        self.vectorizer.fit([entry['tokens'] for entry in x], y)
        return self

    def transform(self, x):
        d = np.array([1 + len(entry['tokens']) for entry in x])[:, None]
        return self.vectorizer.transform([entry['tokens'] for entry in x]) / d
    
    def get_feature_names(self):
        return self.vectorizer.get_feature_names()
    
    
class CustomFuncTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, transformer_func, fnames=None):
        self.transformer_func = transformer_func
        self.fnames = fnames
        
    def fit(self, x, y=None):
        return self;
    
    def transform(self, x):
        xx = np.array([self.transformer_func(entry) for entry in x])
        if len(xx.shape) == 1:
            return xx[:, None]
        else:
            return xx
    
    def get_feature_names(self):
        if self.fnames is None:
            return ['']
        else:
            return self.fnames
        
        
def get_writeprints_transformer():
    char_distr = CustomTfIdfTransformer('preprocessed', 'char_wb', n=6)
    word_distr = CustomTfIdfTransformer('preprocessed', 'word', n=3)
    pos_tag_distr = CustomTfIdfTransformer('pos_tags', 'word', n=3)
    pos_tag_chunks_distr = CustomTfIdfTransformer('pos_tag_chunks', 'word', n=3)
    pos_tag_chunks_subtree_distr = CustomTfIdfTransformer('pos_tag_chunk_subtrees', 'word', n=1)
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{¦}~'
    special_char_distr = CustomTfIdfTransformer('preprocessed', 'char_wb', vocab=punctuation)
    freq_func_words = CustomFreqTransformer('word', vocab=stopwords.words('english'))
    
    transformer = FeatureUnion([
        ('char_distr', char_distr),
        #('word_distr', word_distr),
        ('pos_tag_distr', pos_tag_distr),
        ('pos_tag_chunks_distr', pos_tag_chunks_distr),
        ('pos_tag_chunks_subtree_distr', pos_tag_chunks_subtree_distr),
        ('special_char_distr', special_char_distr),
        ('freq_func_words', freq_func_words),
        ('hapax_legomena', CustomFuncTransformer(hapax_legomena)),
        ('character_count', CustomFuncTransformer(character_count)),
        ('distr_chars_per_word', CustomFuncTransformer(distr_chars_per_word, fnames=[str(i) for i in range(10)])),
        ('avg_chars_per_word', CustomFuncTransformer(avg_chars_per_word)),
        ('word_count', CustomFuncTransformer(word_count))
    ])
    
    return transformer

In [None]:
import torch
import torch.nn as nn
from torch.utils import data

# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout_rate):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.act = nn.Sigmoid()
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(self.batchnorm1(out))
        out = self.fc2(out)
        out = self.act(out)
        return out

In [None]:
# Load gound truth
ground_truth = {}
partition = {}
with open( PATH+'/dataset-strata-truth.jsonl', 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']
        r = np.random.rand() # Gives value between [0-1)
        if r < 0.7: # 70%
            partition[d['id']] = 'train'
        elif r < 0.85: # 15%
            partition[d['id']] = 'val'
        else: # 15%
            partition[d['id']] = 'test'

In [None]:
# Split datasets in to three sets and assign random order for train set
train_sz = len([v for v in partition.values() if v == 'train'])
test_sz = len([v for v in partition.values() if v == 'test'])
val_sz = len([v for v in partition.values() if v == 'val'])

In [None]:
with open(PATH + '/dataset-strata.jsonl', 'r') as f,\
    open( PATH + '/val_dataset.jsonl', 'w') as val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
        # e1 = prepare_entry(d['pair'][0])
        # e2 = prepare_entry(d['pair'][1])
        
        if partition[d['id']] == 'test':
            output_file = val
            json.dump({'id': d['id'], 'fandoms': d['fandoms'], 'pair': d['pair']}, output_file)
            output_file.write('\n')

HBox(children=(FloatProgress(value=0.0, max=1051.0), HTML(value='')))




In [None]:
with open(PATH + '/dataset-strata-truth.jsonl', 'r') as f,\
    open( PATH + '/op_ground_truth.jsonl', 'w') as val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
        # e1 = prepare_entry(d['pair'][0])
        # e2 = prepare_entry(d['pair'][1])
        
        if partition[d['id']] == 'test':
            output_file = val
            json.dump({'id': d['id'], 'same': d['same'], 'authors': d['authors']}, output_file)
            output_file.write('\n')

HBox(children=(FloatProgress(value=0.0, max=1051.0), HTML(value='')))




In [None]:
# Preprocess data and split to files
with open(PATH + '/dataset-strata.jsonl', 'r') as f,\
    open( PATH + '/process_docs_train.jsonl', 'w') as f_train,\
    open( PATH + '/process_docs_test.jsonl', 'w') as f_test,\
    open( PATH + '/process_docs_val.jsonl', 'w') as f_val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
       
        e1 = prepare_entry(d['pair'][0])
        e2 = prepare_entry(d['pair'][1])
        
        if partition[d['id']] == 'train':
            output_file = f_train
        elif partition[d['id']] == 'test':
            output_file = f_test
        elif partition[d['id']] == 'val':
            output_file = f_val
        else:
            raise('Invalid partition')
        json.dump({'id': d['id'], 'doc1': e1, 'doc2': e2}, output_file)
        output_file.write('\n')
    

HBox(children=(FloatProgress(value=0.0, max=1051.0), HTML(value='')))




In [None]:
with open(PATH + '/process_docs_train.jsonl', 'r') as f:
    docs = []
    for l in tqdm(f, total=train_sz):
#         Only retain a sample of records to train the transformer
        if np.random.rand() > 0.5:
            continue
            
        d = json.loads(l)
        docs.append(d['doc1'])
        docs.append(d['doc2'])

HBox(children=(FloatProgress(value=0.0, max=748.0), HTML(value='')))




In [None]:
print(docs[0])

{'preprocessed': 'But I KNOW, not think, that I don"t deserve Tohru. And it makes me painfully aware that Yuki needs her more that I do. I stopped in front of her door, to call her to dinner as I knocked my fist on the surface of the huge rectangular block of wood. "Dinner." I called out, knowing that she would understand what I meant. Immediately, I heard some sniffling, footsteps, and some more sniffling before the door before me was opened. "Ah, Kyou-kun," her sweet voice pronounced, hurriedly wiping her eyes with the back of her hand. I winced inwardly - her blue eyes were extremely bloodshot and tearstains can be definitely seen. Her hair was a mess, and her cheeks and nose were slightly pink, signaling that she really had been crying. "C"mon," I murmured to her, bringing my hand up to her face to wipe away a single tear. "Don"t cry, alright? Everything"s gonna be okay." I tried to comfort her with words, but that wasn"t my area. I brought my hand where it belonged, apparently bes

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
transformer = get_writeprints_transformer()
X = transformer.fit_transform(docs)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)
with open(PATH + '/transformers.p', 'wb') as f:
    pickle.dump((transformer, scaler), f)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [None]:
TEMP_DATA_DIR=PATH

In [None]:
train_idxs = np.array(range(train_sz))
np.random.shuffle(train_idxs)
#print(train_idxs.dtype)
with open(PATH + '/ordering_metadata.p', 'wb') as f:
    pickle.dump((train_sz, test_sz, val_sz, train_idxs), f)

In [None]:
print(train_idxs)

[312 169   0 273 648  29 379  47 653 417 236 622 440 346 698 335  35 186
  26 390 647 300 400  28 292 729 348 461 185 584 745 243 654 194 577  17
 707 681 560 276 472 734 674 523 274 485 209 307 691 723  73 326 561 102
 343 629 107 609  91 220 492 686 268 498 735 283 392 122 397 517 636 144
 282 153 407 695 230 668 148 619 398 507 702 108 677 244 353 576 580 232
 420 608 116 328 491 460 741 385 337 255 368 641 188 237 373 575 360  87
 133 323 717 528 227 193 469 406 109 123 631 113  39 633 513 104 222 324
 476   8 410  66 502 127  84 205  58 318 264 467 618 134  72 489 572 557
 314 378 539 313 549 546 401 733 154 238 201 520  33 404 490 511 315 416
 708 645 430 659 405 529 271 554 613 165 151 678 487 289 650 628 415 159
 607 571  94   2 699 419 331 213 627  44 658 679 199 342 488 454 591 480
 701 564 694 218 439 339 516 180 221 448 669 372  88 429 270 434 436 742
  22 515 189 217 675  55 198 555 393 664 569 586  71 350 203 506  64 187
 233 652 714 449 582 157 114 601  11 521  15 590 32

In [None]:
with open(PATH + '/transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)
    
with open(PATH + '/ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)

## Vectorizing Training Data

In [None]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(PATH + '/extracted_features_X_train.npy', dtype='float32', mode='w+', shape=(train_sz, feature_sz))
Y_train = np.memmap(PATH + '/extracted_features_Y_train.npy', dtype='int32', mode='w+', shape=(train_sz))

In [None]:
with open(PATH + '/process_docs_train.jsonl', 'r') as f:
    batch_size = 20000
    i = 0;
    docs1 = []
    docs2 = []
    idxs = []
    labels = []
    for l in tqdm(f, total=train_sz):
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(train_idxs[i])
        i += 1
        if len(labels) >= batch_size:
            x1 = scaler.transform(transformer.transform(docs1))
            x2 = scaler.transform(transformer.transform(docs2))
            X_train[idxs, :] = np.abs(x1-x2).todense()
            Y_train[idxs] = labels
            
            docs1 = []
            docs2 = []
            idxs = []
            labels = []
            
x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_train[idxs, :] = np.abs(x1-x2).todense()
Y_train[idxs] = labels

HBox(children=(FloatProgress(value=0.0, max=748.0), HTML(value='')))




In [None]:
print(Y_train)

[1 0 1 0 0 0 1 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0
 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1
 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0
 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0
 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 0 0 0 0 1 1 0 1 1 0
 1 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1
 1 1 0 0 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1
 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1
 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 0 1 0 1 1 1 1 0
 0 0 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 0 1 0
 0 0 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1
 1 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1
 0 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 1 0 1 1
 0 1 0 1 1 0 1 1 1 1 1 1 

## Vectorizing Test Data

In [None]:
# feature_sz = len(transformer.get_feature_names())
X_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_test.npy', dtype='float32', mode='w+', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_test.npy', dtype='int32', mode='w+', shape=(test_sz))

with open(TEMP_DATA_DIR + '/process_docs_test.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    labels = []
    idxs = []
    for l in f:
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1          


x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_test[idxs, :] = np.abs(x1-x2).todense()
Y_test[idxs] = labels

In [None]:
print(Y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


## Vectorizing Val Data

In [None]:
feature_sz = len(transformer.get_feature_names())
X_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_val.npy', dtype='float32', mode='w+', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_val.npy', dtype='int32', mode='w+', shape=(val_sz))

with open(TEMP_DATA_DIR + '/process_docs_val.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    labels = []
    idxs = []
    for l in f:
        if i % 10000 == 0:
            print(i)
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1


x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_val[idxs, :] = np.abs(x1-x2).todense()
Y_val[idxs] = labels

0


## Training SGD

In [None]:
with open(TEMP_DATA_DIR + '/ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + '/transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [None]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

In [None]:
X_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_test.npy', dtype='float32', mode='r+', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_test.npy', dtype='int32', mode='r+', shape=(test_sz))

In [None]:
clf = SGDClassifier(loss='log', verbose=True)

In [None]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [None]:
batch_size=1000 #1000
num_epochs = 200 #200
for i in range(num_epochs):
   # print('Epoch - ', i)
    print('-' * 30)
    for idxs in tqdm(batch(range(train_sz), batch_size), total=int(train_sz/batch_size) + 1):
        clf.partial_fit(X_train[idxs, :], Y_train[idxs], classes=[0, 1])
        
    probs = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    print('AUC: ', roc_auc)

-- Epoch 1
Norm: 349.67, NNZs: 36563, Bias: 135.233036, T: 748, Avg. loss: 0.000000
Total training time: 0.05 seconds.

AUC:  0.7698833510074231
------------------------------


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

-- Epoch 1
Norm: 347.93, NNZs: 36563, Bias: 135.233023, T: 748, Avg. loss: 0.000000
Total training time: 0.05 seconds.

AUC:  0.7698833510074231


In [None]:
print(Y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [None]:
Y_pred = clf.predict(X_test)
print(Y_pred)

[1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1
 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0]


In [None]:
from sklearn import metrics
print('AUC: ', roc_auc)
print('Accuracy:', clf.score(X_test, Y_test))
metrics.confusion_matrix(Y_test, Y_pred)


AUC:  0.7698833510074231
Accuracy: 0.7682119205298014


array([[49, 20],
       [15, 67]])

## Metrics for SGD

In [None]:
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show(renderer="colab")
print(roc_auc)

precision, recall, thresholds = precision_recall_curve(Y_test, probs)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show(renderer="colab")
print('AUC: ', auc(recall, precision))

0.7698833510074231


AUC:  0.8391319224771813


## Train Classifier : PyTorchNN

In [None]:
with open(TEMP_DATA_DIR + '/ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + '/transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [None]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

X_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_val.npy', dtype='float32', mode='r+', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_val.npy', dtype='int32', mode='r+', shape=(val_sz))

X_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [None]:
# Device configuration
device = torch.device('cpu')

# Hyper-parameters 
input_size = X_train.shape[1]
hidden_size = 500
num_classes = 1
num_epochs = 20
batch_size = 1000
dropout_rate = 0.9
learning_rate = 0.0001

In [None]:
train_dataset = data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train.astype('float32')))
val_dataset = data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(Y_val.astype('float32')))
test_dataset = data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(Y_test.astype('float32')))
 
# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)
 
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)
 
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)
 
model = NeuralNet(input_size, hidden_size, num_classes, dropout_rate).to(device)
 
# Loss and optimizer 
# create a loss function
criterion = nn.BCELoss() #Creates a criterion that measures the Binary Cross Entropy between the target and the output
# create a stochastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #A method for stochastic optimisation





In [None]:
# Train the model
training_loss = []
validation_loss = []
aucs = []
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    running_training_loss = 0.0
    for i, (x, y) in enumerate(train_loader):  
        # Move tensors to the configured device
        x = x.to(device)
        y = y.to(device).unsqueeze(1)
        
        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)
        
        running_training_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad() #
        loss.backward()
        optimizer.step()
    
    training_loss.append(running_training_loss / len(train_loader))
    
    model.eval()
    # Validation loop
    running_val_loss = 0.0
    actual = []
    preds = []
    
    best_auc = 0.0
    best_model = model
    for X_batch, y_batch in val_loader:
        with torch.no_grad():
            y_pred = model(X_batch)
            y_batch = y_batch.to(device).unsqueeze(1)
            loss = criterion(y_pred, y_batch)
            running_val_loss += loss.item()
            preds.extend(list(y_pred.numpy()[:, 0]))
            actual.extend(list(y_batch.numpy()))
            
    fpr, tpr, thresh = roc_curve(actual, preds)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = copy.deepcopy(model)
    validation_loss.append(running_val_loss / len(val_loader))


    print ('Epoch [{}/{}], Training Loss: {:.4f}, Val Loss: {:.4f}, AUC: {:.4f}' 
           .format(epoch+1, num_epochs, running_training_loss / len(train_loader), running_val_loss / len(val_loader), roc_auc))

Epoch [1/20], Training Loss: 1.0352, Val Loss: 0.9328, AUC: 0.7424
Epoch [2/20], Training Loss: 0.7008, Val Loss: 1.5049, AUC: 0.8276
Epoch [3/20], Training Loss: 0.5284, Val Loss: 2.1840, AUC: 0.8668
Epoch [4/20], Training Loss: 0.4084, Val Loss: 2.8243, AUC: 0.8828
Epoch [5/20], Training Loss: 0.3066, Val Loss: 3.3644, AUC: 0.8934
Epoch [6/20], Training Loss: 0.2538, Val Loss: 3.7921, AUC: 0.9010
Epoch [7/20], Training Loss: 0.2165, Val Loss: 4.0996, AUC: 0.9064
Epoch [8/20], Training Loss: 0.2067, Val Loss: 4.2971, AUC: 0.9082
Epoch [9/20], Training Loss: 0.1583, Val Loss: 4.3841, AUC: 0.9090
Epoch [10/20], Training Loss: 0.1606, Val Loss: 4.3948, AUC: 0.9118
Epoch [11/20], Training Loss: 0.1326, Val Loss: 4.3397, AUC: 0.9128
Epoch [12/20], Training Loss: 0.1206, Val Loss: 4.2348, AUC: 0.9141
Epoch [13/20], Training Loss: 0.0992, Val Loss: 4.0883, AUC: 0.9151
Epoch [14/20], Training Loss: 0.1065, Val Loss: 3.9154, AUC: 0.9148
Epoch [15/20], Training Loss: 0.0814, Val Loss: 3.7291, A

In [None]:
model.eval()

NeuralNet(
  (fc1): Linear(in_features=36593, out_features=500, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=500, out_features=1, bias=True)
  (batchnorm1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.9, inplace=False)
  (act): Sigmoid()
)

In [None]:
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = list(range(len(training_loss))),
    y = training_loss,
    mode='lines',
    name='Training Loss'
))
fig.add_trace(go.Scatter(
    x = list(range(len(validation_loss))),
    y = validation_loss,
    mode='lines',
    name='Validation Loss'
))
fig.show(renderer="colab")

In [None]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import TweetTokenizer

from tqdm import  tqdm
import json
import re
import os
import string
import argparse
import sys
import torch

TRANSFORMER_FILE = PATH+ '/transformers.p'
MODEL_FILE = '/content/drive/MyDrive/minor-project/best_model.pt'

class PANDatasetIterator(torch.utils.data.IterableDataset):

    def __init__(self, f_in, transformer, scaler):
        self.f_in = f_in
        self.transformer = transformer
        self.scaler = scaler

    def mapper(self, line):
        d = json.loads(line)
       
        x1 = scaler.transform(transformer.transform([prepare_entry(d['pair'][0])]).todense())
        x2 = scaler.transform(transformer.transform([prepare_entry(d['pair'][1])]).todense())
        x = np.abs(x1 - x2)[0, :].astype('float32')
        return x, d['id']
    

    def __iter__(self):
        f_itr = open(self.f_in, 'r')
        return map(self.mapper, f_itr)
    
class Args:
    i= PATH
    o= PATH


args=Args()
print(args.i)
    
    # validate:
if not args.i:
    raise ValueError('Eval dir path is required')
if not args.o:
    raise ValueError('Output dir path is required')
        
        
input_file = os.path.join(args.i, 'val_dataset.jsonl')
output_file = os.path.join(args.o, 'out.jsonl')
print("Writing answers to:", output_file , file=sys.stderr)
    
with open(TRANSFORMER_FILE, 'rb') as f:
    transformer, scaler = pickle.load(f)

with open(MODEL_FILE, 'rb') as f:
    best_model = torch.load(f)
    
device = torch.device('cpu')
ds = PANDatasetIterator(input_file, transformer, scaler)
test_loader = torch.utils.data.DataLoader(dataset=ds, batch_size=1000)

fout = open(output_file, 'a')
c = 0
with torch.no_grad():
    for x, ids in test_loader:
        x = x.to(device)
        outputs = best_model(x)
        probs = outputs.numpy()[:, 0].astype(float)

        for i in range(len(ids)):
            d = {
                'id': ids[i],
                'value': probs[i]
            }
            json.dump(d, fout)
            fout.write('\n')
        c += len(ids)
        print(c, file=sys.stderr)
        print('Written to', output_file, flush=True, file=sys.stderr)

fout.close()

In [None]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import TweetTokenizer

from tqdm import  tqdm
import json
import re
import os
import string
import argparse
import sys
import torch

TRANSFORMER_FILE = PATH+ '/transformers.p'
MODEL_FILE = '/content/drive/MyDrive/minor-project/best_model.pt'

class PANDatasetIterator(torch.utils.data.IterableDataset):

    def __init__(self, f_in, transformer, scaler):
        self.f_in = f_in
        self.transformer = transformer
        self.scaler = scaler

    def mapper(self, line):
        d = json.loads(line)
       
        x1 = scaler.transform(transformer.transform([prepare_entry(d['pair'][0])]).todense())
        x2 = scaler.transform(transformer.transform([prepare_entry(d['pair'][1])]).todense())
        x = np.abs(x1 - x2)[0, :].astype('float32')
        return x, d['id']
    

    def __iter__(self):
        f_itr = open(self.f_in, 'r')
        return map(self.mapper, f_itr)
    
class Args:
    i= PATH
    o= PATH


args=Args()
print(args.i)
    
    # validate:
if not args.i:
    raise ValueError('Eval dir path is required')
if not args.o:
    raise ValueError('Output dir path is required')
        
        
input_file = os.path.join(args.i, 'val_dataset.jsonl')
output_file = os.path.join(args.o, 'op_ground_truth.jsonl')
print("Writing answers to:", output_file , file=sys.stderr)
    
with open(TRANSFORMER_FILE, 'rb') as f:
    transformer, scaler = pickle.load(f)

with open(MODEL_FILE, 'rb') as f:
    best_model = torch.load(f)
    
device = torch.device('cpu')
ds = PANDatasetIterator(input_file, transformer, scaler)
test_loader = torch.utils.data.DataLoader(dataset=ds, batch_size=1000)

fout = open(output_file, 'a')
c = 0
with torch.no_grad():
    for x, ids in test_loader:
        x = x.to(device)
        print(x)
        # outputs = best_model(x)
        probs = x.numpy()[:, 0].astype(float)

        for i in range(len(ids)):
            d = {
                'id': ids[i],
                'value': probs[i]
            }
            json.dump(d, fout)
            fout.write('\n')
        c += len(ids)
        print(c, file=sys.stderr)
        print('Written to', output_file, flush=True, file=sys.stderr)

fout.close()

In [None]:
print(actual)
print(preds)
# confusion_matrix(actual, preds)

[array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=f

In [None]:
fpr, tpr, thresh = roc_curve(actual, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show(renderer="colab")
print(roc_auc)
 
 
print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(actual, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show(renderer="colab")
print('AUC: ', auc(recall, precision))

0.9145833333333333
P-R Curve


AUC:  0.9193317946373702


In [None]:
torch.save(best_model, TEMP_DATA_DIR + '/best_model.pt')

Logistic Regression

In [None]:
with open(TEMP_DATA_DIR + '/ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + '/transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [None]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, 100)) ## Instead of 100 it was feature_sz
Y_train = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

X_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_val.npy', dtype='float32', mode='r', shape=(val_sz, 100)) ## Instead of 100 it was feature_sz
Y_val = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_val.npy', dtype='int32', mode='r', shape=(val_sz))

X_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, 100)) ## Instead of 100 it was feature_sz
Y_test = np.memmap(TEMP_DATA_DIR + '/extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [None]:
X_train = np.array(X_train)

In [None]:
clf = LogisticRegression(solver='lbfgs', max_iter=50)
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
param_clf = RandomizedSearchCV(clf, distributions, random_state=0, verbose=2, scoring='roc_auc')
search = param_clf.fit(X_train, Y_train)
search.best_params_

In [None]:
search.best_params_

{'C': 1.5337660753031108, 'penalty': 'l2'}

In [None]:
# clf = LogisticRegression(C=0.2, solver='lbfgs', max_iter=5000, verbose=True)
clf.fit(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=True,
                   warm_start=False)

## Metrics of Logistic Regression

In [None]:
Y_pred = clf.predict(X_test)
metrics.confusion_matrix(Y_pred, Y_test)

array([[31, 30],
       [38, 52]])

In [None]:
preds = clf.predict_proba(X_test)[:, 1]
print('FPR-TRP Curve')

fpr, tpr, thresh = roc_curve(Y_test, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show(renderer="colab")
print(roc_auc)


print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(Y_test, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show(renderer="colab")
print('AUC: ', auc(recall, precision))

FPR-TRP Curve


0.5364086249558148
P-R Curve


AUC:  0.5849154250880106


In [None]:
Y_pred = clf.predict(X_test)
print(Y_pred)
print(Y_test)
score =accuracy_score(Y_test,Y_pred)
print("Accuracy: ",score)

[1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1
 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0
 0 0 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0
 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1 1 0 0
 0 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
Accuracy:  0.5496688741721855


In [None]:
with open(TEMP_DATA_DIR + 'LiniearRegressionModal.p', 'wb') as f:
    pickle.dump(clf, f)