In [116]:
import pandas as pd

In [117]:
DATA = 'ech_dataset'

In [118]:
articles = !ls echr_dataset

In [119]:
for article in articles:
    path = f'{DATA}/{article}'

In [170]:
!ls echr_dataset/Article6/

cases_a6.csv		     ngrams_a6_full.csv       ngrams_a6_relevantLaw.csv
ngrams_a6_circumstances.csv  ngrams_a6_law.csv	      topics6.csv
ngrams_a6_featureNames.txt   ngrams_a6_procedure.csv  topics6_vocab.txt


In [243]:
def get_article(path, features):
    files_raw = !ls {path}
    files_features = ['cases', 'circumstances', 'featureNames', 'full', 'law', 'procedure', 'relevantLaw', 'topics', 'topicsVocab']
    files = dict(zip(files_features, files_raw))
    
    group = []
    for feature in features:
        if(feature == 'topics'): raw = pd.read_csv(path+files[feature], sep='\t', header=None)
        elif(feature == 'topicsVocab'): raw = open(path+files[feature]).read(); raw.replace('\n', '').replace(', ', ',').split(',')
        else: raw = pd.read_csv(path+files[feature], header=None)
        
        if feature == 'cases': raw = raw[1]
            
        group.append(raw)
    return pd.concat(group, axis=1, ignore_index=True)

In [254]:
!ls echr_dataset

Article3  Article6  Article8


In [263]:
data = get_article('echr_dataset/Article8/', ['cases', 'full'])

In [264]:
X = data.iloc[:, 1:]

In [301]:
y = data[0]

In [273]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0.004009,0.0,0.0,0.0,0.001145,0.0,0.0,0.0,0.000573,0.004582,...,0.0,0.0,0.0,0.0,0.000573,0.002291,0.0,0.001145,0.0,0.0
1,0.000156,0.0,0.0,0.000156,0.000781,0.0,0.0,0.0,0.0,0.001093,...,0.010303,0.000312,0.000312,0.0,0.0,0.000468,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000354,0.0,0.0,0.0,0.0,0.001772,0.000709,...,0.000354,0.0,0.000354,0.0,0.0,0.002481,0.000709,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005333,...,0.0,0.0,0.0,0.0,0.0,0.003556,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [266]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

import numpy as np

In [267]:
param_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
]

In [268]:
clf = make_pipeline(preprocessing.StandardScaler(), 
                    GridSearchCV(SVC(),
                                 param_grid=param_grid, cv=10, refit=True))

In [269]:
scores = cross_val_score(clf, X, y, cv=10)

In [270]:
np.mean(scores)

0.7115384615384616

In [272]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim

In [326]:
ITERS = 1000
max_features = 50
batch_size = 50

In [327]:
class BIGRU(nn.Module):
    def __init__(self):
        super(BIGRU, self).__init__()

        self.embedding = nn.Embedding(max_features, 128)
        self.gru = nn.GRU(128, 64, num_layers=1, bidirectional=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(128, 1)

    def forward(self, x, h):
        x = self.embedding(x)
        x, h = self.gru(x, h)
        x = self.dropout(x[:,-1,:].squeeze())
        x = F.sigmoid(self.linear(x))
        return x, h

    def init_hidden(self):
        return autograd.Variable(torch.randn(2, batch_size, 64))

In [328]:
model = BIGRU()

In [329]:
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [330]:
num_batch_epoch = len(X) // batch_size

In [331]:
h = model.init_hidden()

In [332]:
model.train()

BIGRU(
  (embedding): Embedding(50, 128)
  (gru): GRU(128, 64, bidirectional=True)
  (dropout): Dropout(p=0.5)
  (linear): Linear(in_features=128, out_features=1, bias=True)
)

In [333]:
import time

In [334]:
y = np.array([1 if i == 'v' else 0  for i in y])

In [335]:
for e in range(ITERS):
    print('\n' + 'Epoch {}/{}'.format(e, ITERS))
    print('-' * 10)
    start = time.time()

    idx = np.random.permutation(len(X))
    iter_loss = 0.
    iter_correct = 0.
    for b in range(len(X) // batch_size):
        x_batch = torch.from_numpy(X.as_matrix()[idx[b*batch_size:(b+1)*batch_size]]).long()
        y_batch = torch.from_numpy(y[idx[b*batch_size:(b+1)*batch_size]]).float()

        h.detach_()
        y_pred, h = model(x_batch, h)

        optimizer.zero_grad()

        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        
        optimizer.step()

        trn_preds = torch.round(y_pred.data)
        iter_correct += torch.sum(trn_preds == y_batch.data)
        iter_loss += loss.data[0]

    print('Training Loss: {:.3} | Training Acc: {:.3}'.format(iter_loss / num_batch_epoch, float(iter_correct) / num_batch_epoch))
    print('Time: {}'.format(time.time()-start))


Epoch 0/1000
----------


  # Remove the CWD from sys.path while we load stuff.


RuntimeError: Expected hidden size (2, 2000, 64), got (2, 50, 64)