In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from utils.nbsvm import NBSVM
from sklearn.model_selection import cross_val_score

In [2]:
# Corpus root: every entry listed directly under this directory is treated
# as one class folder by the loading loop.
path = 'hudoc/Article3'

In [3]:
data = []

folders = !ls {path}
for idx, folder in enumerate(folders):
    files = !ls {path}/{folder}
    for file in files:
        data.append({'id': file[:-4], 'text': open(f'{path}/{folder}/{file}').read(), 'label': idx})

In [4]:
# One row per document; columns come from the record keys.
df = pd.DataFrame(data)

In [5]:
# Fixed seed so the row order (and everything derived from it) is
# reproducible across kernel restarts.
df = shuffle(df, random_state=42)

In [6]:
# Show the document whose index label is 0 (label-based lookup, not
# necessarily the first row after shuffling).
df.loc[0, 'text']

"FIRST SECTION      CASE OF BENUYEVA AND OTHERS v. RUSSIA (Application no. 8347/05)        JUDGMENT   STRASBOURG  22 July 2010 FINAL 22/11/2010 This judgment has become final under Article 44 § 2 of the Convention. It may be subject to editorial revision.In the case of Benuyeva and Others v. Russia,The European Court of Human Rights (First Section), sitting as a Chamber composed of:Christos Rozakis, President,Anatoly Kovler,Elisabeth Steiner,Dean Spielmann,Sverre Erik Jebens,Giorgio Malinverni,George Nicolaou, judges,and Søren Nielsen, Section Registrar,Having deliberated in private on 1 July 2010,Delivers the following judgment, which was adopted on that date:PROCEDURE1.\nThe case originated in an application (no. 8347/05) against the Russian Federation lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by fifteen Russian nationals listed below (“the applicants”), on 25 February 2005.2.\nThe applicant

In [7]:
# Character length of every document, followed by summary statistics.
lens = df['text'].str.len()
(lens.mean(), lens.std(), lens.max())

(54409.32914738929, 68524.50361557271, 1028496)

In [8]:
# Persist the shuffled frame to the working directory; the index is written
# as the first column (pandas default).
df.to_csv('Article3.csv')

In [10]:
# df = pd.read_csv('Article3.csv', index_col=0)  # reload the cached frame instead of re-reading the corpus

In [11]:
# Histogram of document lengths (in characters).
lens.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f97dae76ac8>

In [12]:
# Peek at the first rows of the shuffled frame.
df.head()

Unnamed: 0,id,label,text
464,001-61940,0,FOURTH SECTION CASE OF İKİNCİSOY v. TURKEY (...
1005,001-146388,1,FIRST SECTION CASE OF MAKAYEVA v. RUSSIA...
100,001-111177,0,FIRST SECTION CASE OF SHAKUROV v. RUSSIA ...
1091,001-152259,1,FIRST SECTION CASE OF M.S. v. CROATIA (N...
101,001-111178,0,FIRST SECTION CASE OF KOZHAYEV v. RUSSIA ...


In [13]:
label_cols = ['label']
# 'none' is the complement of the largest label value in each row.
df['none'] = 1 - df[label_cols].max(axis='columns')
df.describe()

Unnamed: 0,label,none
count,1513.0,1513.0
mean,0.543291,0.456709
std,0.498287,0.498287
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [14]:
import re, string

# Character class of every ASCII punctuation mark plus the listed typographic
# symbols. string.punctuation is escaped before being embedded: unescaped, its
# "\]" sequence is read as an escaped "]" and the backslash itself silently
# drops out of the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split `s` into tokens, with each punctuation mark as its own token.

    Every character matched by `re_tok` is padded with a space on both sides,
    then the string is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [15]:
# Hold out 20% of the documents; the seed keeps the split (and the reported
# scores) reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
n = train.shape[0]
# Word 1- to 4-gram TF-IDF features using the custom punctuation-aware
# tokenizer. The boolean options are passed as real booleans: recent
# scikit-learn releases validate parameter types and reject the ints 1/0.
vec = TfidfVectorizer(ngram_range=(1,4), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True, max_features=100000 )
# Fit the vocabulary on the training split only, then reuse it for the test split.
trn_term_doc = vec.fit_transform(train['text'])
test_term_doc = vec.transform(test['text'])

In [17]:
# Inspect the shape and number of stored elements of both term-document matrices.
trn_term_doc, test_term_doc

(<1210x100000 sparse matrix of type '<class 'numpy.float64'>'
 	with 10639972 stored elements in Compressed Sparse Row format>,
 <303x100000 sparse matrix of type '<class 'numpy.float64'>'
 	with 2612560 stored elements in Compressed Sparse Row format>)

In [18]:
def pr(y_i, y):
    """Smoothed per-feature total over the rows of `x` whose target equals `y_i`.

    Reads the module-level matrix `x`. The column sums over the selected rows
    and the number of selected rows are each incremented by one before dividing.
    """
    selected = (y == y_i)
    feature_totals = x[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

In [19]:
# pr() and get_mdl() read `x` as a module-level global, so it must be bound
# before either is called.
x = trn_term_doc
test_x = test_term_doc

In [20]:
def get_mdl(y):
    """Fit a logistic regression on log-count-ratio-scaled features.

    Reads the module-level training matrix `x` (directly and through `pr`).

    Parameters
    ----------
    y : object exposing `.values` (a DataFrame column in this notebook)
        Targets aligned with the rows of `x`; they are compared against 1 and 0.

    Returns
    -------
    (model, r)
        The fitted LogisticRegression and the ratio
        log(pr(1, y) / pr(0, y)) used to scale the features. The same `r`
        has to be applied to any matrix later passed to the model.
    """
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    # dual=True is only implemented by the liblinear solver. Newer
    # scikit-learn versions default to lbfgs, which raises on dual=True, so
    # the solver is pinned explicitly.
    m = LogisticRegression(C=50, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [21]:
# One column of positive-class probabilities per entry of label_cols.
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # Scale the test features with the same ratio the model was trained on.
    test_x_nb = test_x.multiply(r)
    preds[:, i] = m.predict_proba(test_x_nb)[:, 1]

fit label


In [22]:
# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
preds_y = (preds > 0.5).astype(int)

In [23]:
# .as_matrix() is deprecated and was removed in pandas 1.0; .values returns
# the same ndarray and is what the rest of this notebook already uses.
test_y = test['label'].values

  """Entry point for launching an IPython kernel.


In [24]:
# Check the prediction array's shape before comparing it with the targets.
preds_y.shape

(303, 1)

In [25]:
# Turn test_y into a single-column 2-D array so the element-wise comparison
# with the 2-D preds_y does not broadcast.
test_y = test_y.reshape(-1, 1)

In [26]:
# Fraction of test documents whose thresholded prediction matches the target.
((preds_y == test_y) * 1).sum(axis=0) / len(test_y)

array([0.75247525])

In [27]:
# NBSVM is imported from the project's utils.nbsvm module; the exact meaning
# of alpha and beta is defined there — TODO confirm before tuning.
clf = NBSVM(C=1000, alpha=0.00001, beta=0.25)

In [28]:
# .as_matrix() is deprecated and was removed in pandas 1.0; .values returns
# the same ndarray and is what the rest of this notebook already uses.
y_train = train['label'].values

  """Entry point for launching an IPython kernel.


In [29]:
# Alias the TF-IDF training matrix under the name used by the
# cross-validation call.
X_train = trn_term_doc

In [30]:
# Derive the row count from the matrix itself: a hard-coded 1210 only holds
# for one particular corpus size and split. The result is only displayed;
# X_train is not reassigned.
X_train.reshape(X_train.shape[0], -1)

<1210x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 10639972 stored elements in Compressed Sparse Row format>

In [None]:
# Mean score over 10 cross-validation folds on the training split.
score = cross_val_score(clf, X_train, y_train, cv=10).mean()