In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

import pandas as pd

from pathlib import Path

import pickle

In [2]:
data_path = Path('../dataset/functions_classifier')
train_data_path = data_path / 'train.jsonl'
validation_data_path = data_path /'dev.jsonl'
test_data_path = data_path /'test.jsonl'

In [3]:
train_df = pd.read_json(train_data_path, orient='records', lines=True)
dev_df = pd.read_json(validation_data_path, orient='records', lines=True)
test_df = pd.read_json(test_data_path, orient='records', lines=True)

In [4]:
count_vect = CountVectorizer()

In [5]:
train_counts = count_vect.fit_transform(train_df['text'])
train_counts.shape

(268, 159)

In [6]:
tfidf_transformer = TfidfTransformer()

In [7]:
train_tfidf = tfidf_transformer.fit_transform(train_counts)
train_tfidf.shape

(268, 159)

In [8]:
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-4, random_state=42))])

text_clf_svm = text_clf_svm.fit(train_df['text'], train_df['label'])
predicted_svm = text_clf_svm.predict(test_df['text'])
balanced_accuracy_score(test_df['label'], predicted_svm)

0.9646401985111663

In [9]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 
              'tfidf__use_idf': (True, False), 
              'tfidf__norm': ('l1', 'l2'), 
              'clf-svm__alpha': (1e-2, 1e-3), 
              'clf-svm__penalty': ('none', 'l2', 'l1', 'elasticnet')}

In [10]:
gs_clf = GridSearchCV(text_clf_svm, parameters, scoring='balanced_accuracy', n_jobs=-1, cv=10, iid=True, verbose=True)
gs_clf = gs_clf.fit(train_df['text'], train_df['label'])
print(gs_clf.best_score_)
print(gs_clf.best_params_)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s


0.9640456640593823
{'clf-svm__alpha': 0.001, 'clf-svm__penalty': 'l2', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:    2.4s finished


In [11]:
svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svc', SVC(probability=True))])
svm_param_grid = {'vect__ngram_range': [(1, 1), (1, 2)], 
                  'tfidf__use_idf': (True, False), 
                  'tfidf__norm': ('l1', 'l2'), 
                  'clf-svc__kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 
#                   'gamma': [ 0.001, 0.01, 0.1, 1],
                  'clf-svc__C': [1, 10, 50, 100,200,300, 1000]}

In [12]:
grid_svm = GridSearchCV(svm, param_grid = svm_param_grid, scoring='roc_auc', iid=True, cv=10, n_jobs=-1, verbose = True)
grid_svm = grid_svm.fit(train_df['text'], train_df['label'])
print(grid_svm.best_score_)
print(grid_svm.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 224 candidates, totalling 2240 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 1696 tasks      | elapsed:    5.8s


0.9902847892888498
{'clf-svc__C': 1, 'clf-svc__kernel': 'poly', 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


[Parallel(n_jobs=-1)]: Done 2240 out of 2240 | elapsed:    7.5s finished


In [13]:
grid_svm_acc = GridSearchCV(svm, param_grid = svm_param_grid, scoring='balanced_accuracy', iid=True, cv=10, n_jobs=-1, verbose = True)
grid_svm_acc = grid_svm_acc.fit(train_df['text'], train_df['label'])
print(grid_svm_acc.best_score_)
print(grid_svm_acc.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 224 candidates, totalling 2240 fits


[Parallel(n_jobs=-1)]: Done 296 tasks      | elapsed:    1.2s


0.9640456640593823
{'clf-svc__C': 10, 'clf-svc__kernel': 'linear', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


[Parallel(n_jobs=-1)]: Done 2240 out of 2240 | elapsed:    8.2s finished


In [14]:
gs_clf = gs_clf.fit(pd.concat([train_df['text'], dev_df['text']]), pd.concat([train_df['label'], dev_df['label']]))
print(gs_clf.best_score_)
print(gs_clf.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 64 candidates, totalling 640 fits
0.9677008200975811
{'clf-svm__alpha': 0.001, 'clf-svm__penalty': 'none', 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:    1.2s finished


In [15]:
predicted_gs_clf = gs_clf.predict(test_df['text'])
balanced_accuracy_score(test_df['label'], predicted_gs_clf)

0.9646401985111663

In [16]:
model_path = '/media/discoD/models/scikit-learn/functions/judge_classifier.pkl'

In [17]:
# Save to file in the current working directory
with open(model_path, 'wb') as file:  
    pickle.dump(gs_clf, file)

# Load from file
with open(model_path, 'rb') as file:  
    pickle_model = pickle.load(file)

# Calculate the accuracy score and predict target values
prediction = pickle_model.predict(test_df['text'])
print("Test score: {0:.4f} %".format(100 * balanced_accuracy_score(test_df['label'], prediction)))  

Test score: 96.4640 %


In [18]:
pickle_model.predict(['Servidor Responsável'])

array([0])

In [22]:
pickle_model.predict(['Assistente de Juiz', 'Assistente do Juiz', 'Juiz Assistente', 'Juiz Substituto'])

array([0, 0, 0, 1])