In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

import pandas as pd

from pathlib import Path

import pickle

import pt_core_news_sm
import itertools

In [19]:
# Input files, resolved relative to the notebook's working directory.
# Main train/validation splits:
train_data_path = 'df_train.jsonl'
validation_data_path = 'df_valid.jsonl'
# Companion '*_title' splits:
train_title_data_path = 'df_train_title.jsonl'
validation_title_data_path = 'df_valid_title.jsonl'

# Earlier dataset layout, kept disabled for reference (it also declared a test split):
# data_path = Path('../dataset/functions_classifier')
# train_data_path = data_path / 'train.jsonl'
# validation_data_path = data_path /'dev.jsonl'
# test_data_path = data_path /'test.jsonl'

In [20]:
# Load the four JSON-lines files (one record per line), in the same order as before:
# the train/validation frames first, then their '*_title' counterparts.
train_df, dev_df, train_title_df, dev_title_df = (
    pd.read_json(jsonl_path, orient='records', lines=True)
    for jsonl_path in (
        train_data_path,
        validation_data_path,
        train_title_data_path,
        validation_title_data_path,
    )
)
# The test split is not loaded here; its read is kept disabled:
#test_df = pd.read_json(test_data_path, orient='records', lines=True)

In [4]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
count_vect = CountVectorizer()

In [5]:
# Fit the vectorizer on the training texts and build the document-term count matrix;
# the cell displays its (n_documents, n_terms) shape.
train_counts = count_vect.fit_transform(train_df['text'])
train_counts.shape

(7894, 94917)

In [6]:
# TF-IDF re-weighting step, created with scikit-learn's default settings.
tfidf_transformer = TfidfTransformer()

In [7]:
# Fit the TF-IDF transformer on the raw counts and re-weight them;
# the displayed shape matches the count matrix it was built from.
train_tfidf = tfidf_transformer.fit_transform(train_counts)
train_tfidf.shape

(7894, 94917)

In [8]:
# Baseline model: token counts -> TF-IDF weighting -> linear classifier trained
# with SGD on the hinge loss. The seed is fixed so the fit is repeatable.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-4, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(train_df['text'], train_df['label'])
predicted_svm = text_clf_svm.predict(train_df['text'])
# NOTE: this score is computed on the same data the model was fitted on, so it
# measures fit to the training set, not generalization.
balanced_accuracy_score(train_df['label'], predicted_svm)

0.9935949618446822

In [9]:
# Grid-search space. Keys use the '<pipeline step>__<parameter>' form, so each
# entry targets one step of the text_clf_svm pipeline.
parameters = {
    # 'vect' step: unigrams only vs. unigrams plus bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf' step: with/without inverse-document-frequency weighting, and row norm
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # 'clf-svm' step: regularization strength and penalty type
    'clf-svm__alpha': (1e-2, 1e-3),
    'clf-svm__penalty': ('none', 'l2', 'l1', 'elasticnet'),
}

In [10]:
# Exhaustive search over `parameters` with 10-fold cross-validation, scored by
# balanced accuracy and run on all available cores. The recorded run of this
# cell took roughly 112 minutes, so avoid re-running it needlessly.
# The `iid` argument is no longer passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, so passing it raises TypeError on current releases.
gs_clf = GridSearchCV(text_clf_svm, parameters, scoring='balanced_accuracy', n_jobs=-1, cv=10, verbose=True)
gs_clf = gs_clf.fit(train_df['text'], train_df['label'])
# Best mean cross-validated score and the parameter combination that achieved it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 30.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 77.5min
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed: 111.6min finished


0.8829313450804446
{'clf-svm__alpha': 0.001, 'clf-svm__penalty': 'none', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [14]:
# Sanity check: number of predictions the grid-searched model produces for the validation texts.
len(gs_clf.predict(dev_df['text']))

4251

In [15]:
def write_predictions(predictions, out_path, source=None):
    """Write predictions to a two-column CSV file with header ``id,category``.

    Each prediction goes on its own row, identified by its zero-based position
    in ``predictions``. A progress line is printed every 100 rows.

    :param predictions: Iterable of predicted labels. Each label is formatted
        with ``%s``, so string and numeric labels are both accepted.
    :param out_path: Destination file path; an existing file is overwritten.
    :param source: Optional name of the data the predictions were made for; it
        is used only in the first log line. When omitted, the module-level
        ``validation_data_path`` is used if it is defined.
    """
    if source is None:
        # Backward-compatible default that no longer fails with NameError when
        # the path-configuration cell has not been run.
        source = globals().get('validation_data_path', 'unknown source')

    count = 0
    # The context manager closes the file, so no explicit close() is needed.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions for %s' % source)
        out_file.write('id,category\n')
        for idx, result in enumerate(predictions):
            out_file.write('%d,%s\n' % (idx, result))
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [18]:
# Export the baseline pipeline's (text_clf_svm) predictions for the validation texts.
write_predictions(text_clf_svm.predict(dev_df['text']), 'submissions_text_clf_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [17]:
# Export the grid-searched model's (gs_clf) predictions for the validation texts.
write_predictions(gs_clf.predict(dev_df['text']), 'submissions_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [72]:
class DocumentsIterator(object):
    def __init__(self, dataframe, has_labels=True):
        """
        Iterable over the documents of a dataframe that yields, for each one, the
        lowercase lemmas of its alphabetic, non-stop-word tokens.

        :param dataframe: Data with a 'text' column and, when ``has_labels`` is
            True, a 'label' column.
        :param has_labels: Whether each yielded item also carries the document's label.
        """
        self.dataframe = dataframe
        self.has_labels = has_labels
        self.nlp = pt_core_news_sm.load()
        # The parser and NER components are removed; this class never reads their output.
        self.nlp.remove_pipe('parser')
        self.nlp.remove_pipe('ner')
        # One stop word per line. The context manager closes the file handle,
        # which the previous bare open() call leaked.
        with open('stopwords-pt.txt', mode='r', encoding='utf8') as stopwords_file:
            self.stopwords = [line.replace('\n', '') for line in stopwords_file]

    def _filtered_lemmas(self, doc, stopwords):
        """Return the lowercase lemmas of the alphabetic tokens of ``doc`` whose
        lowercase text is not in ``stopwords``."""
        return [token.lemma_.lower() for token in doc
                if token.is_alpha and token.text.lower() not in stopwords]

    def get_relevant_tokens(self):
        """
        Run the texts through the spaCy pipeline and yield one dict per document.

        A token is kept when it is alphabetic (``token.is_alpha``) and its
        lowercase text is not a stop word; its lemma is emitted in lowercase.

        Note: an earlier design filtered by spaCy part-of-speech tag, keeping
        ADJ, NOUN, PROPN and VERB and discarding ADP, ADV, AUX, CCONJ, DET,
        INTJ, NUM, PART, PRON, PUNCT, SCONJ, SYM and X. That filter is not
        applied by this code.

        :return: Generator of ``{'label': label, 'text': [lemmas]}`` dicts when
            ``has_labels`` is True, otherwise ``{'text': [lemmas]}`` dicts.
        """
        print('pre-processing the documents')
        # Snapshot as a set so each membership test is O(1) instead of a list scan.
        stopwords = set(self.stopwords)
        # `n_threads` is no longer passed: spaCy deprecated it in v2.1 and later
        # releases reject it.
        docs = self.nlp.pipe(self.dataframe['text'], batch_size=100)
        if self.has_labels:
            # Pair labels with documents by position. The previous `labels[index]`
            # was a label-based lookup and failed (or mismatched) when the
            # dataframe index was not 0..n-1.
            for label, doc in zip(self.dataframe['label'], docs):
                yield {'label': label, 'text': self._filtered_lemmas(doc, stopwords)}
        else:
            for doc in docs:
                yield {'text': self._filtered_lemmas(doc, stopwords)}

    def __iter__(self):
        return iter(self.get_relevant_tokens())

In [81]:
def get_processed_df(dataframe, name_json, has_labels=True):
    """Pre-process every document of ``dataframe`` and save the result as JSON lines.

    Each document is run through ``DocumentsIterator``; its tokens are joined
    with single spaces into one string. Progress is printed every 500 documents.

    :param dataframe: Input data handed to ``DocumentsIterator``.
    :param name_json: Path of the JSON-lines file the processed frame is written to.
    :param has_labels: When True the result has 'label' and 'text' columns,
        otherwise only 'text'.
    :return: The processed DataFrame that was written to ``name_json``.
    """
    columns = {'label': [], 'text': []} if has_labels else {'text': []}
    documents = DocumentsIterator(dataframe, has_labels)
    for n_done, doc in enumerate(documents, start=1):
        if has_labels:
            columns['label'].append(doc['label'])
        columns['text'].append(' '.join(doc['text']))
        if n_done % 500 == 0:
            print('Processed %d documents' % n_done)
    processed_df = pd.DataFrame(columns)
    processed_df.to_json(name_json, orient='records', lines=True)
    return processed_df

In [82]:
# Pre-process the training set (labels included) and save it to 'df_train_lemma_no_stop.jsonl'.
train_processed_df = get_processed_df(train_df, 'df_train_lemma_no_stop.jsonl')

pre-processing the documents
Processed 500 documents
Processed 1000 documents
Processed 1500 documents
Processed 2000 documents
Processed 2500 documents
Processed 3000 documents
Processed 3500 documents
Processed 4000 documents
Processed 4500 documents
Processed 5000 documents
Processed 5500 documents
Processed 6000 documents
Processed 6500 documents
Processed 7000 documents
Processed 7500 documents


In [88]:
# Reload the pre-processed training set from the file written by get_processed_df and display it.
train_processed_df = pd.read_json('df_train_lemma_no_stop.jsonl', orient='records', lines=True)
train_processed_df

Unnamed: 0,label,text
0,comida,casa barra funda clima roceiro receitar saboro...
1,educacao,professores sp decidir manter greve fechar pis...
2,empreendedorsocial,edição concurso pago pequeno empresar prêmio f...
3,equilibrioesaude,maconha saudar físico estudar esforçar entende...
4,ciencia,percorrer km revelar cientista roto migração i...
5,empreendedorsocial,líderes inovador reunem redar compartilhar ide...
6,turismo,conheça pandora atração milionário disney insp...
7,empreendedorsocial,fiesp organizar edição maratona hacker atenção...
8,turismo,praia forte misturar natureza resorts estrutur...
9,turismo,app agência avisar vestir passaporte vencer ap...


In [85]:
# Pre-process the validation set without labels and save it to 'df_valid_lemma_no_stop.jsonl'.
valid_processed_df = get_processed_df(dev_df, 'df_valid_lemma_no_stop.jsonl', has_labels=False)

pre-processing the documents
Processed 500 documents
Processed 1000 documents
Processed 1500 documents
Processed 2000 documents
Processed 2500 documents
Processed 3000 documents
Processed 3500 documents
Processed 4000 documents


In [90]:
# Reload the pre-processed validation set from the file written by get_processed_df and display it.
valid_processed_df = pd.read_json('df_valid_lemma_no_stop.jsonl', orient='records', lines=True)
valid_processed_df

Unnamed: 0,text
0,vitrine dilma pronatec orçamentar compassar es...
1,direito autoral publicidade youtubers reclamar...
2,rótulos alimento alertar lactose decidir anvis...
3,sociedade britânico compositor processar sound...
4,fies aluno madrugar portar fmu conseguir datar...
5,cientistas aguardar nascimento raro dragão cav...
6,aplicativo mostrar voar turismo preparar seleç...
7,pesquisas indicar sono direção perigoso álcool...
8,brasil lançar missão lua estudar vidar espaçar...
9,cientistas testar vacinar colesterol doença ca...


In [96]:
# `stopwords` was never defined in the notebook, so this cell raised NameError on
# a fresh kernel. Load the list here, one word per line, skipping blank lines.
# NOTE(review): assumes the intended list is 'stopwords-pt.txt', the same file
# DocumentsIterator reads; confirm.
with open('stopwords-pt.txt', mode='r', encoding='utf8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file if line.strip()]

# Same architecture as the baseline pipeline, but the vectorizer drops stop words
# and the SGD solver gets more iterations and a tighter tolerance.
processed_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=stopwords)), ('tfidf', TfidfTransformer()),
                              ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42))])

# NOTE(review): despite its name, this model is fitted on the raw train_df texts,
# not on train_processed_df; confirm that this is intended.
processed_clf_svm = processed_clf_svm.fit(train_df['text'], train_df['label'])
predicted_svm = processed_clf_svm.predict(train_df['text'])
# Scored on the data the model was fitted on, so this is a training-set score.
balanced_accuracy_score(train_df['label'], predicted_svm)

0.993320976411534

In [98]:
# Export the stop-word-filtering pipeline's (processed_clf_svm) predictions for the validation texts.
write_predictions(processed_clf_svm.predict(dev_df['text']), 'submissions_stop_clf_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [16]:
# Where the pickled model is written and read back.
# NOTE(review): absolute, machine-specific path; it only works where this directory exists.
model_path = '/media/discoD/models/scikit-learn/functions/judge_classifier.pkl'

In [17]:
# Persist the fitted grid search (gs_clf) to model_path.
with open(model_path, 'wb') as file:  
    pickle.dump(gs_clf, file)

# Load it back from the same file.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open(model_path, 'rb') as file:  
    pickle_model = pickle.load(file)

# Predict with the reloaded model and print its balanced accuracy as a percentage.
# NOTE(review): test_df is not defined anywhere in the visible notebook (the
# read_json call that would load it is commented out), so this fails with
# NameError on a fresh kernel; confirm where the test split should come from.
prediction = pickle_model.predict(test_df['text'])
print("Test score: {0:.4f} %".format(100 * balanced_accuracy_score(test_df['label'], prediction)))  

Test score: 96.4640 %


In [18]:
# Ad-hoc check: classify one hand-written string with the reloaded model.
# NOTE(review): the recorded output is an integer label, unlike the string
# categories seen in train_df; this cell may come from a different experiment. Confirm.
pickle_model.predict(['Servidor Responsável'])

array([0])

In [22]:
# Ad-hoc check: classify several hand-written strings with the reloaded model in one call.
pickle_model.predict(['Assistente de Juiz', 'Assistente do Juiz', 'Juiz Assistente', 'Juiz Substituto'])

array([0, 0, 0, 1])