In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

import pandas as pd

from pathlib import Path

import pickle

In [2]:
# Input CSV files, given as paths relative to the working directory.
# An earlier variant of this notebook read train/dev/test JSONL files from
# ../dataset/functions_classifier and also assigned `validation_data_path`;
# that variable is NOT assigned by this cell.
train_data_path, test_data_path = 'train_imdb.csv', 'test_imdb.csv'

In [3]:
# Read the train and test CSVs with identical options: header=None keeps the
# first line as data, and the column names are supplied explicitly.
# (An earlier variant loaded the test set from a JSON-lines file instead.)
train_df, test_df = (
    pd.read_csv(csv_path, header=None, names=['label', 'text'])
    for csv_path in (train_data_path, test_data_path)
)

In [4]:
# Preview only the first rows rather than asking the notebook to render the
# whole training frame.
train_df.head(10)

Unnamed: 0,label,text
0,0,**1/2 for this Diane Keaton farce.<br /><br />...
1,1,This episode apparently grew out of the cold w...
2,0,"I haven't read a biography of Lincoln, so mayb..."
3,1,I was young film student in 1979 when the Unio...
4,0,"""Black Angel"" is minor whodunit, with June Vin..."
5,0,I didn't agree with any of the theology in the...
6,0,Dear me... Peter Sellers was one of the most o...
7,0,This has to be one of the top overrated anime ...
8,0,This was the worst movie I have ever seen and ...
9,0,I'm not sure what HK movies the other reviewer...


In [5]:
from nltk.corpus import stopwords

# English stop words from NLTK (the NLTK 'stopwords' corpus must be available).
# Kept as a list rather than a set: CountVectorizer documents `stop_words` as
# 'english', a list, or None, and scikit-learn releases that validate
# constructor parameters reject a set.
stopwords_en = list(stopwords.words('english'))

In [6]:
# Stand-alone bag-of-words vectorizer with default settings (no stop-word
# filtering), created to inspect the raw term-count matrix in the next cell.
count_vect = CountVectorizer()

In [7]:
# Learn the vocabulary from the training texts and build the document-term
# count matrix in one step; the trailing expression displays its shape.
train_counts = count_vect.fit_transform(train_df['text'])
train_counts.shape

(25000, 74849)

In [8]:
# Stand-alone TF-IDF transformer with default settings, applied to
# `train_counts` in the next cell.
tfidf_transformer = TfidfTransformer()

In [9]:
# Fit the IDF weights on the raw counts and re-weight them; the trailing
# expression displays the shape of the resulting matrix.
train_tfidf = tfidf_transformer.fit_transform(train_counts)
train_tfidf.shape

(25000, 74849)

In [10]:
# Baseline classifier: stop-word-filtered term counts -> TF-IDF weighting ->
# linear model trained with SGD on the hinge loss (fixed seed for repeatability).
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words=stopwords_en)),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-4, random_state=42)),
    ]
)

# fit() returns the pipeline itself, so the name does not need rebinding.
text_clf_svm.fit(train_df['text'], train_df['label'])

# Predict the test split and display the balanced accuracy as the cell result.
predicted_svm = text_clf_svm.predict(test_df['text'])
balanced_accuracy_score(y_true=test_df['label'], y_pred=predicted_svm)

0.8835200000000001

In [9]:
# Hyper-parameter grid for the `text_clf_svm` pipeline. Keys follow the
# `<step name>__<parameter>` convention; 2 * 2 * 2 * 2 * 4 = 64 candidates.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf-svm__alpha': (1e-2, 1e-3),
    # Use None (not the string 'none') to disable the penalty: None is the
    # documented value and the string form is rejected by scikit-learn
    # releases that validate estimator parameters.
    'clf-svm__penalty': (None, 'l2', 'l1', 'elasticnet'),
}

In [10]:
# Exhaustive search over `parameters` with 10-fold cross-validation, scored by
# balanced accuracy and run on all available cores. This is expensive: the
# recorded run performed 640 fits and took roughly 110 minutes.
#
# `iid` is deliberately not passed. The argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
gs_clf = GridSearchCV(
    text_clf_svm,
    parameters,
    scoring='balanced_accuracy',
    n_jobs=-1,
    cv=10,
    verbose=True,
)
gs_clf = gs_clf.fit(train_df['text'], train_df['label'])

# Best mean cross-validated score and the parameter combination that achieved it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 30.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 77.5min
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed: 111.6min finished


0.8829313450804446
{'clf-svm__alpha': 0.001, 'clf-svm__penalty': 'none', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [14]:
# Number of predictions the tuned model produces for the dev split.
# NOTE(review): `dev_df` is not defined in any cell shown above — confirm
# where it is loaded before relying on Restart & Run All.
len(gs_clf.predict(dev_df['text']))

4251

In [15]:
def write_predictions(predictions, out_path, source=None):
    """Write predictions to a two-column CSV file with header ``id,category``.

    Each prediction becomes one line ``<index>,<label>``, where the index is
    the 0-based position of the prediction in ``predictions``.

    Args:
        predictions: Iterable of predicted labels. Every label is converted
            with ``str()``, so integer / NumPy labels work as well as strings.
        out_path: Destination file path. The file is overwritten and written
            as UTF-8.
        source: Optional name of the data the predictions were made for; it
            is only used in the first progress message. When omitted, the
            notebook-level ``validation_data_path`` variable is used if it
            exists, otherwise ``out_path``.

    Returns:
        None. Progress is printed to stdout every 100 rows, followed by a
        summary with the row count and the absolute output path.
    """
    if source is None:
        # Earlier versions read the global `validation_data_path` directly and
        # raised NameError when the notebook had not defined it. Look it up
        # defensively so the message is unchanged when it does exist.
        source = globals().get('validation_data_path', out_path)

    count = 0
    # The `with` block closes the file; no explicit close() is needed.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions for %s' % (source,))
        out_file.write('id,category\n')
        for idx, result in enumerate(predictions):
            # str() keeps non-string labels (e.g. integer classes) from
            # breaking the string concatenation.
            out_file.write(str(idx) + ',' + str(result) + '\n')
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [18]:
# Export the baseline pipeline's predictions for the dev split as a CSV file.
# NOTE(review): `dev_df` is not defined in any cell shown in this notebook —
# confirm where it is loaded.
write_predictions(text_clf_svm.predict(dev_df['text']), 'submissions_text_clf_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [17]:
# Export the grid-search model's predictions for the dev split as a CSV file.
# NOTE(review): `dev_df` is not defined in any cell shown in this notebook —
# confirm where it is loaded.
write_predictions(gs_clf.predict(dev_df['text']), 'submissions_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [15]:
# Predict the test split with the grid-search object and display the
# balanced accuracy as the cell result.
# NOTE(review): the recorded value (0.9646) is far above both the baseline
# test score (0.8835) and the best cross-validated score (0.8829) recorded
# earlier — possibly a stale output from a different dataset; re-run to confirm.
predicted_gs_clf = gs_clf.predict(test_df['text'])
balanced_accuracy_score(test_df['label'], predicted_gs_clf)

0.9646401985111663

In [16]:
# Destination file for the pickled model.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# fail on machines where the directory does not exist — consider deriving it
# from a configurable base directory.
model_path = '/media/discoD/models/scikit-learn/functions/judge_classifier.pkl'

In [17]:
# Persist the fitted grid-search object to `model_path`.
with open(model_path, 'wb') as model_out:
    pickle.dump(gs_clf, model_out)

# Read it straight back to verify the round trip. Only unpickle files you
# trust: loading a pickle can execute arbitrary code.
with open(model_path, 'rb') as model_in:
    pickle_model = pickle.load(model_in)

# Re-score the reloaded model on the test split; the balanced accuracy is
# reported as a percentage.
prediction = pickle_model.predict(test_df['text'])
print(
    "Test score: {0:.4f} %".format(
        100 * balanced_accuracy_score(test_df['label'], prediction)
    )
)

Test score: 96.4640 %


In [18]:
# Spot-check the reloaded model on a single hand-written string; the cell
# result is the array of predicted labels.
pickle_model.predict(['Servidor Responsável'])

array([0])

In [22]:
# Spot-check the reloaded model on several similar hand-written strings to
# compare the predicted labels side by side.
pickle_model.predict(['Assistente de Juiz', 'Assistente do Juiz', 'Juiz Assistente', 'Juiz Substituto'])

array([0, 0, 0, 1])