In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

import pandas as pd

from pathlib import Path

import pickle

import pt_core_news_sm
import itertools

In [2]:
# Earlier repository layout, kept for reference only:
# data_path = Path('../dataset/functions_classifier')
# train_data_path = data_path / 'train.jsonl'
# validation_data_path = data_path /'dev.jsonl'
# test_data_path = data_path /'test.jsonl'

# Full-text splits (JSON Lines files in the working directory).
train_data_path = 'df_train.jsonl'
validation_data_path = 'df_valid.jsonl'

# Title-only splits.
train_title_data_path = 'df_train_title.jsonl'
validation_title_data_path = 'df_valid_title.jsonl'

In [3]:
# Every split is stored as JSON Lines (one record per line), so all four
# frames are read with the same options.

# Full-text splits.
train_df = pd.read_json(
    train_data_path,
    orient='records',
    lines=True,
)
dev_df = pd.read_json(
    validation_data_path,
    orient='records',
    lines=True,
)

# Title-only splits.
train_title_df = pd.read_json(
    train_title_data_path,
    orient='records',
    lines=True,
)
dev_title_df = pd.read_json(
    validation_title_data_path,
    orient='records',
    lines=True,
)
#test_df = pd.read_json(test_data_path, orient='records', lines=True)

In [4]:
# Standalone bag-of-words vectorizer with default settings. It is only used for
# the exploratory shape check in the next cell; the pipelines further down
# construct their own CountVectorizer instances.
count_vect = CountVectorizer()

In [5]:
# Fit the vectorizer on the training texts and transform them in one step.
train_counts = count_vect.fit_transform(train_df['text'])
# Displayed as the cell output: the shape of the resulting matrix.
train_counts.shape

(7894, 94918)

In [6]:
# Standalone TF-IDF transformer with default settings, applied to `train_counts`
# in the next cell; the pipelines further down construct their own instances.
tfidf_transformer = TfidfTransformer()

In [7]:
# Fit the TF-IDF transformer on the training counts and re-weight them.
train_tfidf = tfidf_transformer.fit_transform(train_counts)
# Displayed as the cell output: the shape of the re-weighted matrix.
train_tfidf.shape

(7894, 94918)

In [8]:
# Baseline: raw token counts -> TF-IDF -> linear classifier trained with SGD
# and hinge loss.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-4, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so the name does not need rebinding.
text_clf_svm.fit(train_df['text'], train_df['label'])
predicted_svm = text_clf_svm.predict(train_df['text'])

# NOTE: this score is computed on the same rows the model was fitted on.
balanced_accuracy_score(train_df['label'], predicted_svm)

0.9935949618446822

In [9]:
def write_predictions(predictions, out_path, source=None):
    """Write predicted labels to a two-column CSV file (``id,category``).

    Each prediction goes on its own line, prefixed by its zero-based position
    in ``predictions``. A progress message is printed every 100 rows and a
    summary (row count and absolute output path) is printed at the end.

    Args:
        predictions: Iterable of predicted labels. Each label is formatted
            with ``%s``, so non-string labels (e.g. integers) are accepted.
        out_path: Destination file; created or overwritten, UTF-8 encoded.
        source: Name shown in the "Saving predictions for ..." message.
            When omitted, the notebook-level ``validation_data_path`` is
            used, which matches the previous behaviour.
    """
    if source is None:
        # Backward-compatible default: fall back to the notebook global that
        # the message has always referred to.
        source = validation_data_path

    count = 0
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions for %s' % source)
        out_file.write('id,category\n')
        for idx, result in enumerate(predictions):
            out_file.write('%d,%s\n' % (idx, result))
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    # The `with` block has already closed the file; no explicit close() needed.
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [10]:
# Predict labels for the validation texts with the baseline SGD pipeline and
# save them as a CSV submission file.
write_predictions(text_clf_svm.predict(dev_df['text']), 'submissions_text_clf_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [12]:
# Load the stop-word list, one entry per line (presumably Portuguese, judging
# by the file name). The `with` block guarantees the file handle is closed;
# the previous one-liner left it open until garbage collection.
with open('stopwords-pt.txt', mode='r', encoding='utf8') as stopwords_file:
    # Drop the newline characters so entries match tokens exactly.
    stopwords = [line.replace('\n', '') for line in stopwords_file]

In [13]:
# Same SGD/hinge setup as the baseline, but with the custom stop-word list
# removed during vectorization and a larger iteration budget / tighter tolerance.
processed_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so the name does not need rebinding.
processed_clf_svm.fit(train_df['text'], train_df['label'])
predicted_svm = processed_clf_svm.predict(train_df['text'])

# NOTE: this score is computed on the same rows the model was fitted on.
balanced_accuracy_score(train_df['label'], predicted_svm)

0.9945251613575733

In [14]:
# Predict labels for the validation texts with the stop-word-filtered SGD
# pipeline and save them as a CSV submission file.
write_predictions(processed_clf_svm.predict(dev_df['text']), 'submissions_stop_clf_svm.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [16]:
# Stop-word-filtered counts -> TF-IDF -> LinearSVC (replaces the SGD classifier
# used in the earlier pipelines).
processed_clf_svc = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', LinearSVC(max_iter=2000, tol=1e-5, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so the name does not need rebinding.
processed_clf_svc.fit(train_df['text'], train_df['label'])
predicted_svc = processed_clf_svc.predict(train_df['text'])

# NOTE: this score is computed on the same rows the model was fitted on.
balanced_accuracy_score(train_df['label'], predicted_svc)

0.998515118784651

In [17]:
# Predict labels for the validation texts with the stop-word-filtered LinearSVC
# pipeline and save them as a CSV submission file.
write_predictions(processed_clf_svc.predict(dev_df['text']), 'submissions_stop_clf_svc.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [20]:
# Single-step TF-IDF vectorizer (sublinear tf, unigrams + bigrams, custom
# stop words) feeding a LinearSVC. Rebinds `processed_clf_svc`.
processed_clf_svc = Pipeline([
    (
        'tfidf_vect',
        TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), stop_words=stopwords),
    ),
    ('clf-svm', LinearSVC(max_iter=2000, tol=1e-5, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so the name does not need rebinding.
processed_clf_svc.fit(train_df['text'], train_df['label'])
predicted_svc = processed_clf_svc.predict(train_df['text'])

# NOTE: this score is computed on the same rows the model was fitted on.
balanced_accuracy_score(train_df['label'], predicted_svc)

0.9987754918365435

In [21]:
# Predict labels for the validation texts with the n-gram TF-IDF + LinearSVC
# pipeline and save them as a CSV submission file.
write_predictions(processed_clf_svc.predict(dev_df['text']), 'submissions_linear_svc_ngram.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [23]:
# NOTE(review): this cell repeats the earlier TfidfVectorizer + LinearSVC cell
# verbatim. The score recorded beneath it equals the one recorded for the
# CountVectorizer + TfidfTransformer LinearSVC cell, so the saved output is
# probably stale (the cell looks edited after its last run) - re-run to confirm.
processed_clf_svc = Pipeline([
    (
        'tfidf_vect',
        TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), stop_words=stopwords),
    ),
    ('clf-svm', LinearSVC(max_iter=2000, tol=1e-5, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so the name does not need rebinding.
processed_clf_svc.fit(train_df['text'], train_df['label'])
predicted_svc = processed_clf_svc.predict(train_df['text'])

# NOTE: this score is computed on the same rows the model was fitted on.
balanced_accuracy_score(train_df['label'], predicted_svc)

0.998515118784651