In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, f1_score

import pandas as pd

from pathlib import Path

In [2]:
# Stop-word list read from 'stopwords-pt.txt', one entry per line (presumably
# Portuguese, judging by the file name); train_svm_model hands it to CountVectorizer.
# The context manager closes the file handle as soon as the list is built
# instead of leaving the open file for the garbage collector.
with open('stopwords-pt.txt', mode='r', encoding='utf8') as stopwords_file:
    stopwords = [word.strip() for word in stopwords_file]

In [3]:
def append_title_text(row):
    """Join a row's title and text into a single string.

    Args:
        row: Mapping-like object (for example a pandas row) holding string
            values under the 'title' and 'text' keys.

    Returns:
        The stripped title followed by the stripped text, separated by '. ',
        or by a single space when the stripped title already ends with a
        period. When the title is empty after stripping, only the stripped
        text is returned.
    """
    # Strip before testing for the trailing period: a title such as
    # 'Headline. ' would otherwise not be recognised as ending in '.' and
    # the result would read 'Headline.. text'.
    title = row['title'].strip()
    text = row['text'].strip()
    if not title:
        # Nothing to prepend; avoids emitting a stray leading '. '.
        return text
    connector = ' ' if title.endswith('.') else '. '
    return title + connector + text

def load_dataset(filename, train=True):
    """Read a CSV file and return a frame whose 'text' column holds title + text.

    Args:
        filename: Path of the CSV file handed to ``pd.read_csv``; the file
            must provide 'title' and 'text' columns.
        train: When true, the 'category' column is renamed to 'label'.
            Otherwise the 'Unnamed: 0' column is dropped.

    Returns:
        The resulting DataFrame, without the 'title' column.
    """
    frame = pd.read_csv(filename)
    # Fold every title into its text, then discard the separate title column.
    frame['text'] = frame.apply(append_title_text, axis=1)
    frame = frame.drop(columns=['title'])
    if not train:
        return frame.drop(columns=['Unnamed: 0'])
    return frame.rename(columns={'category': 'label'})

def write_predictions(predictions, out_path, log_every=100):
    """Write predictions to a CSV file with an 'id,category' header.

    Each prediction is written on its own line as '<id>,<label>', where the
    id is the zero-based position of the prediction in ``predictions``.
    Labels are written unquoted, so a label containing ',' would produce a
    malformed row.

    Args:
        predictions: Iterable of predicted labels. Each label is converted
            with ``str`` before writing, so non-string labels are accepted.
        out_path: Destination file path; an existing file is overwritten.
        log_every: Print a progress message after every ``log_every``
            predictions. Pass 0 or None to silence progress messages.

    Returns:
        None. Progress messages and the absolute output path are printed.
    """
    count = 0

    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions to %s' % out_path)
        out_file.write('id,category\n')
        # Enumerate from 1 so `count` is the number of rows written so far;
        # the row id is zero-based, hence count - 1.
        for count, result in enumerate(predictions, start=1):
            out_file.write(str(count - 1) + ',' + str(result) + '\n')
            if log_every and count % log_every == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())
    
def train_svm_model(train_df, dev_df, submission_name):
    """Fit the text-classification pipeline and save its predictions for dev_df.

    The pipeline chains a CountVectorizer (using the module-level
    ``stopwords`` list), a TfidfTransformer and an SGDClassifier with hinge
    loss. After fitting, the balanced accuracy measured on the training rows
    themselves is printed, and the predictions for ``dev_df['text']`` are
    written to 'submissions_<submission_name>.csv' via ``write_predictions``.

    Args:
        train_df: Frame with 'text' and 'label' columns used for fitting.
        dev_df: Frame with a 'text' column to predict on.
        submission_name: String inserted into the output file name.
    """
    steps = [
        ('vect', CountVectorizer(stop_words=stopwords)),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42)),
    ]
    model = Pipeline(steps)
    model.fit(train_df['text'], train_df['label'])

    # This score is computed on the same rows the model was fitted on, so it
    # is a training-set figure rather than a held-out estimate.
    train_predictions = model.predict(train_df['text'])
    print(balanced_accuracy_score(train_df['label'], train_predictions))

    dev_predictions = model.predict(dev_df['text'])
    write_predictions(dev_predictions, 'submissions_' + submission_name + '.csv')

In [4]:
# Load the training split; with train=True load_dataset renames 'category' to 'label'.
train_df = load_dataset('df_train.csv', train=True)
# Preview only the first rows instead of rendering the whole frame as cell output.
train_df.head(10)

Unnamed: 0,text,label
0,Casa da Barra Funda tem clima roceiro e receit...,comida
1,Professores de SP decidem manter greve; grupo ...,educacao
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia
5,Líderes inovadores se reunem em rede para comp...,empreendedorsocial
6,"Conheça Pandora, atração milionária da Disney ...",turismo
7,Fiesp organiza quarta edição de maratona hacke...,empreendedorsocial
8,Praia do Forte mistura natureza com resorts es...,turismo
9,App de agência avisa quando visto e passaporte...,turismo


In [5]:
# Load the validation split; with train=False load_dataset drops the 'Unnamed: 0' column.
dev_df = load_dataset('df_valid.csv', train=False)
# Preview only the first rows instead of rendering the whole frame as cell output.
dev_df.head(10)

Unnamed: 0,text
0,"Vitrine de Dilma, Pronatec terá orçamento 65% ..."
1,"Por direitos autorais e publicidade, 'youtuber..."
2,Rótulos de alimentos terão que alertar sobre l...
3,Sociedade britânica de compositores processa S...
4,"Por Fies, aluna madruga na porta da FMU, mas s..."
5,Cientistas aguardam nascimento raro de 'dragõe...
6,Aplicativo mostra quem está no mesmo voo que v...
7,Pesquisas indicam que sono na direção pode ser...
8,Brasil lançará missão à Lua até 2020 para estu...
9,Cientistas testam vacina contra colesterol e d...


In [6]:
# Fit the pipeline on train_df and write the dev_df predictions to
# 'submissions_stop_clf_svm.csv'.
train_svm_model(
    train_df=train_df,
    dev_df=dev_df,
    submission_name='stop_clf_svm',
)

0.9943942884351809
Saving predictions to submissions_stop_clf_svm.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800

In [7]:
# Score the saved submission against the reference labels in 'result.csv'.
# Both 'category' columns are compared row by row in file order.
true = pd.read_csv('result.csv').loc[:, 'category']
predicted = pd.read_csv('submissions_stop_clf_svm.csv').loc[:, 'category']
f1_score(y_true=true, y_pred=predicted, average='micro')

0.9183721477299459