In [28]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from nltk.tokenize import word_tokenize
import re
from sklearn.metrics import balanced_accuracy_score, f1_score
from pathlib import Path

In [2]:
# Load the stop-word list, one entry per line, stripping surrounding whitespace.
# The context manager closes the file handle as soon as the list is built
# (the bare open() call previously left it open until garbage collection).
with open('stopwords-pt.txt', mode='r', encoding='utf8') as stop_words_file:
    stop_words = [word.strip() for word in stop_words_file]

In [23]:
class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one entry of its input by key.

    ``transform`` returns ``X[field]`` (single-bracket indexing).
    """

    def __init__(self, field):
        # Key used to index the input in ``transform``.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.field]``."""
        selected = X[self.field]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one entry of its input by key.

    ``transform`` returns ``X[[field]]`` (list indexing with a single key).
    """

    def __init__(self, field):
        # Key used to index the input in ``transform``.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[[self.field]]``."""
        wanted = [self.field]
        return X[wanted]
    

def Tokenizer(str_input):
    """Split a text into lower-cased, Porter-stemmed tokens.

    Parameters
    ----------
    str_input : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        One stemmed token per whitespace-separated word left after cleaning.
    """
    # Replace every character other than A-Z, a-z, 0-9 and '-' with a space,
    # then lower-case and split on whitespace. Without this step `words` was
    # read before assignment and every call raised UnboundLocalError.
    # NOTE(review): accented letters are treated as separators too
    # (e.g. 'até' -> 'at'), which matches the stop-word output recorded in
    # this notebook; confirm that is the intended behavior for this corpus.
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer = nltk.PorterStemmer()
    return [porter_stemmer.stem(word) for word in words]

def get_total_words(row):
    """Return the number of tokens ``word_tokenize`` finds in ``row['text']``."""
    tokens = word_tokenize(row['text'])
    return len(tokens)

def write_predictions(predictions, out_path):
    """Write predictions to ``out_path`` as an ``id,category`` CSV file.

    The file starts with the header line ``id,category``; each prediction is
    then written on its own line as ``<index>,<prediction>``, with the index
    starting at 0. Progress is printed every 100 predictions, followed by a
    final summary and the absolute path of the output file.

    Parameters
    ----------
    predictions : iterable
        Predicted labels, in output order. Labels are formatted with ``%s``,
        so non-string labels (e.g. integers) are accepted as well.
    out_path : str or path-like
        Destination file; it is created or overwritten, encoded as UTF-8.

    Returns
    -------
    None
    """
    count = 0

    # The ``with`` block closes the file on exit (including on error), so no
    # explicit close() call is needed afterwards.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions to %s' % out_path)
        out_file.write('id,category\n')
        for idx, result in enumerate(predictions):
            out_file.write('%d,%s\n' % (idx, result))
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [30]:
# Merge the original stop words with their `Tokenizer` output, then
# de-duplicate and sort. Presumably this is done so the list also matches
# tokens after they have been through the same tokenizer -- TODO confirm.
stop_words = sorted(set(stop_words).union(Tokenizer(' '.join(stop_words))))
stop_words

['a',
 'acerca',
 'adeu',
 'adeus',
 'agora',
 'ainda',
 'al',
 'alem',
 'algma',
 'algmas',
 'algo',
 'alguma',
 'algumas',
 'algun',
 'alguns',
 'ali',
 'além',
 'amba',
 'ambas',
 'ambo',
 'ambos',
 'amo',
 'ano',
 'anos',
 'ant',
 'antes',
 'ao',
 'aond',
 'aonde',
 'aos',
 'ap',
 'apena',
 'apenas',
 'apo',
 'apoio',
 'apontar',
 'apos',
 'após',
 'aquel',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aqui',
 'aquilo',
 'as',
 'assim',
 'at',
 'atr',
 'atrav',
 'através',
 'atrás',
 'até',
 'aí',
 'baixo',
 'bastant',
 'bastante',
 'bem',
 'boa',
 'boas',
 'bom',
 'bon',
 'bons',
 'breve',
 'c',
 'cada',
 'caminho',
 'catorz',
 'catorze',
 'cedo',
 'cento',
 'certament',
 'certamente',
 'certeza',
 'cima',
 'cinco',
 'coisa',
 'com',
 'como',
 'comprido',
 'conhecido',
 'conselho',
 'contra',
 'contudo',
 'corrent',
 'corrente',
 'cuja',
 'cujas',
 'cujo',
 'cujos',
 'custa',
 'cá',
 'd',
 'da',
 'daquel',
 'daquela',
 'daquelas',
 'daquele',
 'daqueles',
 'dar',
 'das',
 'de',


In [15]:
# XGBoost pipeline: TF-IDF n-gram features of the 'text' column reduced with
# truncated SVD, joined with the standardized 'TotalWords' column, and fed to
# an XGBoost classifier.
# random_state is pinned on the randomized SVD and on the classifier so that
# re-running the notebook reproduces the same features and model (the value 0
# matches the seed used for LinearSVC elsewhere in this notebook).
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('text')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300,
                                 random_state=0)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('TotalWords')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1,
                          random_state=0)),
    ])

In [11]:
# Read the training data; later cells use its 'text' column as model input
# and its 'label' column as the target.
train_df = pd.read_csv('train_text_label.csv')
train_df

Unnamed: 0,text,label
0,Casa da Barra Funda tem clima roceiro e receit...,comida
1,Professores de SP decidem manter greve; grupo ...,educacao
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia
5,Líderes inovadores se reunem em rede para comp...,empreendedorsocial
6,"Conheça Pandora, atração milionária da Disney ...",turismo
7,Fiesp organiza quarta edição de maratona hacke...,empreendedorsocial
8,Praia do Forte mistura natureza com resorts es...,turismo
9,App de agência avisa quando visto e passaporte...,turismo


In [14]:
# Add a 'TotalWords' column holding, for each row, the value returned by
# get_total_words (applied row-wise, hence axis=1).
train_df['TotalWords'] = train_df.apply(get_total_words, axis=1)
train_df

Unnamed: 0,text,label,TotalWords
0,Casa da Barra Funda tem clima roceiro e receit...,comida,453
1,Professores de SP decidem manter greve; grupo ...,educacao,462
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial,255
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude,833
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia,771
5,Líderes inovadores se reunem em rede para comp...,empreendedorsocial,356
6,"Conheça Pandora, atração milionária da Disney ...",turismo,1087
7,Fiesp organiza quarta edição de maratona hacke...,empreendedorsocial,250
8,Praia do Forte mistura natureza com resorts es...,turismo,1152
9,App de agência avisa quando visto e passaporte...,turismo,615


In [24]:
# Read the dev data and add the same row-wise 'TotalWords' column that the
# training frame gets, so both frames expose the columns the pipelines select.
dev_df = pd.read_csv('dev_text.csv')
dev_df['TotalWords'] = dev_df.apply(get_total_words, axis=1)
dev_df

Unnamed: 0,text,TotalWords
0,"Vitrine de Dilma, Pronatec terá orçamento 65% ...",648
1,"Por direitos autorais e publicidade, 'youtuber...",660
2,Rótulos de alimentos terão que alertar sobre l...,866
3,Sociedade britânica de compositores processa S...,201
4,"Por Fies, aluna madruga na porta da FMU, mas s...",764
5,Cientistas aguardam nascimento raro de 'dragõe...,798
6,Aplicativo mostra quem está no mesmo voo que v...,693
7,Pesquisas indicam que sono na direção pode ser...,612
8,Brasil lançará missão à Lua até 2020 para estu...,1262
9,Cientistas testam vacina contra colesterol e d...,639


In [16]:
# X: the two input columns the pipelines select from; Y: the target column.
X, Y = train_df[['text', 'TotalWords']], train_df['label']

In [19]:
# Fit the pipeline on (X, Y) and predict on the same X.
# NOTE(review): these are predictions on the training data, so any score
# computed from `preds` is a training-set score, not a held-out estimate.
preds = classifier.fit(X, Y).predict(X)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [21]:
# Balanced accuracy of `preds` against the labels `Y`.
# NOTE(review): `preds` was produced by predicting on the same `X` the model
# was fitted on, so this is a training-set score.
balanced_accuracy_score(y_true=Y, y_pred=preds)

0.9994765083104306

In [22]:
# Micro-averaged F1 of `preds` against the labels `Y`.
# NOTE(review): `preds` was produced by predicting on the same `X` the model
# was fitted on, so this is a training-set score.
f1_score(y_true=Y, y_pred=preds, average='micro')

0.9994932860400304

In [27]:
# Predict on the dev frame with the XGBoost pipeline and write the result
# to a submission file.
write_predictions(
    classifier.predict(dev_df[['text', 'TotalWords']]),
    'submission_xgboost.csv',
)

  Xt = transform.transform(Xt)


Saving predictions to submission_xgboost.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900

In [42]:
# Linear-SVM pipeline: TF-IDF features (unigrams and bigrams) of the 'text'
# column joined with the standardized 'TotalWords' column, classified by a
# one-vs-rest wrapper around LinearSVC.
model = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('text', Pipeline(steps=[
            ('colext', TextSelector('text')),
            ('tfidf', TfidfVectorizer(
                tokenizer=Tokenizer,
                stop_words=stop_words,
                ngram_range=(1, 2),
            )),
        ])),
        ('words', Pipeline(steps=[
            ('wordext', NumberSelector('TotalWords')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', OneVsRestClassifier(
        estimator=LinearSVC(
            random_state=0,
            max_iter=3000,
            tol=1e-6,
            class_weight='balanced',
        ),
    )),
])

In [43]:
# Fit the linear-SVM pipeline on the training inputs X and labels Y; the
# fitted pipeline is the cell's displayed value.
model.fit(X, Y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('colext', TextSelector(field='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='co...    multi_class='ovr', penalty='l2', random_state=0, tol=1e-06, verbose=0),
          n_jobs=None))])

In [44]:
# Predict on the training inputs X with the linear-SVM pipeline.
# NOTE(review): this rebinds `preds`, replacing the predictions stored earlier
# from `classifier`; metric cells read whichever assignment ran last, so the
# result depends on cell execution order. A distinct name would avoid that.
preds = model.predict(X)

  Xt = transform.transform(Xt)


In [45]:
# Balanced accuracy of `preds` against the labels `Y`.
# NOTE(review): `preds` was produced by predicting on the same `X` the model
# was fitted on, so this is a training-set score.
balanced_accuracy_score(y_true=Y, y_pred=preds)

0.9992147624656459

In [46]:
# Micro-averaged F1 of `preds` against the labels `Y`.
# NOTE(review): `preds` was produced by predicting on the same `X` the model
# was fitted on, so this is a training-set score.
f1_score(y_true=Y, y_pred=preds, average='micro')

0.9992399290600456

In [47]:
# Predict on the dev frame with the linear-SVM pipeline and write the result
# to a submission file.
write_predictions(
    model.predict(dev_df[['text', 'TotalWords']]),
    'submission_onevsrest.csv',
)

  Xt = transform.transform(Xt)


Saving predictions to submission_onevsrest.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 39