In [46]:
from __future__ import print_function
from time import time

import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt

import numpy as np
import sklearn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [29]:
bag_df = pd.read_csv('bag_words.csv', sep=';', index_col=0)

In [30]:
bag_df.columns = ['link', 'categoria', 'texto']

In [31]:
bag_df.head()

Unnamed: 0,link,categoria,texto
0,http://g1.globo.com/economia/agronegocios/agro...,agro,criação peixes cativeiro brasil expansão país ...
1,http://g1.globo.com/economia/negocios/noticia/...,agro,vale anunciou manhã desta segundafeira venda a...
2,http://g1.globo.com/economia/agronegocios/noti...,agro,acordo ibge abate somou milhões cabeças maior ...
3,http://g1.globo.com/sp/piracicaba-regiao/notic...,agro,universidade paulo usp piracicabasp anunciou i...
4,http://g1.globo.com/economia/midia-e-marketing...,agro,mcdonalds saladas compostas vegetais orgânicos...


In [32]:
bag_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 705 entries, 0 to 704
Data columns (total 3 columns):
link         705 non-null object
categoria    705 non-null object
texto        694 non-null object
dtypes: object(3)
memory usage: 22.0+ KB


In [33]:
bag_df.describe()

Unnamed: 0,link,categoria,texto
count,705,705,694
unique,696,6,687
top,http://ciencia.estadao.com.br/blogs/herton-esc...,ciencia-e-saude,desafio manequim pessoas parkinson quer consci...
freq,2,146,2


In [34]:
bag_df_clean = bag_df.dropna()
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    bag_df_clean.texto, bag_df_clean.categoria
)

In [41]:
for Model in (RandomForestClassifier, LogisticRegression, MultinomialNB):
    for Vect in (CountVectorizer, TfidfVectorizer):
        vect = Vect()
        model = Model()
        pipe = Pipeline([('vect', vect), ('model', model)])
        
        scores = cross_val_score(pipe, bag_df_clean.texto, bag_df_clean.categoria, cv=5)
        print("Accuracy %s with %s: %0.2f (+/- %0.2f)" % (Model.__name__, Vect.__name__, scores.mean(), scores.std() * 2))

Accuracy RandomForestClassifier with CountVectorizer: 0.66 (+/- 0.06)
Accuracy RandomForestClassifier with TfidfVectorizer: 0.67 (+/- 0.08)
Accuracy LogisticRegression with CountVectorizer: 0.79 (+/- 0.11)
Accuracy LogisticRegression with TfidfVectorizer: 0.76 (+/- 0.12)
Accuracy MultinomialNB with CountVectorizer: 0.80 (+/- 0.15)
Accuracy MultinomialNB with TfidfVectorizer: 0.73 (+/- 0.10)


In [43]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [51]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    #'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

In [53]:
for Model in (RandomForestClassifier, LogisticRegression, MultinomialNB):
    for Vect in (CountVectorizer, TfidfVectorizer):
        vect = Vect(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None)
        model = Model()
        pipe = Pipeline([('vect', vect), ('model', model)])
        
        grid_search = GridSearchCV(pipe, param_grid=parameters, n_jobs=1)
        start = time()
        grid_search.fit(bag_df_clean.texto, bag_df_clean.categoria)
        
        print("GridSearchCV with %s and %s took %.2f seconds for %d candidate parameter settings."
          % (Model.__name__, Vect.__name__, time() - start, len(grid_search.cv_results_['params'])))
        report(grid_search.cv_results_)
        print("")

GridSearchCV with RandomForestClassifier and CountVectorizer took 14.63 seconds for 6 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.650 (std: 0.046)
Parameters: {'vect__ngram_range': (1, 1), 'vect__max_df': 1.0}

Model with rank: 2
Mean validation score: 0.648 (std: 0.018)
Parameters: {'vect__ngram_range': (1, 1), 'vect__max_df': 0.75}

Model with rank: 3
Mean validation score: 0.644 (std: 0.041)
Parameters: {'vect__ngram_range': (1, 1), 'vect__max_df': 0.5}


GridSearchCV with RandomForestClassifier and TfidfVectorizer took 15.03 seconds for 6 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.643 (std: 0.024)
Parameters: {'vect__ngram_range': (1, 1), 'vect__max_df': 1.0}

Model with rank: 2
Mean validation score: 0.622 (std: 0.039)
Parameters: {'vect__ngram_range': (1, 1), 'vect__max_df': 0.5}

Model with rank: 3
Mean validation score: 0.614 (std: 0.041)
Parameters: {'vect__ngram_range': (1, 2), 'vect__max_df': 0.5}


GridSearchCV wi

In [76]:
parameters = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, ), #1000, 5000, 10000),
    'vect__max_df': (1.0, ),
    'vect__min_df': (0.0, ), #0.2, 0.4), #0.6, 0.8, 1.0),
    'vect__binary': (False, ),
    'vect__analyzer': ('word', ), #'char'),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'model__alpha': (0.1, ), #(2.0, 1.0, 0.5, 0.1, 0.01),
    'model__fit_prior': (False, ), #(True, False),
    #'model__class_prior': (None, 1, 10, 50, 100),
}

In [77]:
for Model in (MultinomialNB,):
    for Vect in (CountVectorizer,):
        vect = Vect()
        model = Model()
        pipe = Pipeline([('vect', vect), ('model', model)])
        
        grid_search = GridSearchCV(pipe, param_grid=parameters, n_jobs=1)
        start = time()
        grid_search.fit(bag_df_clean.texto, bag_df_clean.categoria)
        
        print("GridSearchCV with %s and %s took %.2f seconds for %d candidate parameter settings."
          % (Model.__name__, Vect.__name__, time() - start, len(grid_search.cv_results_['params'])))
        report(grid_search.cv_results_)
        print("")

GridSearchCV with MultinomialNB and CountVectorizer took 1.00 seconds for 1 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.805 (std: 0.062)
Parameters: {'vect__analyzer': 'word', 'model__alpha': 0.1, 'vect__max_df': 1.0, 'model__fit_prior': False, 'vect__binary': False, 'vect__min_df': 0.0, 'vect__max_features': None}


