# Notebook probar diferentes clasificadores

Se pretende explorar diferentes clasificadores variando los hyperparametros, escogiendo el mejor resultado.

### 1. Importación librerias

In [49]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import  MultinomialNB
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.svm import LinearSVC

import time
import string

In [None]:
import nltk
nltk.download('popular')

In [26]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

In [2]:
news = fetch_20newsgroups(subset="all",remove=('headers', 'footers', 'quotes'))

print("Número de articulos: {}".format(len(news.data)))
print("Número de categorias: {}".format(len(news.target_names)))

Número de articulos: 18846
Número de categorias: 20


### 2. Definción función de entrenamiento  

In [11]:
def train_fn(classifier, X, Y):
  start = time.time()
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
  classifier.fit(X_train, Y_train)
  end = time.time()
  score = classifier.score(X_test, Y_test)
  print("Test accuracy: {:.2f}% - Time duration: {:.2f} s".format(score * 100, (end-start)))
  return classifier

### 3. Preprocesado

In [27]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

### 3. Construcción de clasificadores

#### 3.1. Multinominal Naive Bayes Classifier

In [37]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english')+ list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, news.data, news.target)

Test accuracy: 71.27% - Time duration: 2.84 s
Test accuracy: 76.02% - Time duration: 2.79 s
Test accuracy: 75.86% - Time duration: 2.87 s
Test accuracy: 74.67% - Time duration: 2.86 s
Test accuracy: 73.55% - Time duration: 2.84 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [38]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.1))]
)
train_fn(NBclf, news.data, news.target)

  % sorted(inconsistent)


Test accuracy: 73.98% - Time duration: 87.26 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f61810e57a0>)),
                ('classifier', MultinomialNB(alpha=0.1))])

#### 3.2. Support Vector Classification con stochatic gradient descent

In [40]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, news.data, news.target)

Test accuracy: 5.07% - Time duration: 5.81 s
Test accuracy: 45.86% - Time duration: 6.73 s
Test accuracy: 73.08% - Time duration: 3.98 s
Test accuracy: 73.32% - Time duration: 3.89 s
Test accuracy: 75.46% - Time duration: 4.58 s
Test accuracy: 74.91% - Time duration: 4.56 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [45]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, news.data, news.target)

  % sorted(inconsistent)


Test accuracy: 75.65% - Time duration: 89.95 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f61810e57a0>)),
                ('classifier', SGDClassifier())])

#### 3.3. Support Vector Classification con liner SVC

In [47]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, news.data, news.target)

Test accuracy: 75.92% - Time duration: 5.22 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

Con stemming.

In [48]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, news.data, news.target)

  % sorted(inconsistent)


Test accuracy: 76.05% - Time duration: 90.04 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f61810e57a0>)),
                ('classifier', LinearSVC())])

#### 3.4. Perceptron

In [50]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  perclf = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', Perceptron(alpha=alpha))]
  )

  train_fn(perclf, news.data, news.target)

Test accuracy: 70.50% - Time duration: 4.06 s
Test accuracy: 70.53% - Time duration: 3.93 s
Test accuracy: 69.76% - Time duration: 3.90 s
Test accuracy: 71.35% - Time duration: 3.98 s
Test accuracy: 70.53% - Time duration: 3.90 s
Test accuracy: 70.27% - Time duration: 3.97 s


In [55]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.001))]
)

train_fn(perclf, news.data, news.target)

Test accuracy: 71.01% - Time duration: 3.88 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', Perceptron(alpha=0.001))])