# Notebook probar diferentes clasificadores

Se pretende explorar diferentes clasificadores variando los hyperparametros, escogiendo el mejor resultado.

### 1. Importación librerias

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import  MultinomialNB
from sklearn.linear_model import SGDClassifier, Perceptron, LogisticRegression
from sklearn.svm import LinearSVC

import re
import time
import string
import pandas as pd
import numpy as np

In [None]:
import nltk
nltk.download('popular')

In [2]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize

In [3]:
news = fetch_20newsgroups(subset="all",remove=('headers', 'footers', 'quotes'))

print("Número de articulos: {}".format(len(news.data)))
print("Número de categorias: {}".format(len(news.target_names)))

Número de articulos: 18846
Número de categorias: 20


### 2. Definción función de entrenamiento  

In [4]:
def train_fn(classifier, X, Y):
  start = time.time()
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
  classifier.fit(X_train, Y_train)
  end = time.time()
  score = classifier.score(X_test, Y_test)
  print("Test accuracy: {:.2f}% - Time duration: {:.2f} s".format(score * 100, (end-start)))
  return classifier

### 3. Preprocesado

#### 3.1. Limpiar el texto

In [5]:
def clean_text(news):

  df = pd.DataFrame(news.data, columns=['text'])
  df["categories"] = [news.target_names[i] for i in news.target]
  df["labels"] = [i for i in news.target]

  signos = '[\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~]'
  # Remove signo puntuacion 
  df["text"] = [re.sub(signos, ' ', sent) for sent in df["text"]]
  # Remove whitspace star and end text
  df['text'] = df['text'].str.strip()
  # Remove Emails
  df["text"] = [re.sub('\S*@\S*\s?', '', sent) for sent in df["text"]]
  # Remove distracting single quotes
  df["text"] = [re.sub("\'", "", sent) for sent in df["text"]]
  # Remove any rows with empty fields
  df = df.replace('', np.NaN).dropna()
  # Drop duplicates
  df = df.drop_duplicates(subset='text')
  # Remove number
  df['text'] = [re.sub('\d+', ' ', text) for text in df['text']]
  # lower
  df["text"] = [text.lower() for text in df["text"]]
  # Remove multiples whitspaces characters
  df["text"] = [re.sub('\s+', ' ', sent) for sent in df["text"]]
  return df

#### 3.2. Stemming

In [6]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

#### 3.3. Lemmatization

In [7]:
#defining the function for lemmatization
def lemmatizer_tokenizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [8]:
df = clean_text(news)
data = df["text"].values
target = df["labels"].values

### 4. Construcción de clasificadores - TfidfVectorizer

#### 4.1. Multinominal Naive Bayes Classifier

In [9]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english')+ list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 73.04% - Time duration: 2.89 s
Test accuracy: 77.34% - Time duration: 3.00 s
Test accuracy: 78.36% - Time duration: 2.62 s
Test accuracy: 77.70% - Time duration: 2.45 s
Test accuracy: 75.18% - Time duration: 4.16 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [10]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)



Test accuracy: 77.84% - Time duration: 76.04 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', MultinomialNB(alpha=0.01))])

In [11]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)



Test accuracy: 78.99% - Time duration: 32.84 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x0000021466D63550>)),
                ('classifier', MultinomialNB(alpha=0.01))])

#### 4.2. Support Vector Classification con stochatic gradient descent

In [12]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 10.55% - Time duration: 4.39 s
Test accuracy: 59.48% - Time duration: 5.24 s
Test accuracy: 75.32% - Time duration: 3.26 s
Test accuracy: 74.11% - Time duration: 3.31 s
Test accuracy: 78.27% - Time duration: 4.27 s
Test accuracy: 75.84% - Time duration: 3.66 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [13]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)



Test accuracy: 76.74% - Time duration: 77.23 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', SGDClassifier())])

In [14]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)



Test accuracy: 77.40% - Time duration: 32.61 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x0000021466D63550>)),
                ('classifier', SGDClassifier())])

#### 4.3. Support Vector Classification con liner SVC

In [15]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

Test accuracy: 77.07% - Time duration: 4.30 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

Con stemming.

In [16]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)



Test accuracy: 77.37% - Time duration: 75.60 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', LinearSVC())])

In [17]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)



Test accuracy: 77.45% - Time duration: 31.56 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x0000021466D63550>)),
                ('classifier', LinearSVC())])

#### 4.4. Perceptron

In [18]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  perclf = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', Perceptron(alpha=alpha))]
  )

  train_fn(perclf, data, target)

Test accuracy: 71.12% - Time duration: 3.46 s
Test accuracy: 70.77% - Time duration: 3.00 s
Test accuracy: 71.23% - Time duration: 3.20 s
Test accuracy: 71.95% - Time duration: 3.11 s
Test accuracy: 71.23% - Time duration: 3.11 s
Test accuracy: 71.04% - Time duration: 3.31 s


In [19]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.001))]
)

train_fn(perclf, data, target)



Test accuracy: 71.45% - Time duration: 71.30 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', Perceptron(alpha=0.001))])

In [20]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer, stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.00001))]
)

train_fn(perclf, data, target)



Test accuracy: 72.03% - Time duration: 30.27 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x0000021466D63550>)),
                ('classifier', Perceptron(alpha=1e-05))])

#### 4.5. Logistic Regression

In [21]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)



Test accuracy: 76.79% - Time duration: 107.58 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', LogisticRegression(max_iter=10000))])

In [22]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)



Test accuracy: 75.62% - Time duration: 68.77 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x0000021466D63550>)),
                ('classifier', LogisticRegression(max_iter=10000))])

### 5. Construcción de clasificadores - CountVectorizer

#### 5.1. Multinominal Naive Bayes Classifier

In [23]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 70.25% - Time duration: 2.27 s
Test accuracy: 73.04% - Time duration: 2.22 s
Test accuracy: 74.19% - Time duration: 2.48 s
Test accuracy: 73.15% - Time duration: 2.35 s
Test accuracy: 72.82% - Time duration: 2.33 s


In [24]:
NBclf = Pipeline(
    [('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)



Test accuracy: 73.01% - Time duration: 62.91 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', MultinomialNB(alpha=0.01))])

#### 5.2. Support Vector Classification con stochatic gradient descent

In [25]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 51.92% - Time duration: 3.30 s
Test accuracy: 64.08% - Time duration: 3.78 s
Test accuracy: 73.45% - Time duration: 3.61 s
Test accuracy: 71.12% - Time duration: 3.83 s
Test accuracy: 64.16% - Time duration: 4.64 s
Test accuracy: 64.49% - Time duration: 6.11 s


In [26]:
SVMclf1 = Pipeline(
    [('vectorizer',CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.01))]
  )

train_fn(SVMclf1, data, target)



Test accuracy: 73.37% - Time duration: 71.38 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x0000021466D634C0>)),
                ('classifier', SGDClassifier(alpha=0.01))])