# Notebook probar diferentes clasificadores

Se pretende explorar diferentes clasificadores variando los hyperparametros, escogiendo el mejor resultado.

### 1. Importación librerias

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import  MultinomialNB
from sklearn.linear_model import SGDClassifier, Perceptron, LogisticRegression
from sklearn.svm import LinearSVC

import re
import time
import string
import pandas as pd
import numpy as np

In [None]:
import nltk
nltk.download('popular')

In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize

In [5]:
news = fetch_20newsgroups(subset="all",remove=('headers', 'footers', 'quotes'))

print("Número de articulos: {}".format(len(news.data)))
print("Número de categorias: {}".format(len(news.target_names)))

Número de articulos: 18846
Número de categorias: 20


### 2. Definción función de entrenamiento  

In [6]:
def train_fn(classifier, X, Y):
  start = time.time()
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
  classifier.fit(X_train, Y_train)
  end = time.time()
  score = classifier.score(X_test, Y_test)
  print("Test accuracy: {:.2f}% - Time duration: {:.2f} s".format(score * 100, (end-start)))
  return classifier

### 3. Preprocesado

#### 3.1. Limpiar el texto

In [7]:
def clean_text(news):

  df = pd.DataFrame(news.data, columns=['text'])
  df["categories"] = [news.target_names[i] for i in news.target]
  df["labels"] = [i for i in news.target]

  # Remove multiples whitspaces characters
  df["text"] = [re.sub('\s+', ' ', sent) for sent in df["text"]]
  # Remove whitspace star and end text
  df['text'] = df['text'].str.strip()
  # Remove Emails
  df["text"] = [re.sub('\S*@\S*\s?', '', sent) for sent in df["text"]]
  # Remove distracting single quotes
  df["text"] = [re.sub("\'", "", sent) for sent in df["text"]]
  # Remove any rows with empty fields
  df = df.replace('', np.NaN).dropna()
  # Drop duplicates
  df = df.drop_duplicates(subset='text')
  # Remove number
  df['text'] = [re.sub(r'\[[0-9]*\]', ' ', text) for text in df['text']]
  # lower
  df["text"] = [text.lower() for text in df["text"]]
  return df

#### 3.2. Stemming

In [8]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

#### 3.3. Lemmatization

In [11]:
#defining the function for lemmatization
def lemmatizer_tokenizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [10]:
df = clean_text(news)
data = df["text"].values
target = df["labels"].values

### 4. Construcción de clasificadores - TfidfVectorizer

#### 4.1. Multinominal Naive Bayes Classifier

In [19]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english')+ list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 73.62% - Time duration: 2.60 s
Test accuracy: 78.14% - Time duration: 2.69 s
Test accuracy: 78.91% - Time duration: 2.66 s
Test accuracy: 78.17% - Time duration: 2.62 s
Test accuracy: 75.92% - Time duration: 2.65 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [20]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 79.05% - Time duration: 80.74 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f8727181320>)),
                ('classifier', MultinomialNB(alpha=0.01))])

In [21]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 78.88% - Time duration: 40.30 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f872706b710>)),
                ('classifier', MultinomialNB(alpha=0.01))])

#### 4.2. Support Vector Classification con stochatic gradient descent

In [22]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 5.45% - Time duration: 5.47 s
Test accuracy: 50.31% - Time duration: 6.44 s
Test accuracy: 75.32% - Time duration: 3.59 s
Test accuracy: 75.62% - Time duration: 3.60 s
Test accuracy: 77.70% - Time duration: 4.15 s
Test accuracy: 75.79% - Time duration: 4.24 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [23]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 76.09% - Time duration: 82.32 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f8727181320>)),
                ('classifier', SGDClassifier())])

In [24]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 75.73% - Time duration: 41.58 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f872706b710>)),
                ('classifier', SGDClassifier())])

#### 4.3. Support Vector Classification con liner SVC

In [25]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

Test accuracy: 79.02% - Time duration: 4.85 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

Con stemming.

In [26]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

  % sorted(inconsistent)


Test accuracy: 77.35% - Time duration: 83.61 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f8727181320>)),
                ('classifier', LinearSVC())])

In [27]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

  % sorted(inconsistent)


Test accuracy: 77.46% - Time duration: 44.04 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f872706b710>)),
                ('classifier', LinearSVC())])

#### 4.4. Perceptron

In [28]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  perclf = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', Perceptron(alpha=alpha))]
  )

  train_fn(perclf, data, target)

Test accuracy: 72.45% - Time duration: 3.76 s
Test accuracy: 71.84% - Time duration: 3.79 s
Test accuracy: 72.47% - Time duration: 3.68 s
Test accuracy: 71.49% - Time duration: 3.71 s
Test accuracy: 70.75% - Time duration: 3.71 s
Test accuracy: 72.34% - Time duration: 3.68 s


In [29]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.00001))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 70.36% - Time duration: 80.86 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f8727181320>)),
                ('classifier', Perceptron(alpha=1e-05))])

In [30]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer, stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.00001))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 72.06% - Time duration: 41.51 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f872706b710>)),
                ('classifier', Perceptron(alpha=1e-05))])

#### 4.5. Logistic Regression

In [32]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 73.71% - Time duration: 154.60 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f8727181320>)),
                ('classifier', LogisticRegression(max_iter=10000))])

In [33]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 73.40% - Time duration: 120.85 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f872706b710>)),
                ('classifier', LogisticRegression(max_iter=10000))])

### 5. Construcción de clasificadores - CountVectorizer

#### 5.1. Multinominal Naive Bayes Classifier

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 68.06% - Time duration: 2.77 s
Test accuracy: 73.46% - Time duration: 2.78 s
Test accuracy: 74.36% - Time duration: 2.80 s
Test accuracy: 72.34% - Time duration: 2.95 s
Test accuracy: 71.73% - Time duration: 2.86 s


In [None]:
NBclf = Pipeline(
    [('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 73.19% - Time duration: 83.91 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f83a113d290>)),
                ('classifier', MultinomialNB(alpha=0.01))])

#### 5.2. Support Vector Classification con stochatic gradient descent

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 52.78% - Time duration: 4.49 s
Test accuracy: 63.30% - Time duration: 5.04 s
Test accuracy: 73.98% - Time duration: 4.75 s
Test accuracy: 69.71% - Time duration: 5.25 s
Test accuracy: 66.20% - Time duration: 5.95 s
Test accuracy: 65.90% - Time duration: 6.57 s


In [None]:
SVMclf1 = Pipeline(
    [('vectorizer',CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.01))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 70.69% - Time duration: 86.73 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f83a113d290>)),
                ('classifier', SGDClassifier(alpha=0.01))])