<a href="https://colab.research.google.com/github/santiagoruiz-udea/News_classification/blob/main/Several_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook probar diferentes clasificadores

Se pretende explorar diferentes clasificadores variando los hyperparametros, escogiendo el mejor resultado.

## 1. Importación librerias

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import  MultinomialNB
from sklearn.linear_model import SGDClassifier, Perceptron, LogisticRegression
from sklearn.svm import LinearSVC

import re
import time
import string
import pandas as pd
import numpy as np

In [None]:
import nltk
nltk.download('popular')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize

In [None]:
categories = [
    "comp.graphics",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "sci.electronics",
    "sci.space",
    "talk.religion.misc",
    "talk.politics.misc",
    "alt.atheism",
    "comp.sys.ibm.pc.hardware",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.med",
    "talk.politics.guns",
]
news = fetch_20newsgroups(subset="all", categories=categories,remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)

print("Número de articulos: {}".format(len(news.data)))
print("Número de categorias: {}".format(len(news.target_names)))

Número de articulos: 13973
Número de categorias: 15


## 2. Definción función de entrenamiento  

In [None]:
def train_fn(classifier, X, Y):
  start = time.time()
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
  classifier.fit(X_train, Y_train)
  end = time.time()
  score = classifier.score(X_test, Y_test)
  print("Test accuracy: {:.2f}% - Time duration: {:.2f} s".format(score * 100, (end-start)))
  return classifier

## 3. Preprocesado

### 3.1. Limpiar el texto

In [None]:
def clean_text(news):

  df = pd.DataFrame(news.data, columns=['text'])
  df["categories"] = [news.target_names[i] for i in news.target]
  df["labels"] = [i for i in news.target]

  # Remove multiples whitspaces characters
  df["text"] = [re.sub('\s+', ' ', sent) for sent in df["text"]]
  # Remove whitspace star and end text
  df['text'] = df['text'].str.strip()
  # Remove Emails
  df["text"] = [re.sub('\S*@\S*\s?', '', sent) for sent in df["text"]]
  # Remove distracting single quotes
  df["text"] = [re.sub("\'", "", sent) for sent in df["text"]]
  # Remove any rows with empty fields
  df = df.replace('', np.NaN).dropna()
  # Drop duplicates
  df = df.drop_duplicates(subset='text')
  # Remove number
  df['text'] = [re.sub(r'\[[0-9]*\]', ' ', text) for text in df['text']]
  # lower
  df["text"] = [text.lower() for text in df["text"]]
  # Remove multiples whitspaces characters
  df["text"] = [re.sub('\s+', ' ', sent) for sent in df["text"]]
  return df

### 3.2. Stemming

In [None]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

### 3.3. Lemmatization

In [None]:
#defining the function for lemmatization
def lemmatizer_tokenizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [None]:
df = clean_text(news)
data = df["text"].values
target = df["labels"].values

## 4. Construcción de clasificadores - TfidfVectorizer

### 4.1. Multinominal Naive Bayes Classifier

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english')+ list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 77.33% - Time duration: 1.71 s
Test accuracy: 80.02% - Time duration: 1.81 s
Test accuracy: 81.28% - Time duration: 1.73 s
Test accuracy: 79.10% - Time duration: 1.75 s
Test accuracy: 78.43% - Time duration: 1.78 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [None]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 81.39% - Time duration: 58.29 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', MultinomialNB(alpha=0.01))])

Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y Lemmatizer.

In [None]:
NBclf = Pipeline(
    [('vectorizer', TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 82.02% - Time duration: 28.96 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f3b39e85b00>)),
                ('classifier', MultinomialNB(alpha=0.01))])

### 4.2. Support Vector Classification con stochatic gradient descent

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 6.46% - Time duration: 3.25 s
Test accuracy: 57.50% - Time duration: 4.22 s
Test accuracy: 77.70% - Time duration: 3.04 s
Test accuracy: 76.55% - Time duration: 2.23 s
Test accuracy: 79.95% - Time duration: 2.60 s
Test accuracy: 79.43% - Time duration: 2.49 s


Se escoge el mejor desempeño y se realiza prueba eliminado signo de puntuación y stemming.

In [None]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 78.95% - Time duration: 56.57 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', SGDClassifier())])

In [None]:
SVMclf1 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.0001))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 79.58% - Time duration: 28.81 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f3b39e85b00>)),
                ('classifier', SGDClassifier())])

### 4.3. Support Vector Classification con liner SVC

In [None]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

Test accuracy: 81.17% - Time duration: 2.73 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

Con stemming.

In [None]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

  % sorted(inconsistent)


Test accuracy: 79.69% - Time duration: 56.67 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', LinearSVC())])

In [None]:
SVMclf2 = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LinearSVC())]
)

train_fn(SVMclf2, data, target)

  % sorted(inconsistent)


Test accuracy: 79.39% - Time duration: 28.68 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f3b39e85b00>)),
                ('classifier', LinearSVC())])

### 4.4. Perceptron

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  perclf = Pipeline(
      [('vectorizer',TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', Perceptron(alpha=alpha))]
  )

  train_fn(perclf, data, target)

Test accuracy: 73.38% - Time duration: 2.27 s
Test accuracy: 74.22% - Time duration: 2.28 s
Test accuracy: 75.37% - Time duration: 2.22 s
Test accuracy: 76.11% - Time duration: 2.26 s
Test accuracy: 74.22% - Time duration: 2.16 s
Test accuracy: 74.08% - Time duration: 2.21 s


In [None]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.001))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 75.15% - Time duration: 60.63 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', Perceptron(alpha=0.001))])

In [None]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer, stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', Perceptron(alpha=0.001))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 75.59% - Time duration: 27.78 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f3b39e85b00>)),
                ('classifier', Perceptron(alpha=0.001))])

### 4.5. Logistic Regression

In [None]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 77.51% - Time duration: 91.72 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', LogisticRegression(max_iter=10000))])

In [None]:
perclf = Pipeline(
    [('vectorizer',TfidfVectorizer(tokenizer=lemmatizer_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', LogisticRegression(max_iter=10000))]
)

train_fn(perclf, data, target)

  % sorted(inconsistent)


Test accuracy: 76.44% - Time duration: 64.15 s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function lemmatizer_tokenizer at 0x7f3b39e85b00>)),
                ('classifier', LogisticRegression(max_iter=10000))])

## 5. Construcción de clasificadores - CountVectorizer

### 5.1. Multinominal Naive Bayes Classifier

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001]:
  NBclf = Pipeline(
      [('vectorizer', CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
      ('classifier', MultinomialNB(alpha=alpha))]
  )

  train_fn(NBclf, data, target)

Test accuracy: 76.74% - Time duration: 1.68 s
Test accuracy: 78.47% - Time duration: 1.67 s
Test accuracy: 78.14% - Time duration: 1.71 s
Test accuracy: 76.88% - Time duration: 1.76 s
Test accuracy: 76.77% - Time duration: 1.67 s


In [None]:
NBclf = Pipeline(
    [('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.01))]
)
train_fn(NBclf, data, target)

  % sorted(inconsistent)


Test accuracy: 77.51% - Time duration: 56.54 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', MultinomialNB(alpha=0.01))])

### 5.2. Support Vector Classification con stochatic gradient descent

In [None]:
for alpha in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:  
  SVMclf1 = Pipeline(
      [('vectorizer',CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))), 
      ('classifier', SGDClassifier(alpha=alpha))]
  )

  train_fn(SVMclf1, data, target)

Test accuracy: 59.16% - Time duration: 2.60 s
Test accuracy: 66.77% - Time duration: 2.83 s
Test accuracy: 75.44% - Time duration: 2.58 s
Test accuracy: 74.15% - Time duration: 2.99 s
Test accuracy: 69.09% - Time duration: 3.19 s
Test accuracy: 68.76% - Time duration: 3.19 s


In [None]:
SVMclf1 = Pipeline(
    [('vectorizer',CountVectorizer(tokenizer=stemming_tokenizer,stop_words=stopwords.words('english') + list(string.punctuation))), 
    ('classifier', SGDClassifier(alpha=0.01))]
  )

train_fn(SVMclf1, data, target)

  % sorted(inconsistent)


Test accuracy: 70.24% - Time duration: 57.33 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function stemming_tokenizer at 0x7f3b39ebab90>)),
                ('classifier', SGDClassifier(alpha=0.01))])