In [0]:
#1 - Importing the libraries
import re  
import nltk  
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV 
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# 2 - Load the dataset
# fetch_20newsgroups() is called with its default arguments; the raw texts go
# to `x` and the integer class labels to `y`.
# NOTE(review): the sklearn default is the training subset only - confirm that
# this (and not subset='all') is what the experiment intends.
newsgroups_data = fetch_20newsgroups()
x = newsgroups_data.data
y = newsgroups_data.target

In [0]:
# 3 - Text pre-processing
# NOTE: WordNetLemmatizer needs the WordNet corpus to be available locally
# (e.g. nltk.download('wordnet')) before lemmatize() is first called.
# The variable name `stemmer` is kept for compatibility; it is a lemmatizer.
stemmer = WordNetLemmatizer()


def preprocess_document(text):
    """Normalize one raw document into a lower-cased, lemmatized string.

    Steps: replace non-word characters with spaces, drop isolated single
    letters, lower-case, lemmatize each token and re-join the tokens with
    single spaces.

    Parameters
    ----------
    text : object
        Raw document; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        Space-separated lemmatized tokens.
    """
    # Replace every special (non-word) character with a space.
    document = re.sub(r'\W', ' ', str(text))

    # Remove isolated single letters. The trailing whitespace is only checked
    # with a lookahead (not consumed) so that consecutive isolated letters
    # such as " a b c " are all removed, not just every other one.
    document = re.sub(r'\s+[a-zA-Z](?=\s)', '', document)

    # Remove an isolated single letter at the very start of the document.
    # `^` must be the unescaped anchor: the previous pattern `\^` looked for a
    # literal caret, which can never be present after the \W substitution.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Lower-case before lemmatizing.
    document = document.lower()

    # split() discards leading/trailing whitespace and treats any run of
    # whitespace as one separator, so joining with ' ' also collapses
    # multiple spaces into a single one.
    return ' '.join(stemmer.lemmatize(word) for word in document.split())


documents = [preprocess_document(raw_text) for raw_text in x]

In [0]:
# 4 - Pipeline and hyper-parameter grid
# The max_df/min_df values given here are only starting values; the grid
# below overrides them during the search.
steps = [
        ('tfidf', TfidfVectorizer(max_df=0.90, min_df=0.05)),
        ('svc', SVC())
]
pipe = Pipeline(steps=steps)

# Values shared by every kernel-specific sub-grid.
_common_grid = {
        'tfidf__max_df': [0.25, 0.5, 0.75, 0.85, 0.95, 1.0],
        'tfidf__min_df': [0.0, 0.01, 0.02, 0.03, 0.04, 0.05],
        # C is the inverse of the regularization strength: the larger C is,
        # the LESS tolerant the model is to misclassified training samples.
        'svc__C': list(range(1, 11)),
}

# A list of grids is used so that each hyper-parameter is only varied for the
# kernels that actually use it:
#   * 'degree' only affects the 'poly' kernel; crossing it with the other
#     kernels just repeated identical fits ten times.
#   * 'gamma' is the coefficient of the 'rbf', 'poly' and 'sigmoid' kernels.
#   * the 'precomputed' kernel was removed: it expects a square
#     (n_samples, n_samples) kernel matrix as input, so every fit on the
#     TF-IDF feature matrix would fail.
params = [
        dict(_common_grid, svc__kernel=['linear']),
        dict(_common_grid, svc__kernel=['rbf', 'sigmoid'], svc__gamma=['auto']),
        dict(_common_grid, svc__kernel=['poly'], svc__gamma=['auto'],
             svc__degree=list(range(1, 11))),
]
scoring = ['accuracy','f1_micro', 'f1_macro']

In [0]:
# 5 - Grid search over the pipeline hyper-parameters.
# Every metric listed in `scoring` is evaluated; the final model is refit on
# the whole input using the configuration with the best 'accuracy'.
clf = GridSearchCV(
    pipe,
    params,
    cv=10,
    n_jobs=-1,
    verbose=True,
    scoring=scoring,
    refit='accuracy',
    return_train_score=True,
)
clf.fit(documents, y)

In [0]:
# 6 - Persist the cross-validation results table as a CSV file
df = pd.DataFrame(data=clf.cv_results_)
df.to_csv(path_or_buf='resultado_pipeline_svm.csv')