# Classificação de Texto

Exemplo de classificação de texto com pipeline de NLP e classificador no final

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score



In [3]:
# Stop Words
nltk.download('stopwords')

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Tokenização
def tokenize(text):
    # Tokenização
    tokens = nltk.word_tokenize(text)
    
    # Stemização
    stems  = []
    for item in tokens:
        stems.append(SnowballStemmer("portuguese").stem(item))
    return stems

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
def ProcessedData(df):

  # Método para ordenar os bugs em ordem cronológica de acordo com o tempo de criação
  df = df.sort_values(by=['Bug_report_ID'])

  # Removendo entradas diferentes de "blocker", "critical", "major","minor" e "trivial"
  processedDataset = df[(df.severity_level == "blocker") | (df.severity_level == "critical") | (df.severity_level == "major") | (df.severity_level == "minor") | (df.severity_level == "trivial")]

  #Concatenando colunas
  processedDataset["combinedSumAndDesc"] = processedDataset["summary"] + processedDataset["description"]

  #Removendo colunas desnecessárias
  processedDataset.drop('summary', inplace=True, axis=1)
  processedDataset.drop('description', inplace=True, axis=1)
  processedDataset.drop('component_name', inplace=True, axis=1)

  #Retirando valores NaN
  processedDataset['combinedSumAndDesc'].isnull().sum()
  processedDataset = processedDataset.dropna()

  return processedDataset

In [32]:
# Realiza a separação do dataframe
def SplitDataSet(dataset):
  # Calcula tamanho
  dataSetLen = len(dataset) // 11

  # Calcula indexes
  firstIdx = 0
  lastIdx = 0

  dfs = []
  for i in range(11):
    # Calculando os indexes para retirar os dados do dataframe
    if(i != 0):
      firstIdx += dataSetLen
    lastIdx += dataSetLen

    # Realizando o iloc do datraframe entre firstIdx e lastIdx
    dfs.append(dataset.iloc[firstIdx:lastIdx])

  return dfs

In [31]:
# Realiza o treinamento e retorna a acurácia
def TrainDataset(X_train,X_test,y_train,y_test):
  # Stop Words
  stop_words = nltk.corpus.stopwords.words('portuguese')

  # NLP Pipeline
  text_clf   = Pipeline([
                  # Vectorize
                  ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                            stop_words=stop_words, 
                                            ngram_range=(1,1))),
                  # Classificador
                  ('clf',   KNeighborsClassifier(n_jobs=-1)),
              ])

  # Train
  text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)
  # Test 
  predictions = text_clf.predict(X_test.combinedSumAndDesc)
  f1 = f1_score(y_test, predictions, average='micro')

  return f1

In [36]:
for datasetNum in range(5):
  print(f"===>Training dataset: {datasetNum}")

  # Carregando o dataset
  if datasetNum == 0:
    loadedDataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/Eclipse_total.csv')
  elif datasetNum == 1:
    loadedDataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/GCC_total.csv')
  elif datasetNum == 2:
    loadedDataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/Mozilla_total.csv')
  elif datasetNum == 3:
    loadedDataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/Netbeans_total.csv')
  elif datasetNum == 4:
    loadedDataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/OpenOffice_total.csv')

  processedDataFrame = ProcessedData(loadedDataset)

  dfs = SplitDataSet(processedDataFrame)

  all_accuracy = []

  for i in range(10):
    combinedDataFrame = dfs[i].append(dfs[i+1])
    # Split Dataset
    X_train, X_test, y_train, y_test = train_test_split(combinedDataFrame[['combinedSumAndDesc']], combinedDataFrame.severity_level, random_state=42)

    # Treinando os dados e pegando a acurácia
    Trained = TrainDataset(X_train, X_test, y_train, y_test)
    all_accuracy.append(Trained)

    # Atualizando o dataframe para pegarmos o combinado na próxima iteração
    dfs[i+1] = combinedDataFrame

    # Calculando a acurácia final
    finalAcc = 0

  # Somando as acurácias
  for j in range(10):
    print(f"\tDataset {datasetNum}: Accuracy from training {j}: {all_accuracy[j]}")
    finalAcc += all_accuracy[j]

  # Mostrando a acurácia média
  print(f"\t\tDataset {datasetNum} final accuracy: {finalAcc / 10}")

===>Training dataset: 0
	Dataset 0: Accuracy from training 0: 0.4000000000000001
	Dataset 0: Accuracy from training 1: 0.40159045725646125
	Dataset 0: Accuracy from training 2: 0.4343283582089552
	Dataset 0: Accuracy from training 3: 0.41408114558472553
	Dataset 0: Accuracy from training 4: 0.40696517412935324
	Dataset 0: Accuracy from training 5: 0.4441602728047741
	Dataset 0: Accuracy from training 6: 0.43656716417910446
	Dataset 0: Accuracy from training 7: 0.4204244031830239
	Dataset 0: Accuracy from training 8: 0.41074626865671643
	Dataset 0: Accuracy from training 9: 0.41942485078676073
		Dataset 0 final accuracy: 0.4188288094789875
===>Training dataset: 1
	Dataset 1: Accuracy from training 0: 0.992
	Dataset 1: Accuracy from training 1: 0.9893048128342246
	Dataset 1: Accuracy from training 2: 0.9678714859437751
	Dataset 1: Accuracy from training 3: 0.9198717948717948
	Dataset 1: Accuracy from training 4: 0.8796791443850268
	Dataset 1: Accuracy from training 5: 0.8555045871559633
