# Classificação de Texto no contexto da Predição de Severidade de Bug Reports

Exemplo de classificação de texto com pipeline de NLP e classificador no final

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

#Classificadores
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [None]:
# Tokenização
def tokenize(text):
    """Tokenize *text* and return the English Snowball stem of every token.

    Passed as the ``tokenizer`` callable of the TF-IDF vectorizer built in
    ``TrainDataset``.

    Args:
        text: The raw document string to split into tokens.

    Returns:
        A list with one stemmed string per token, in the order produced by
        ``nltk.word_tokenize``.
    """
    # Construct the stemmer once per document instead of once per token;
    # the stems produced are the same, only the redundant setup is avoided.
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs loaded further down
# are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Realiza as modificações no dataset
def GetProcessedDataset(df):
  """Prepare a bug-report frame for binary blocker / no-blocker classification.

  Args:
    df: DataFrame with at least the columns 'summary', 'description' and
      'severity_level'.

  Returns:
    A new DataFrame (the argument is left untouched) in which 'summary' and
    'description' are cast to str, rows whose severity is 'enhancement' or
    'normal' are removed, and every remaining severity other than 'blocker'
    is relabelled "no-blocker". The original row index is preserved.

  Raises:
    KeyError: if one of the required columns is missing.
  """
  # astype returns a new frame, so the caller's object is never modified.
  df = df.astype({'summary':'str', 'description':'str'})

  # Drop the severities the study does not use as labels.
  excluded = df['severity_level'].isin(['enhancement', 'normal'])

  # Take an explicit copy of the filtered rows: assigning through .loc on a
  # filtered selection is the pattern pandas reports as SettingWithCopy.
  df = df.loc[~excluded].copy()

  # Collapse every non-blocker severity into a single class.
  df.loc[df['severity_level'] != 'blocker', 'severity_level'] = "no-blocker"

  return df

In [None]:
# Download the NLTK stop-word corpus; TrainDataset reads the English list
# through nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

# Download the 'punkt' tokenizer data (presumably what nltk.word_tokenize
# in tokenize() relies on).
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Realiza a separação do dataframe
def SplitDataSet(dataset, num_parts=11):
  """Split *dataset* into consecutive, non-overlapping row chunks.

  Args:
    dataset: DataFrame to split; rows are taken positionally with ``iloc``.
    num_parts: Number of chunks to produce. Defaults to 11, the value the
      evaluation loop in this notebook expects.

  Returns:
    A list of ``num_parts`` DataFrames. The first ``num_parts - 1`` chunks
    each hold ``len(dataset) // num_parts`` rows; the final chunk also takes
    the ``len(dataset) % num_parts`` leftover rows so that no row is
    silently discarded.

  Raises:
    ValueError: if ``num_parts`` is smaller than 1.
  """
  if num_parts < 1:
    raise ValueError("num_parts must be at least 1")

  chunk_len = len(dataset) // num_parts

  # Equal-sized leading chunks.
  parts = [
    dataset.iloc[k * chunk_len:(k + 1) * chunk_len]
    for k in range(num_parts - 1)
  ]

  # The open-ended slice lets the last chunk absorb the division remainder.
  parts.append(dataset.iloc[(num_parts - 1) * chunk_len:])

  return parts

In [None]:
# Realiza o treinamento e retorna a acurácia
def TrainDataset(X_train,X_test,y_train,y_test):
  """Fit a TF-IDF + linear SVC pipeline and score it on the test split.

  Args:
    X_train: Training features; only its ``description`` attribute is used.
    X_test: Test features; only its ``description`` attribute is used.
    y_train: Training labels.
    y_test: Test labels the predictions are compared against.

  Returns:
    The micro-averaged F1 score of the predictions on the test split
    (the calling loop reports this value as the accuracy).
  """
  english_stop_words = nltk.corpus.stopwords.words('english')

  # Text vectorizer: unigrams and bigrams, tokenized and stemmed by tokenize().
  vectorizer = TfidfVectorizer(tokenizer=tokenize,
                               stop_words=english_stop_words,
                               ngram_range=(1,2))

  # Linear-kernel support vector classifier.
  classifier = SVC(kernel='linear', C=1E10)

  model = Pipeline([
      ('vect', vectorizer),
      ('clf', classifier),
  ])

  # Fit on the training descriptions, then predict the held-out ones.
  model.fit(X_train.description, y_train)
  predictions = model.predict(X_test.description)

  return f1_score(y_test, predictions, average='micro')

In [None]:
# Datasets evaluated in order; the list position is the dataset number
# shown in the printed report.
dataset_paths = [
  '/content/drive/My Drive/TrabFinal_Mineracao/Datasets/Eclipse_total.csv',
  '/content/drive/My Drive/TrabFinal_Mineracao/Datasets/GCC_total.csv',
  '/content/drive/My Drive/TrabFinal_Mineracao/Datasets/Mozilla_total.csv',
  '/content/drive/My Drive/TrabFinal_Mineracao/Datasets/Netbeans_total.csv',
  '/content/drive/My Drive/TrabFinal_Mineracao/Datasets/OpenOffice_total.csv',
]

for datasetNum, datasetPath in enumerate(dataset_paths):
  print(f"===>Training dataset: {datasetNum}")

  # Load the dataset
  loadedDataset = pd.read_csv(datasetPath)

  # Pre-process it (string casts, severity filtering, binary labels)
  processedDataframe = GetProcessedDataset(loadedDataset)

  # Split the frame into consecutive parts (11 by default)
  dfs = SplitDataSet(processedDataframe)

  # Score obtained in each training round
  all_accuracy = []

  # One round per adjacent pair of parts (10 rounds for 11 parts)
  for i in range(len(dfs) - 1):
    # Join part i with part i + 1. pd.concat is used because
    # DataFrame.append no longer exists in pandas 2.x.
    combinedDataframe = pd.concat([dfs[i], dfs[i+1]])

    # Train/test split with a fixed seed so runs are repeatable
    X_train, X_test, y_train, y_test = train_test_split(combinedDataframe[['description']], combinedDataframe.severity_level, random_state=42)

    # Train, evaluate and store the score
    trainedAcc = TrainDataset(X_train,X_test,y_train,y_test)
    all_accuracy.append(trainedAcc)

    # Store the combined frame so the next round extends it with one more
    # part: the data used grows cumulatively from round to round.
    dfs[i+1] = combinedDataframe

  # Accumulate the per-round scores while printing them
  finalAcc = 0
  for j, roundAcc in enumerate(all_accuracy):
    print(f"\tDataset {datasetNum}: Accuracy from training {j}: {roundAcc}")
    finalAcc += roundAcc

  # Report the mean score over all rounds
  print(f"\t\tDataset {datasetNum} final accuracy: {finalAcc / len(all_accuracy)}")

===>Training dataset: 0
	Dataset 0: Accuracy from training 0: 0.8372781065088757
	Dataset 0: Accuracy from training 1: 0.8639053254437871
	Dataset 0: Accuracy from training 2: 0.8609467455621301
	Dataset 0: Accuracy from training 3: 0.834319526627219
	Dataset 0: Accuracy from training 4: 0.8648915187376726
	Dataset 0: Accuracy from training 5: 0.8605240912933221
	Dataset 0: Accuracy from training 6: 0.8809171597633136
	Dataset 0: Accuracy from training 7: 0.886259040105194
	Dataset 0: Accuracy from training 8: 0.8940828402366863
	Dataset 0: Accuracy from training 9: 0.8983324367939752
		Dataset 0 final accuracy: 0.8681456791072175
===>Training dataset: 1
	Dataset 1: Accuracy from training 0: 0.864
	Dataset 1: Accuracy from training 1: 0.8663101604278075
	Dataset 1: Accuracy from training 2: 0.9196787148594378
	Dataset 1: Accuracy from training 3: 0.9198717948717948
	Dataset 1: Accuracy from training 4: 0.9197860962566845
	Dataset 1: Accuracy from training 5: 0.9220183486238532
	Dataset