# Classificação de Texto

Exemplo de classificação de texto com pipeline de NLP e classificador no final

In [157]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score



In [2]:
# Tokenização
def tokenize(text):
    # Tokenização
    tokens = nltk.word_tokenize(text)
    
    # Stemização
    stems  = []
    for item in tokens:
        stems.append(SnowballStemmer("portuguese").stem(item))
    return stems

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#df = pd.read_csv('articles.csv')

# Carregando os dados
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Trabalho_Final/Case_Study/Eclipse_total.csv') # Salvamos com esse nome antes

# Dataset
df.head(10)

Unnamed: 0,Bug_report_ID,Product_Name,summary,description,component_name,severity_level,priority,assignee_developers,topic_id,status
0,221036,Community,unabl enter request parti code cq,report enter request parti librari submit java...,Project Management & Portal,blocker,P1,bjorn.freeman-benson,d_a_carver bjorn.freeman-benson karl.matthias,FIXED
1,221203,Community,submit request cq cq submit,submit request cq cq submit creat screen retur...,Project Management & Portal,blocker,P1,bjorn.freeman-benson,d_a_carver d_a_carver bjorn.freeman-benson kar...,FIXED
2,243952,Community,ca submit cq rd parti contribut develop apach,valid apach url enter url project page apach p...,Project Management & Portal,blocker,P1,bjorn.freeman-benson,ekuleshov ekuleshov bjorn.freeman-benson ekule...,FIXED
3,230229,Community,tool link fail ie,webmast email tool link respond js error error...,Project Management & Portal,blocker,P1,gabe.obrien,webmaster webmaster bjorn.freeman-benson karl....,FIXED
4,229095,Community,cron total broken,portal cron job broken friday gabe debug fix c...,Project Management & Portal,blocker,P1,portal-inbox,karl.matthias karl.matthias karl.matthias karl...,FIXED
5,109230,Community,gener roadmap project inform file,xslt css gener roadmap eclipse project info xm...,Process,critical,P1,eromero,bjorn.freeman-benson techtonik bjorn.freeman-b...,FIXED
6,275926,Community,cq for record blow address charact,appear addslash stripslash appli properli case...,Project Management & Portal,critical,P1,portal-inbox,karl.matthias karl.matthias karl.matthias,FIXED
7,208204,Community,javascript error submit cq,submit request refer distribut third parti cod...,Project Management & Portal,critical,P1,gabe.obrien,aldo_eisma bjorn.freeman-benson gabe.obrien ka...,FIXED
8,229830,Community,alexey pavlov nomin email address,portal allow nomin miss email address fix,Project Management & Portal,critical,P1,portal-inbox,karl.matthias bjorn.freeman-benson karl.matthi...,FIXED
9,191261,Community,role upload imag inconsist role edit web page,request design peopl role updat organ websit m...,Project Management & Portal,critical,P1,portal-inbox,donald.smith donald.smith bjorn.freeman-benson...,FIXED


In [5]:
class_names = df['Bug_report_ID'].unique()
class_names

array([221036, 221203, 243952, ...,   5729,   5732,   5727])

In [6]:
# Método para ordenar os bugs em ordem cronológica de acordo com o tempo de criação
df = df.sort_values(by=['Bug_report_ID'])
df.tail(10)

Unnamed: 0,Bug_report_ID,Product_Name,summary,description,component_name,severity_level,priority,assignee_developers,topic_id,status
16690,484674,Community,publish neon m instal,download locat neon instal home data httpd dow...,Website,normal,P3,phoenix.ui-inbox,stepper chris.guindon,FIXED
1872,484694,Community,name convent refer subproject,page http wiki eclipse org naming convent refe...,Process,normal,P3,emo,etienne.juliot wayne wayne etienne.juliot,FIXED
9953,484737,Community,resolved fix bug appear search assigned,bug mark resolved fix queri show assign resolu...,Bugzilla,normal,P3,webmaster,stephan.herrmann denis.roy stephan.herrmann,FIXED
13689,484768,Community,thym hipp miss gtk,thym hipp error log run ui test java lang unsa...,Hudson,blocker,P3,hudson.admin-inbox,mistria denis.roy denis.roy mistria mikael mis...,FIXED
11731,484769,Community,cla sign push reject,push gerrit reject sign cla day ago miss contr...,Gerrit,normal,P3,webmaster,rpmc22 webmaster,FIXED
1229,484804,Community,delet corrupt smarthom b releas repo,deploy job fail middl execut jvm crash incompl...,Nexus,normal,P3,webmaster,kai mikael,FIXED
15105,484808,Community,instal jdk swtbot hipp,order build test swtbot neon mileston jdk http...,Hudson,normal,P3,hudson.admin-inbox,lorenzo.bettini mikael lorenzo.bettini,FIXED
15106,484809,Community,upgrad jgit hipp hudson fail,upgrad jgit hipp hudson hipp servic portal app...,Hudson,critical,P3,hudson.admin-inbox,matthias.sohn mikael matthias.sohn,FIXED
15107,484850,Community,upgrad egit hipp hudson fail,upgrad egit hipp time upgrad fine hudson compl...,Hudson,normal,P3,hudson.admin-inbox,matthias.sohn mikael,FIXED
1953,484884,Community,add blog eclips triquetrum project,dear add triquetrum project blog planet eclips...,PlanetEclipse.org,normal,P3,planeteclipse.admin-inbox,erwindl0 genie jbr erwindl0 genie jbr,FIXED


In [37]:
class_names = df['status'].unique()
class_names

array(['FIXED'], dtype=object)

In [41]:
class_names = df['severity_level'].unique()
class_names

array(['normal', 'major', 'critical', 'enhancement', 'trivial', 'minor',
       'blocker'], dtype=object)

Removendo entradas diferentes de "blocker", "critical", "major","minor" e "trivial"

In [44]:
processedDataset = df[(df.severity_level == "blocker") | (df.severity_level == "critical") | (df.severity_level == "major") | (df.severity_level == "minor") | (df.severity_level == "trivial")]
processedDataset.head()

Unnamed: 0,Bug_report_ID,Product_Name,summary,description,component_name,severity_level,priority,assignee_developers,topic_id,status
24866,43,Platform,renam project releas gdhahf,project junk releas teamstream renam project a...,Team,major,P1,jean-michel_lemieux,Dave_Dykstal jean-michel_lemieux James_Moody d...,FIXED
25044,163,Platform,except disconnect gf qca,ak pm except night work home connect internet ...,Team,critical,P3,jean-michel_lemieux,jean-michel_lemieux jean-michel_lemieux James_...,FIXED
37141,343,PDE,dbcs pde pluin manifest editor display dbc cha...,dbcs pde sourc page plug in manifest editor di...,UI,critical,P3,dejan,dejan dj_houghton dejan,FIXED
37150,345,PDE,dbc except thread launch plugin project code r...,dbc except thread error messag launch plug in ...,UI,critical,P3,dejan,dejan dj_houghton rodrigo dejan,FIXED
21869,1749,Platform,debugg show sourc type gener fail execut runna...,problem problem ran target eclips debugg a ins...,UI,critical,P1,kai-uwe_maetzel,nick_edgar dj_houghton Darin_Swanson nick_edga...,FIXED


In [None]:
#Concatenando colunas
processedDataset["combinedSumAndDesc"] = processedDataset["summary"] + processedDataset["description"]
processedDataset.head()

In [None]:
#Removendo colunas desnecessárias
processedDataset.drop('summary', inplace=True, axis=1)
processedDataset.drop('description', inplace=True, axis=1)
processedDataset.drop('component_name', inplace=True, axis=1)


In [71]:
processedDataset.head()

Unnamed: 0,Bug_report_ID,Product_Name,severity_level,priority,assignee_developers,topic_id,status,combinedSumAndDesc
24866,43,Platform,major,P1,jean-michel_lemieux,Dave_Dykstal jean-michel_lemieux James_Moody d...,FIXED,renam project releas gdhahf project junk relea...
25044,163,Platform,critical,P3,jean-michel_lemieux,jean-michel_lemieux jean-michel_lemieux James_...,FIXED,except disconnect gf qca ak pm except night wo...
37141,343,PDE,critical,P3,dejan,dejan dj_houghton dejan,FIXED,dbcs pde pluin manifest editor display dbc cha...
37150,345,PDE,critical,P3,dejan,dejan dj_houghton rodrigo dejan,FIXED,dbc except thread launch plugin project code r...
21869,1749,Platform,critical,P1,kai-uwe_maetzel,nick_edgar dj_houghton Darin_Swanson nick_edga...,FIXED,debugg show sourc type gener fail execut runna...


In [49]:
#Retirando valores NaN
processedDataset['combinedSumAndDesc'].isnull().sum()

62

In [50]:
processedDataset = processedDataset.dropna()
processedDataset['combinedSumAndDesc'].isnull().sum()

0

In [52]:
#descobrindo a quantidade de linhas do DataFrame
processedDataset.shape

(7380, 8)

In [53]:
#Dividindo em 11 partes iguais, por ser um valor real adicionei +1 para corrigir o problema e certificar que todos os Bugs serão incluídos
a = (7380//11)+1
a

671

In [None]:
df0 = processedDataset.iloc[:(a*1)-1]
df0

In [73]:
df1 = processedDataset.iloc[(a*1):(a*2)-1]
df1

Unnamed: 0,Bug_report_ID,Product_Name,severity_level,priority,assignee_developers,topic_id,status,combinedSumAndDesc
35609,22508,JDT,minor,P3,aeschli,scadmira aeschli,FIXED,add variabl window small not persist build pat...
21986,22568,Platform,critical,P1,csmclaren,daniel_megert simon_arsenault daniel_megert si...,FIXED,key bind f f key broken build reason f f stop ...
35680,22573,JDT,minor,P3,aeschli,daniel_megert aeschli daniel_megert aeschli,FIXED,ca make type hierarchi histori dialog type hie...
31571,22624,JDT,major,P3,aeschli,aeschli aeschli,FIXED,renam cu quick fix except renam exist cu java ...
25252,22926,Platform,major,P3,veronika_irvine,knut_radloff knut_radloff knut_radloff m.moebi...,FIXED,clipboard copy past cut past window build fix ...
...,...,...,...,...,...,...,...,...
5535,46272,z_Archived,major,P3,richkulp,walkerp richkulp richkulp walkerp richkulp ric...,FIXED,save function w ve open throw except bean long...
31541,46329,JDT,major,P3,aeschli,dirk_baeumer dirk_baeumer dirk_baeumer aeschli...,FIXED,pasteactiontest test failt n convert rcp test ...
31555,46360,JDT,major,P3,jdt-ui-inbox,idzelis dirk_baeumer aeschli aeschli dirk_baeu...,FIXED,code gener newtyp templat present german local...
36756,46437,PDE,major,P1,pde-ui-inbox,jared_burns jared_burns jared_burns douglas.po...,FIXED,ctrl c copi build ctrl c copi text anymor


In [None]:
df2 = processedDataset.iloc[(a*2):(a*3)-1]
df2

In [None]:
df3 = processedDataset.iloc[(a*3):(a*4)-1]
df3

In [None]:
df4 = processedDataset.iloc[(a*4):(a*5)-1]
df4

In [None]:
df5 = processedDataset.iloc[(a*5):(a*6)-1]
df5

In [None]:
df6 = processedDataset.iloc[(a*6):(a*7)-1]
df6

In [None]:
df7 = processedDataset.iloc[(a*7):(a*8)-1]
df7

In [None]:
df8 = processedDataset.iloc[(a*8):(a*9)-1]
df8

In [None]:
df9 = processedDataset.iloc[(a*9):(a*10)-1]
df9

In [None]:
df10 = processedDataset.iloc[(a*10):(a*11)-1]
df10

Parte 1

In [98]:
#concatenando os dois DataFrames para treinamento e teste
DF = pd.merge(df0, df1, how = 'outer')
DF

Unnamed: 0,Bug_report_ID,Product_Name,severity_level,priority,assignee_developers,topic_id,status,combinedSumAndDesc
0,43,Platform,major,P1,jean-michel_lemieux,Dave_Dykstal jean-michel_lemieux James_Moody d...,FIXED,renam project releas gdhahf project junk relea...
1,163,Platform,critical,P3,jean-michel_lemieux,jean-michel_lemieux jean-michel_lemieux James_...,FIXED,except disconnect gf qca ak pm except night wo...
2,343,PDE,critical,P3,dejan,dejan dj_houghton dejan,FIXED,dbcs pde pluin manifest editor display dbc cha...
3,345,PDE,critical,P3,dejan,dejan dj_houghton rodrigo dejan,FIXED,dbc except thread launch plugin project code r...
4,1749,Platform,critical,P1,kai-uwe_maetzel,nick_edgar dj_houghton Darin_Swanson nick_edga...,FIXED,debugg show sourc type gener fail execut runna...
...,...,...,...,...,...,...,...,...
1335,46272,z_Archived,major,P3,richkulp,walkerp richkulp richkulp walkerp richkulp ric...,FIXED,save function w ve open throw except bean long...
1336,46329,JDT,major,P3,aeschli,dirk_baeumer dirk_baeumer dirk_baeumer aeschli...,FIXED,pasteactiontest test failt n convert rcp test ...
1337,46360,JDT,major,P3,jdt-ui-inbox,idzelis dirk_baeumer aeschli aeschli dirk_baeu...,FIXED,code gener newtyp templat present german local...
1338,46437,PDE,major,P1,pde-ui-inbox,jared_burns jared_burns jared_burns douglas.po...,FIXED,ctrl c copi build ctrl c copi text anymor


In [99]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.5, train_size=0.5, random_state=42)
X_train.shape

(670, 1)

In [100]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [101]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.3940298507462687

Parte 2

In [102]:
DF = pd.merge(DF, df2, how = 'outer')
DF

Unnamed: 0,Bug_report_ID,Product_Name,severity_level,priority,assignee_developers,topic_id,status,combinedSumAndDesc
0,43,Platform,major,P1,jean-michel_lemieux,Dave_Dykstal jean-michel_lemieux James_Moody d...,FIXED,renam project releas gdhahf project junk relea...
1,163,Platform,critical,P3,jean-michel_lemieux,jean-michel_lemieux jean-michel_lemieux James_...,FIXED,except disconnect gf qca ak pm except night wo...
2,343,PDE,critical,P3,dejan,dejan dj_houghton dejan,FIXED,dbcs pde pluin manifest editor display dbc cha...
3,345,PDE,critical,P3,dejan,dejan dj_houghton rodrigo dejan,FIXED,dbc except thread launch plugin project code r...
4,1749,Platform,critical,P1,kai-uwe_maetzel,nick_edgar dj_houghton Darin_Swanson nick_edga...,FIXED,debugg show sourc type gener fail execut runna...
...,...,...,...,...,...,...,...,...
2005,71823,PDE,critical,P3,pde-ui-inbox,ppshah ppshah sonia_dimitrov ppshah wassim.mel...,FIXED,empti manifest file includ updat site download...
2006,71854,z_Archived,major,P1,richkulp,pramod_varma gmendel walkerp walkerp walkerp r...,FIXED,start ve design composit start develop gener r...
2007,71872,JDT,minor,P3,jdt-text-inbox,kaly_white Olivier_Thomann daniel_megert danie...,FIXED,misc javadoc view display link tag correctli v...
2008,71939,z_Archived,major,P1,gmendel,steve.turner richkulp steve.turner richkulp st...,FIXED,bean viewer stop introspect abstract method cr...


In [103]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.33, train_size=0.67, random_state=42)
X_train.shape

(1346, 1)

In [104]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [105]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.39759036144578314

Parte 3

In [106]:
DF = pd.merge(DF, df3, how = 'outer')
DF

Unnamed: 0,Bug_report_ID,Product_Name,severity_level,priority,assignee_developers,topic_id,status,combinedSumAndDesc
0,43,Platform,major,P1,jean-michel_lemieux,Dave_Dykstal jean-michel_lemieux James_Moody d...,FIXED,renam project releas gdhahf project junk relea...
1,163,Platform,critical,P3,jean-michel_lemieux,jean-michel_lemieux jean-michel_lemieux James_...,FIXED,except disconnect gf qca ak pm except night wo...
2,343,PDE,critical,P3,dejan,dejan dj_houghton dejan,FIXED,dbcs pde pluin manifest editor display dbc cha...
3,345,PDE,critical,P3,dejan,dejan dj_houghton rodrigo dejan,FIXED,dbc except thread launch plugin project code r...
4,1749,Platform,critical,P1,kai-uwe_maetzel,nick_edgar dj_houghton Darin_Swanson nick_edga...,FIXED,debugg show sourc type gener fail execut runna...
...,...,...,...,...,...,...,...,...
2675,105103,z_Archived,major,P3,jstinton,camle camle gmendel richkulp jstinton richkulp,FIXED,tvt tct ve english popup ve paus os rhel fix b...
2676,105184,PDE,major,P3,pde-ui-inbox,mtveety mtveety konradk konradk wasleski konradk,FIXED,tvt tct truncat text os rhel fix sever sever b...
2677,105235,Platform,major,P3,Michael_Rennie,kitlo kitlo cocoakevin Tod_Creasey kitlo Tod_C...,FIXED,tvt tct mnemon char middl dbc menu label strin...
2678,105295,PDE,major,P3,pde-ui-inbox,molk molk konradk molk konradk molk jeffmcaffe...,FIXED,product export export work product export triv...


In [125]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.25, train_size=0.75, random_state=42)
X_train.shape

(4020, 1)

In [126]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [127]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4305970149253731

Parte 4

In [None]:
DF = pd.merge(DF, df4, how = 'outer')
DF

In [129]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.2, train_size=0.8, random_state=42)
X_train.shape

(4324, 1)

In [130]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [131]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.43530499075785584

Parte 5

In [None]:
DF = pd.merge(DF, df5, how = 'outer')
DF

In [133]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.16, train_size=0.84, random_state=42)
X_train.shape

(4569, 1)

In [134]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [135]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4500574052812859

Parte 6

In [None]:
DF = pd.merge(DF, df6, how = 'outer')
DF

In [137]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.14, train_size=0.86, random_state=42)
X_train.shape

(4683, 1)

In [138]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [139]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4796854521625164

Parte 7

In [None]:
DF = pd.merge(DF, df7, how = 'outer')
DF

In [141]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.125, train_size=0.875, random_state=42)
X_train.shape

(4803, 1)

In [142]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [143]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.49344978165938863

Parte 8

In [None]:
DF = pd.merge(DF, df8, how = 'outer')
DF

In [145]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.11, train_size=0.89, random_state=42)
X_train.shape

(5482, 1)

In [146]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [147]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4646017699115044

Parte 9

In [None]:
DF = pd.merge(DF, df9, how = 'outer')
DF

In [149]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.1, train_size=0.9, random_state=42)
X_train.shape

(6147, 1)

In [150]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [151]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4465592972181552

Parte 10

In [None]:
DF = pd.merge(DF, df10, how = 'outer')
DF

In [153]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(DF[['combinedSumAndDesc']], DF.severity_level,  test_size = 0.09, train_size=0.91, random_state=42)
X_train.shape

(6825, 1)

In [154]:
# Stop Words

nltk.download('stopwords')

nltk.download('punkt')

stop_words = nltk.corpus.stopwords.words('portuguese')

# NLP Pipeline
text_clf   = Pipeline([
                # Vectorize
                ('vect',  TfidfVectorizer(tokenizer=tokenize, 
                                          stop_words=stop_words, 
                                          ngram_range=(1,1))),
                # Classificador
                ('clf',   KNeighborsClassifier(n_jobs=-1)),
            ])

# Train
text_clf = text_clf.fit(X_train.combinedSumAndDesc, y_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [155]:
# Evaluate
text_clf.score(X_test.combinedSumAndDesc, y_test)

0.4222222222222222

In [158]:
predictions = text_clf.predict(X_test.combinedSumAndDesc)
f1_score(y_test, predictions, average='weighted')

0.4115622365663197