In [1]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score

In [2]:
# Raise pandas' display limits so the wide result tables shown below are not
# truncated when rendered.
pd.options.display.max_columns = 210
pd.options.display.max_rows = 100

In [3]:
# Directory (relative to the notebook) that holds the pickled result tables
# this notebook reads and writes.
result_path = './Resultados/'

In [4]:
# Labels of the text representations. They are looked up by position when each
# loaded result table is tagged with its 'transform' column.
transform_name = [
    'Bow',
    'Bow_stopwords',
    'Bow_stopwords_stemming',
    'trunc_Bow',
    'trunc_Bow_stopwords',
    'trunc_Bow_stopwords_stemming',
]

In [5]:
def format_params(x):
    """Render an SVM parameter mapping as a compact ``name=value`` string.

    Parameters
    ----------
    x : mapping
        Parameter dict. Keys may carry the pipeline step prefix
        (``'SVM__C'``) or be bare (``'C'``); when both forms are present the
        prefixed one wins.

    Returns
    -------
    str
        Comma-separated ``name=value`` pairs, always in the order
        kernel, C, gamma, coef0, degree. Parameters missing from ``x`` are
        skipped, keys that are not one of these five are ignored, and an
        empty mapping yields ``''``.
    """
    prefix = 'SVM__'
    names = ['kernel', 'C', 'gamma', 'coef0', 'degree']

    parts = []
    for name in names:
        # Look the parameter up by name instead of by position, so a dict
        # that omits a parameter (or has un-prefixed keys) no longer raises.
        for key in (prefix + name, name):
            if key in x:
                parts.append(name + '=' + str(x[key]))
                break

    return ','.join(parts)

In [6]:
def _load_grid(filename, transform, renames=None):
    """Read one pickled result table from ``result_path`` and tag it.

    Parameters
    ----------
    filename : str
        File name inside ``result_path``.
    transform : str
        Value written to the new 'transform' column (inserted at position 7).
    renames : dict, optional
        Column renames applied before the 'transform' column is inserted.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the extra 'transform' column.
    """
    # NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
    frame = pd.read_pickle(os.path.join(result_path, filename))
    if renames is not None:
        frame.rename(columns=renames, inplace=True)
    frame.insert(7, 'transform', transform)
    return frame


# Tables tagged with one of the transform_name labels.
grid_0 = _load_grid('grid_0_results.pkl', transform_name[0])
grid_1 = _load_grid('grid_1_results.pkl', transform_name[1])
grid_2 = _load_grid('grid_2_results.pkl', transform_name[2])
grid_3 = _load_grid('grid_3_results.pkl', transform_name[3])
grid_4 = _load_grid('grid_4_results.pkl', transform_name[4])
grid_5 = _load_grid('grid_5_results.pkl', transform_name[5])
# NOTE(review): grid_6 is tagged with transform_name[0], the same label as
# grid_0 -- confirm this is intended.
grid_6 = _load_grid('grid_6_results.pkl', transform_name[0])

# These two tables use un-prefixed parameter column names; rename them to the
# 'param_SVM__*' names used by the other tables so the columns line up.
grid_7 = _load_grid(
    'linear_rbf_results.pkl',
    'NonLinguisticFeature',
    renames={'param_C':'param_SVM__C','param_kernel':'param_SVM__kernel','param_gamma':'param_SVM__gamma'},
)
grid_8 = _load_grid(
    'poly_results.pkl',
    'NonLinguisticFeature',
    renames={'param_C':'param_SVM__C','param_kernel':'param_SVM__kernel','param_gamma':'param_SVM__gamma',
             'param_coef0':'param_SVM__coef0','param_degree':'param_SVM__degree'},
)

# Stack everything into one table. Each frame keeps its own row labels
# (no ignore_index), so labels in `results` may repeat.
results = pd.concat([grid_0, grid_1, grid_2, grid_3, grid_4, grid_5, grid_6, grid_7, grid_8])

In [7]:
# Number of rows in the combined `results` table.
len(results)

1758

Resultados filtrados pela feature e pelo kernel utilizado

In [8]:
# Best configuration for each (feature representation, kernel) pair.
group_keys = ['transform', 'param_SVM__kernel']
summary_cols = ['transform','param_SVM__kernel', 'param_SVM__C','param_SVM__gamma','param_SVM__coef0','param_SVM__degree','mean_test_f1','mean_test_precision','mean_test_recall','mean_test_accuracy']

# Select on mean_test_f1 instead of rank_test_f1. A rank is only meaningful
# within the table it came from, and `results` stacks several tables that can
# share a 'transform' label (grid_0 and grid_6 are both 'Bow'). Within a single
# table the best rank and the highest mean F1 pick the same rows.
idx = results.groupby(group_keys)['mean_test_f1'].transform('max') == results['mean_test_f1']
best_results = results.loc[idx, summary_cols]

# Tie-break 1: keep the smallest C. The string alias 'min' replaces the
# builtin `min`, whose use as a groupby callable is deprecated in pandas.
idx = best_results.groupby(group_keys)['param_SVM__C'].transform('min') == best_results['param_SVM__C']
best_results = best_results[idx]

# Tie-break 2: keep the lowest polynomial degree. This is computed per group
# rather than by dropping a hard-coded row label, because labels repeat across
# the concatenated tables. Kernels without a degree (NaN) get a -1 placeholder
# so their rows compare equal to the group minimum and are kept.
best_results = best_results.assign(
    _degree=pd.to_numeric(best_results['param_SVM__degree'], errors='coerce').fillna(-1).to_numpy()
)
idx = best_results.groupby(group_keys)['_degree'].transform('min') == best_results['_degree']
best_results = best_results[idx].drop(columns='_degree')

# Final ordering and display label.
best_results = best_results.sort_values(['transform','mean_test_f1'],ascending=False)
best_results = best_results.assign(
    transform=best_results['transform'].replace('NonLinguisticFeature','LinguisticFeature')
)
best_results

Unnamed: 0,transform,param_SVM__kernel,param_SVM__C,param_SVM__gamma,param_SVM__coef0,param_SVM__degree,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
18,trunc_Bow_stopwords_stemming,rbf,200.0,1.0,,,0.950016,0.956272,0.943925,0.950174
74,trunc_Bow_stopwords_stemming,poly,2.0,1.0,0.0,4.0,0.947926,0.947923,0.948079,0.947743
2,trunc_Bow_stopwords_stemming,linear,20.0,,,,0.940131,0.948263,0.932156,0.940451
14,trunc_Bow_stopwords,rbf,20.0,1.0,,,0.94803,0.949127,0.94704,0.947917
66,trunc_Bow_stopwords,poly,2.0,1.0,0.0,2.0,0.945843,0.942709,0.949117,0.945486
0,trunc_Bow_stopwords,linear,0.2,,,,0.942426,0.944123,0.94081,0.942361
14,trunc_Bow,rbf,20.0,1.0,,,0.951736,0.958262,0.94531,0.95191
74,trunc_Bow,poly,2.0,1.0,0.0,4.0,0.950855,0.950916,0.950848,0.950694
1,trunc_Bow,linear,2.0,,,,0.946465,0.953674,0.939425,0.946701
102,LinguisticFeature,poly,20000.0,0.01,100.0,2.0,0.936585,0.933071,0.940161,0.936111


In [71]:
# Display the row(s) of best_results that reach the highest mean F1.
print('melhor modelo:')
top_f1 = best_results['mean_test_f1'].max()
best_results.loc[best_results['mean_test_f1'].eq(top_f1)]

melhor modelo:


Unnamed: 0,transform,param_SVM__kernel,param_SVM__C,param_SVM__gamma,param_SVM__coef0,param_SVM__degree,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
48,Bow_stopwords,poly,0.2,0.01,10,3,0.964879,0.959944,0.969886,0.964583


In [13]:
# Persist the summary table in the results directory. os.path.join matches
# how the other result files are addressed and does not rely on result_path
# ending with a path separator.
best_results.to_pickle(os.path.join(result_path, 'best_results.pkl'))

In [10]:
# Reload the saved summary table from the results directory. os.path.join
# matches how the other result files are addressed and does not rely on
# result_path ending with a path separator.
best_results = pd.read_pickle(os.path.join(result_path, 'best_results.pkl'))
best_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_SVM__C,param_SVM__kernel,param_SVM__gamma,transform,param_SVM__coef0,param_SVM__degree,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_test_precision,split1_test_precision,split2_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_recall,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall
0,15.099708,0.126846,6.907343,0.044155,0.2,linear,,Bow,,,"{'SVM__C': 0.2, 'SVM__kernel': 'linear'}",0.947917,0.953646,0.960938,0.954167,0.005328,162,0.949084,0.954005,0.961360,0.954816,0.005045,168,0.931069,0.949588,0.953988,0.944882,0.009931,162,0.967809,0.958463,0.968847,0.965040,0.004670,93
1,12.427664,0.281993,5.696365,0.118526,2,linear,,Bow,,,"{'SVM__C': 2.0, 'SVM__kernel': 'linear'}",0.958854,0.961979,0.964063,0.961632,0.002140,26,0.958918,0.961920,0.964119,0.961652,0.002131,26,0.960417,0.966457,0.965625,0.964166,0.002673,19,0.957425,0.957425,0.962617,0.959155,0.002448,124
2,12.631106,0.177226,5.821412,0.017072,20,linear,,Bow,,,"{'SVM__C': 20.0, 'SVM__kernel': 'linear'}",0.958333,0.962500,0.963021,0.961285,0.002098,43,0.958290,0.962539,0.963117,0.961315,0.002152,49,0.962304,0.964546,0.963617,0.963489,0.000920,60,0.954309,0.960540,0.962617,0.959155,0.003530,124
3,12.597670,0.107651,5.810339,0.061359,200,linear,,Bow,,,"{'SVM__C': 200.0, 'SVM__kernel': 'linear'}",0.958333,0.962500,0.963021,0.961285,0.002098,43,0.958290,0.962539,0.963117,0.961315,0.002152,49,0.962304,0.964546,0.963617,0.963489,0.000920,60,0.954309,0.960540,0.962617,0.959155,0.003530,124
4,12.585069,0.113875,5.807990,0.064380,2000,linear,,Bow,,,"{'SVM__C': 2000.0, 'SVM__kernel': 'linear'}",0.958333,0.962500,0.963021,0.961285,0.002098,43,0.958290,0.962539,0.963117,0.961315,0.002152,49,0.962304,0.964546,0.963617,0.963489,0.000920,60,0.954309,0.960540,0.962617,0.959155,0.003530,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,4.055627,0.655584,0.030998,0.000357,20000,poly,0.001,NonLinguisticFeature,100,2,"{'C': 20000.0, 'coef0': 100.0, 'degree': 2, 'g...",0.935417,0.927083,0.930729,0.931076,0.003411,65,0.936148,0.926931,0.931969,0.931683,0.003768,65,0.928498,0.932773,0.919273,0.926848,0.005633,67,0.943925,0.921162,0.945021,0.936703,0.010998,59
104,120.922808,38.981368,0.037695,0.006664,20000,poly,0.01,NonLinguisticFeature,100,3,"{'C': 20000.0, 'coef0': 100.0, 'degree': 3, 'g...",0.938542,0.927604,0.935937,0.934028,0.004665,29,0.938797,0.927566,0.936500,0.934288,0.004844,36,0.937824,0.931937,0.932169,0.933977,0.002722,4,0.939772,0.923237,0.940871,0.934626,0.008066,78
105,14.974653,4.311661,0.033424,0.003073,20000,poly,0.001,NonLinguisticFeature,100,3,"{'C': 20000.0, 'coef0': 100.0, 'degree': 3, 'g...",0.936979,0.926562,0.935937,0.933160,0.004684,47,0.937468,0.926905,0.936891,0.933755,0.004849,45,0.933128,0.926425,0.926904,0.928819,0.003053,56,0.941848,0.927386,0.947095,0.938777,0.008334,47
106,230.358412,81.221869,0.046886,0.012758,20000,poly,0.01,NonLinguisticFeature,100,4,"{'C': 20000.0, 'coef0': 100.0, 'degree': 4, 'g...",0.942187,0.928125,0.932813,0.934375,0.005846,23,0.942872,0.928200,0.933402,0.934825,0.006074,22,0.934694,0.931106,0.929085,0.931629,0.002319,21,0.951194,0.925311,0.937759,0.938088,0.010569,51
