In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import word_tokenize
import nltk
from nltk import SnowballStemmer
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from unidecode import unidecode
import json
import string
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

%matplotlib inline

##### Obtenção dos dados

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/prof-renato/data/main/humor_detection.csv')
df.dropna(inplace=True)

### Tratando textos

##### Importando um JSON com uma lista de palavras a serem substituidas

In [3]:
with open('contraction_map.json') as file:
    contration_map = json.load(file)

In [4]:
def apply_stemming(list_tokens, stemmer):
    return [stemmer.stem(token) for token in list_tokens]

In [5]:
def removeWords(listTokens, listWords):
    return [token for token in listTokens if token not in listWords]

In [6]:
def contration_words(text):
    for k, v in contration_map.items():
        text = text.replace(k, v)
    return text
    

In [7]:
def textProcess(text):
    stopWords = nltk.corpus.stopwords.words('english')

    text = text.replace(u'\ufffd', '8')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.rstrip('\n')
    text = contration_words(text)

    listTokens = word_tokenize(text)
    listTokens= removeWords(listTokens, stopWords)

    text = " ".join(listTokens)
    text = unidecode(text)
    return text

In [8]:
random_forest_pipe = Pipeline([
    ('CountVectorizer', CountVectorizer(analyzer=textProcess)),
    ('TDFID', TfidfTransformer()),
    ('RandomForest', RandomForestClassifier())
])

multinomialNB_pipe = Pipeline([
    ('CountVectorizer', CountVectorizer(analyzer=textProcess)),
    ('TDFID', TfidfTransformer()),
    ('RandomForest', MultinomialNB())
])

logistic_regression_pipe = Pipeline([
    ('CountVectorizer', CountVectorizer(analyzer=textProcess)),
    ('TDFID', TfidfTransformer()),
    ('RandomForest', LogisticRegression())
])

In [9]:
df['humor'] = pd.to_numeric(df['humor'])

In [10]:
X = df['text']
y = df['humor']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


##### Utilizando o cross validantion

In [11]:
logistic_regression_model = cross_validate(logistic_regression_pipe, X, y, scoring=['accuracy', 'f1', 'roc_auc'], return_train_score=True)
random_forest_model = cross_validate(random_forest_pipe, X, y, scoring=['accuracy', 'f1', 'roc_auc'], return_train_score=True)
multinomialnb_model = cross_validate(multinomialNB_pipe, X, y, scoring=['accuracy', 'f1', 'roc_auc'], return_train_score=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [25]:
resultado = pd.DataFrame({
    'labels': ['accuracy', 'f1', 'roc_auc'],
    'logistic': [logistic_regression_model['test_accuracy'].max(), logistic_regression_model['test_f1'].max(), logistic_regression_model['test_roc_auc'].max()],
    'random_forest': [random_forest_model['test_accuracy'].max(), random_forest_model['test_f1'].max(), random_forest_model['test_roc_auc'].max()],
    'multinomialNB_forest': [multinomialnb_model['test_accuracy'].max(), multinomialnb_model['test_f1'].max(), multinomialnb_model['test_roc_auc'].max()]
}).set_index('labels')
resultado.index.name=None
resultado = resultado.transpose()    
resultado.style.applymap(lambda x: 'background-color: lightgreen' if x >= 0.90 else '')

Unnamed: 0,accuracy,f1,roc_auc
logistic,0.792316,0.781166,0.861245
random_forest,0.809518,0.798332,0.882209
multinomialNB_forest,0.760297,0.746542,0.814479
