In [80]:
import warnings
import re
import pickle

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from nltk.stem.snowball import PortugueseStemmer

In [20]:
# importando o arquivo csv e lendo com o PANDAS
df_path   = "../data/raw/olist_order_reviews_dataset.csv"
used_cols = ['review_score', 'review_comment_message']


In [33]:
df = (
    pd
    .read_csv(df_path,usecols = used_cols)
    .dropna()# Remoção de linhas sem informação
    .reset_index()#
    .drop(columns=['index'], axis=1)
    .query('review_score != 3')
)

df['review_score'] = df['review_score'].apply(lambda x: 0 if x <3 else 1)

In [43]:
df['review_score'].value_counts()

1    26530
0    10890
Name: review_score, dtype: int64

In [44]:
stemmer   = PortugueseStemmer()
stop_words = stopwords.words('portuguese')

In [71]:
analyzer = TfidfVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = TfidfVectorizer(analyzer=stemmed_words,stop_words=stop_words,max_features=3700)

In [82]:
models = {
    'lr'   : LogisticRegression(max_iter=1000),
    'dtree': DecisionTreeClassifier(max_depth=100),
    'svc'  : SVC(max_iter=1001),
    'gnb'  : GaussianNB()
}

In [72]:
transformer = Pipeline(
    steps= [
        ('stemmer_tfidf', stem_vectorizer)
    ]
)

In [47]:
clf = LogisticRegression()
model =  Pipeline(steps=[('preprocessor',transformer), ('clf',clf)])

In [59]:
X = df['review_comment_message'].to_list()
y = df[['review_score']].values[:,0]#.to_list()

In [83]:
{model_name:None for model_name in models}

{'lr': None, 'dtree': None, 'svc': None, 'gnb': None}

In [84]:
results = {model_name:None for model_name in models}
for name, clf in models.items():
    model  =  Pipeline(steps=[('preprocessor',transformer), ('clf',clf)])
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=10)
    results[name] = scores

Traceback (most recent call last):
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 207, in fit
    X, y = self._validate_data(X, y)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py",

Traceback (most recent call last):
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 207, in fit
    X, y = self._validate_data(X, y)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py",

Traceback (most recent call last):
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 207, in fit
    X, y = self._validate_data(X, y)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\Trisna\anaconda3\lib\site-packages\sklearn\utils\validation.py",

In [87]:
for model_name, result in results.items():
    print(f'{model_name}: {result.mean()}')

lr: 0.9292624265098878
dtree: 0.8898984500267237
svc: 0.8946285408872262
gnb: nan


In [88]:
model = Pipeline(steps=[('preprocessor',transformer), ('clf',models['lr'])])

In [89]:
model.fit(X,y)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('stemmer_tfidf',
                                  TfidfVectorizer(analyzer=<function stemmed_words at 0x0000023E919CC040>,
                                                  max_features=3700,
                                                  stop_words=['de', 'a', 'o',
                                                              'que', 'e', 'é',
                                                              'do', 'da', 'em',
                                                              'um', 'para',
                                                              'com', 'não',
                                                              'uma', 'os', 'no',
                                                              'se', 'na', 'por',
                                                              'mais', 'as',
                                                              'dos', 'como',
                                    

In [90]:
pickle.dump(model,open('lr_model.pkl','wb'))