In [223]:
# Importando as bibliotecas que serão utilizadas
import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier

from sklearn import preprocessing

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import *
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
set_config(display='diagram')



from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.metrics import plot_confusion_matrix, f1_score, make_scorer
from sklearn.model_selection import train_test_split

In [224]:
# importando o dataset e obtendo uma vista geral do mesmo
df_trip = pd.read_csv('tripadvisor_reviews.csv', sep=';')

In [225]:
# Substituindo virgulas por ponto para poder trabalhar com o dataset, outra opção seria alterar o local
df_trip = df_trip.apply(lambda x: x.str.replace(',','.'))

In [226]:
# Checando a tipagem do dos dados
df_trip.dtypes

usuario                     object
media_galerias_arte         object
media_baladas               object
media_loja_sucos            object
media_restaurantes          object
media_museus                object
media_resorts               object
media_parques               object
media_praias                object
media_teatros               object
media_templos_religiosos    object
dtype: object

In [227]:
# Alterar o tipo de dados, pois estavam como objeto
df_trip[["media_galerias_arte", "media_baladas", "media_loja_sucos","media_restaurantes","media_museus","media_resorts","media_parques","media_praias","media_teatros","media_templos_religiosos"]]=df_trip[["media_galerias_arte", "media_baladas", "media_loja_sucos","media_restaurantes","media_museus","media_resorts","media_parques","media_praias","media_teatros","media_templos_religiosos"]].apply(pd.to_numeric)

In [228]:
# Conferindo se foi realizada a alteração com sucesso
df_trip.dtypes

usuario                      object
media_galerias_arte         float64
media_baladas               float64
media_loja_sucos            float64
media_restaurantes          float64
media_museus                float64
media_resorts               float64
media_parques               float64
media_praias                float64
media_teatros               float64
media_templos_religiosos    float64
dtype: object

In [229]:
# Estabelecendo os critérios para treinamento
X_train, X_test, y_train, y_test = train_test_split(df_trip.drop('media_baladas', axis = 1),
                                                    df_trip['media_baladas'],
                                                    test_size = 0.25,
                                                    random_state=20)

In [230]:
# Apagando colunas que não são importantes
df_trip = df_trip.drop(columns=['usuario'], axis = 1)

In [231]:
# convertendo colunas tipo texto
encoder_df = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X_train)
X_train = encoder_df.transform(X_train)

encoders_df = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X_test)
X_test = encoders_df.transform(X_test)

In [232]:
# Verificando quantidade de linhas e colunas
nRow, nCol = df_trip.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 980 rows and 10 columns


In [233]:
# Algumas informações essenciais do dataset
df_trip.describe()

Unnamed: 0,media_galerias_arte,media_baladas,media_loja_sucos,media_restaurantes,media_museus,media_resorts,media_parques,media_praias,media_teatros,media_templos_religiosos
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,0.893194,1.352612,1.013306,0.5325,0.939735,1.842898,3.180939,2.835061,1.569439,2.799224
std,0.326912,0.47828,0.788607,0.279731,0.43743,0.539538,0.007824,0.137505,0.364629,0.32138
min,0.34,0.0,0.13,0.15,0.06,0.14,3.16,2.42,0.74,2.14
25%,0.67,1.08,0.27,0.41,0.64,1.46,3.18,2.74,1.31,2.54
50%,0.83,1.28,0.82,0.5,0.9,1.8,3.18,2.82,1.54,2.78
75%,1.02,1.56,1.5725,0.58,1.2,2.2,3.18,2.91,1.76,3.04
max,3.22,3.64,3.62,3.44,3.3,3.76,3.21,3.39,3.17,3.66


In [234]:
# Verificando se existem dados nulos
df_trip[pd.isna(df_trip).any(axis=1)]

Unnamed: 0,media_galerias_arte,media_baladas,media_loja_sucos,media_restaurantes,media_museus,media_resorts,media_parques,media_praias,media_teatros,media_templos_religiosos


In [235]:
modelo_treinado.score(X_test, y_test)

0.04406235411117421

In [236]:
# fazendo o treinamento, com método de regressão
modelo_treinado = SVR().fit(X_train, y_train)
y_pred = modelo_treinado.predict(X_test)

In [237]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [238]:
# fazendo o treinamento, com método de regressão, utilizando XGBoost
modelo_treinadoXGB = XGBRegressor(random_state=0).fit(X_train, y_train)

y_pred = modelo_treinadoXGB.predict(X_test)
modelo_treinadoXGB.score(X_test, y_test)



0.006775663793347597

In [239]:
# fazendo o treinamento, com método de regressão, utilizando LightGBM
modelo_treinadoLGBM = LGBMRegressor(random_state=0).fit(X_train, y_train)

y_pred = modelo_treinadoLGBM.predict(X_test)
modelo_treinadoLGBM.score(X_test, y_test)

-0.12110173464174445

In [240]:
df_trip.dtypes

media_galerias_arte         float64
media_baladas               float64
media_loja_sucos            float64
media_restaurantes          float64
media_museus                float64
media_resorts               float64
media_parques               float64
media_praias                float64
media_teatros               float64
media_templos_religiosos    float64
dtype: object

In [241]:
# Realizando a cópia para futuramente utilizar com Pipelines
df_trip_copy = df_trip.copy()

In [242]:
# Estabelecendo os critérios para treinamento
X_train, X_test, y_train, y_test = train_test_split(df_trip_copy.drop('media_baladas', axis = 1),
                                                    df_trip_copy['media_baladas'],
                                                    test_size = 0.25,
                                                    random_state=20)

In [243]:
# pipeline de machine learning utilizando método de regressão SVR

pipe = Pipeline([('encoder',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X_train)),
                 ('modelo',SVR())]).fit(X_train, y_train)

y_pred = pipe.predict(X_test)
display(f'Resultados de y_pred: {y_pred}')

display(f'Passos do pipeline: ')
pipe

'Resultados de y_pred: [1.34350764 1.63986468 1.15857143 1.1576136  1.330499   1.37321962\n 1.1366297  1.28572964 1.35533655 1.34274644 1.19683345 1.3129977\n 1.54747379 1.45274114 1.1637225  1.29193167 1.20604899 1.43525374\n 1.52747811 1.36501549 1.30901446 1.33361208 1.35810282 1.22055195\n 1.21118358 1.33785545 1.19586307 1.34154352 1.32269182 1.47883694\n 1.47508237 1.27483132 1.15633688 1.37514916 1.40096613 1.47598204\n 1.19595311 1.13722099 1.28885328 1.25058748 1.37530216 1.40690525\n 1.47625633 1.23495971 0.82759907 1.34101317 1.80173822 1.26360305\n 1.63580639 1.50122328 1.32171395 1.33968758 1.64316422 1.22468002\n 1.3351194  1.27015047 1.37658684 0.73570231 1.32880954 1.42481935\n 1.25214855 1.47663935 1.40821496 1.31532237 1.10538719 1.14086289\n 1.71849374 1.23699214 1.29039353 1.47075316 1.2373448  1.24896778\n 1.27711303 1.23079426 1.41094529 1.27581986 1.00430316 1.25788724\n 1.40307961 1.09225554 1.10996234 1.13906715 1.47663935 1.25432829\n 1.28441891 1.52084051 1.3

'Passos do pipeline: '

In [244]:
# pipeline de machine learning utilizando método de regressão LightGBM
pipeLGBM = Pipeline([('encoder',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X_train)),
                 ('modelo',LGBMRegressor(random_state=0))]).fit(X_train, y_train)

y_predLGBM = pipeLGBM.predict(X_test)
display(f'Resultados de y_pred: {y_predLGBM}')

display(f'Passos do pipeline: ')
pipeLGBM

'Resultados de y_pred: [1.42438732 1.47717391 1.55087851 1.06265341 1.53275038 1.70177006\n 0.99048047 1.53344704 1.2093025  1.63206675 1.14000724 1.23679332\n 1.45118632 1.64201557 1.17678651 1.37938549 1.57681284 1.51854901\n 1.75470199 1.31341583 1.69619987 1.43205167 1.45471356 1.25974604\n 1.33029444 1.39797726 1.21682071 1.14824011 1.27631647 1.64775581\n 1.79194122 1.00189586 1.38081661 1.09095302 1.01493454 1.90021084\n 1.02707344 1.60646353 1.02671736 1.05794219 1.60072375 1.40974962\n 1.37710739 1.57090905 0.94025377 1.53780961 1.77604307 1.59568661\n 1.6826068  1.44762516 1.27037644 1.77825115 2.03181973 1.37208239\n 1.44770891 1.3273409  1.53289163 0.69319317 1.51236356 1.39130823\n 1.2955316  1.49155002 1.84448025 1.20488327 1.09444679 1.25142222\n 2.07833965 1.30591309 1.15087636 1.8418161  1.3202282  1.26446131\n 1.91657459 1.12772569 1.53530974 1.50144471 0.98353128 1.48346104\n 1.36635249 0.9593392  0.96571314 1.49478271 1.49155002 1.20751612\n 1.14714315 1.64880993 1.

'Passos do pipeline: '

In [245]:
# pipeline de machine learning utilizando método de regressão XGBoost
pipeXGB = Pipeline([('encoder',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X_train)),
                 ('modelo',XGBRegressor(random_state=0))]).fit(X_train, y_train)

y_predXGB = pipeXGB.predict(X_test)
display(f'Resultados de y_pred: {y_predXGB}')

display(f'Passos do pipeline: ')
pipeXGB



'Resultados de y_pred: [1.3927739  1.3666604  1.4729316  1.1866522  1.124841   1.3874257\n 1.232674   1.4008719  1.378073   1.3686798  1.0950109  1.0233557\n 1.7402517  1.483839   1.208427   1.3798251  1.3453659  1.4284309\n 1.6330906  1.4134574  1.4412712  1.4024365  1.5054845  1.1983298\n 1.2850056  1.3761592  1.3091954  1.2975625  1.5865403  1.4492383\n 1.4021163  1.172125   1.2529316  1.3629118  1.1333883  1.614063\n 1.3358885  1.3083665  1.3085473  1.2784929  1.4974785  1.3308506\n 1.3985131  1.2718908  0.97000283 1.306051   1.9132965  1.2941861\n 1.5125751  1.4096992  1.5666869  1.5228467  1.8659455  1.3447509\n 1.3307532  1.2668275  1.525333   0.9383223  1.4643165  1.3911098\n 1.1837276  1.2702774  1.5689962  1.2541311  1.1702602  1.2313524\n 1.7419174  1.4513795  1.1845527  1.6991892  1.2249949  1.4097784\n 1.7791533  1.2704467  1.394783   1.3341757  1.2093046  1.349259\n 1.5494936  1.2905626  1.2245396  1.1620507  1.2702774  1.3964088\n 1.3901181  1.2523282  1.4949766  1.24538

'Passos do pipeline: '

In [246]:
# Demonstração de resultados
display(f"Resultados para o primeiro pipeline: {pipe.score(X_test, y_test)}")
display(f"Resultados para o terceiro pipeline: {pipeLGBM.score(X_test, y_test)}")
display(f"Resultados para o terceiro pipeline: {pipeXGB.score(X_test, y_test)}")


'Resultados para o primeiro pipeline: -0.012219521741097195'

'Resultados para o terceiro pipeline: -0.06500319308878133'

'Resultados para o terceiro pipeline: -0.002167916873679898'