In [166]:
# Importando as bibliotecas que serão utilizadas
import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport
from sklearn.impute import KNNImputer

from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split

In [150]:
# Load the dataset from a semicolon-delimited CSV file in the working directory.
# NOTE(review): a later cell replaces ',' with '.' in every column before casting to
# float, so the file presumably uses ',' as the decimal separator — confirm; if so,
# read_csv(..., decimal=',') could parse the numbers directly.
df_trip = pd.read_csv('tripadvisor_reviews.csv', sep=';')

In [151]:
# Remove the 'usuario' column, which is not needed for the analysis.
# (`columns=` already selects the axis, so no separate `axis` argument is required.)
df_trip = df_trip.drop(columns='usuario')

In [152]:
# Check the data type of each column
df_trip.dtypes

media_galerias_arte         object
media_baladas               object
media_loja_sucos            object
media_restaurantes          object
media_museus                object
media_resorts               object
media_parques               object
media_praias                object
media_teatros               object
media_templos_religiosos    object
dtype: object

In [153]:
# Replace the decimal comma with a dot in every column so the values can later be
# converted to numbers (an alternative would be to change the locale).
# regex=False makes this an explicit literal replacement: the default of `regex`
# differs between pandas versions, and leaving it implicit raises a FutureWarning
# on pandas 1.x for single-character patterns.
# Every column must still hold strings here; `.str` fails on numeric columns.
df_trip = df_trip.apply(lambda col: col.str.replace(',', '.', regex=False))

In [154]:
# Report how many rows and columns the dataset has
nRow = len(df_trip)
nCol = len(df_trip.columns)
print(f'There are {nRow} rows and {nCol} columns')

There are 980 rows and 10 columns


In [155]:
# Cast every column to a floating-point type; they were loaded as `object`
df_trip = df_trip.astype('float64')

In [156]:
# Confirm that the type conversion succeeded
df_trip.dtypes

media_galerias_arte         float64
media_baladas               float64
media_loja_sucos            float64
media_restaurantes          float64
media_museus                float64
media_resorts               float64
media_parques               float64
media_praias                float64
media_teatros               float64
media_templos_religiosos    float64
dtype: object

In [157]:
# Summary statistics for each column of the dataset
df_trip.describe()

Unnamed: 0,media_galerias_arte,media_baladas,media_loja_sucos,media_restaurantes,media_museus,media_resorts,media_parques,media_praias,media_teatros,media_templos_religiosos
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,0.893194,1.352612,1.013306,0.5325,0.939735,1.842898,3.180939,2.835061,1.569439,2.799224
std,0.326912,0.47828,0.788607,0.279731,0.43743,0.539538,0.007824,0.137505,0.364629,0.32138
min,0.34,0.0,0.13,0.15,0.06,0.14,3.16,2.42,0.74,2.14
25%,0.67,1.08,0.27,0.41,0.64,1.46,3.18,2.74,1.31,2.54
50%,0.83,1.28,0.82,0.5,0.9,1.8,3.18,2.82,1.54,2.78
75%,1.02,1.56,1.5725,0.58,1.2,2.2,3.18,2.91,1.76,3.04
max,3.22,3.64,3.62,3.44,3.3,3.76,3.21,3.39,3.17,3.66


In [158]:
# Show every row that contains at least one missing value
# (an empty result means the dataset has no nulls)
has_missing = df_trip.isna().any(axis='columns')
df_trip.loc[has_missing]

Unnamed: 0,media_galerias_arte,media_baladas,media_loja_sucos,media_restaurantes,media_museus,media_resorts,media_parques,media_praias,media_teatros,media_templos_religiosos


In [159]:
# Set up the training criteria: 'media_baladas' is the regression target and
# every other column is a feature.
features = df_trip.drop(columns='media_baladas')
target = df_trip['media_baladas']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=20,
)

# Train a support vector regressor with its default hyper-parameters
modelo_treinado = SVR()
modelo_treinado.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = modelo_treinado.predict(X_test)

In [160]:
# Evaluate the fitted SVR on the held-out split
# (for scikit-learn regressors, score() is the R² coefficient of determination)
modelo_treinado.score(X_test, y_test)

0.08870368685145003

In [123]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [124]:
# Train a regression model with XGBoost on the same train/test split
modelo_treinadoXGB = XGBRegressor(random_state=0)
modelo_treinadoXGB.fit(X_train, y_train)

# Predict on the held-out rows, then display the model's score on them
y_pred = modelo_treinadoXGB.predict(X_test)
modelo_treinadoXGB.score(X_test, y_test)



0.14975787725067824

In [125]:
# Train a regression model with LightGBM on the same train/test split
modelo_treinadoLGBM = LGBMRegressor(random_state=0)
modelo_treinadoLGBM.fit(X_train, y_train)

# Predict on the held-out rows, then display the model's score on them
y_pred = modelo_treinadoLGBM.predict(X_test)
modelo_treinadoLGBM.score(X_test, y_test)

0.05071547188561387