<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Libraries And Utilities</h3>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
from wordcloud import WordCloud,STOPWORDS
stopwords = list(STOPWORDS)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score 
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer as CVTZ
def set_seed(seed=31415):
    """Seed NumPy's global random generator and export seed-related env vars.

    Parameters
    ----------
    seed : int, default 31415
        Value passed to ``np.random.seed`` and written (as text) to
        ``PYTHONHASHSEED``.

    Side effects
    ------------
    Sets ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` in ``os.environ``.
    """
    np.random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
set_seed()

def RMSE(Y,YHAT):
    """Return the root-mean-squared error between ``Y`` and ``YHAT``.

    The previous implementation called ``mean_squared_error``, which is never
    imported in this notebook, so every call raised ``NameError``. The metric
    is computed directly with NumPy instead.

    Parameters
    ----------
    Y : array-like
        Ground-truth values.
    YHAT : array-like
        Predicted values; must have the same shape as ``Y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((Y - YHAT) ** 2))`` taken over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(Y, dtype=float)
    y_pred = np.asarray(YHAT, dtype=float)
    if y_true.shape != y_pred.shape:
        # Refuse silent broadcasting: mismatched inputs are a caller error.
        raise ValueError(
            'Y and YHAT must have the same shape, got {} and {}'.format(y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError('RMSE is undefined for empty inputs')
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

# Default size (20 x 11) for every matplotlib figure that does not set its own figsize.
plt.rc('figure',figsize=(20,11))

<a id="1.2"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Data Importation And Missing Value Assessment</h3>


In [None]:
# Location of the reviews CSV inside the Kaggle input directory.
REVIEWS_CSV_PATH = '/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'

# Read the raw reviews into the working frame and preview the first three rows.
t_data = pd.read_csv(REVIEWS_CSV_PATH)
t_data.head(3)

In [None]:
# Number of missing values per column, shown as an annotated one-column heatmap.
missing_counts = t_data.isna().sum().to_frame()
sns.heatmap(missing_counts, annot=True, cmap='mako')
plt.xlabel('Amount Missing', fontsize=15)
plt.show()

<a id="1.3"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Feature Engineering And Preprocessing</h3>


### Remove Stopwords From Reviews

In [None]:
def remove_stop_words(sir):
    """Return ``sir`` with every stop word removed.

    The text is split on single spaces, tokens that exactly match an entry of
    the module-level ``stopwords`` list are dropped (the comparison is
    case-sensitive), and the remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    sir : str
        Review text to filter.

    Returns
    -------
    str
        The filtered text.
    """
    # Build a set once per call: membership tests against the original list
    # scanned the whole list for every token of every review.
    stop_set = set(stopwords)
    return ' '.join(word for word in sir.split(' ') if word not in stop_set)

# Overwrite the Review column with its stop-word-filtered text; every later cell
# therefore works on the filtered reviews, not the raw ones.
t_data.Review = t_data.Review.apply(remove_stop_words)

In [None]:
# Single shared VADER analyzer instance, reused by the sentiment helpers defined in this cell.
sid = SentimentIntensityAnalyzer()

def get_char_count(sir):
    """Return the length of ``sir`` (number of characters, spaces included)."""
    char_total = len(sir)
    return char_total
def get_word_count(sir):
    """Return how many pieces ``sir`` yields when split on single spaces.

    Consecutive spaces produce empty pieces that are counted too, and an
    empty string counts as one (empty) piece.
    """
    tokens = sir.split(' ')
    return len(tokens)
def get_average_word_length(sir):
    """Return the mean length of the space-separated pieces of ``sir``.

    Pieces come from ``sir.split(' ')``, so empty pieces created by repeated
    spaces count as zero-length words; an empty string yields ``0.0``.
    """
    tokens = sir.split(' ')
    total_chars = sum(len(token) for token in tokens)
    return total_chars / len(tokens)
def get_pos_sentiment(sir):
    """Return the ``'pos'`` entry of ``sid.polarity_scores(sir)``."""
    return sid.polarity_scores(sir)['pos']
def get_neg_sentiment(sir):
    """Return the ``'neg'`` entry of ``sid.polarity_scores(sir)``."""
    return sid.polarity_scores(sir)['neg']
def get_neu_sentiment(sir):
    """Return the ``'neu'`` entry of ``sid.polarity_scores(sir)``."""
    return sid.polarity_scores(sir)['neu']

In [None]:
# Length-based features computed from the (stop-word-filtered) review text.
t_data['Char_Count'] = t_data.Review.apply(get_char_count)
t_data['Word_Count'] = t_data.Review.apply(get_word_count)
t_data['Average_Word_Length'] = t_data.Review.apply(get_average_word_length)

# Score every review exactly once and reuse the resulting dict for all three
# sentiment columns; calling the three get_*_sentiment helpers separately ran
# the analyzer three times per review.
polarity_scores = t_data.Review.apply(sid.polarity_scores)
t_data['Positive_Sentiment'] = polarity_scores.apply(lambda scores: scores['pos'])
t_data['Negative_Sentiment'] = polarity_scores.apply(lambda scores: scores['neg'])
t_data['Neutral_Sentiment'] = polarity_scores.apply(lambda scores: scores['neu'])

<a id="1.4"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Exploratory Data Analysis</h3>


In [None]:
# Lower-case every review and join them with a space. Concatenating without a
# separator could fuse the last word of one review with the first word of the
# next, and repeated `+=` on a growing string is quadratic in the corpus size.
word_list = ' '.join(review.lower() for review in t_data.Review)

# Word cloud over the whole corpus, ignoring the same stop-word list used for filtering.
wordcloud = WordCloud(width=800,height=800,background_color='white',stopwords=stopwords,min_font_size=5).generate(word_list)
plt.figure(figsize = (25, 15), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
# Notched box plot of the positive sentiment score for each rating value.
ex.box(t_data,x='Rating',y='Positive_Sentiment',notched=True,title='Rating Positive Sentiment Distributions')

### We see that the higher the average positive sentiment the higher the rating

In [None]:
# Notched box plot of the negative sentiment score for each rating value.
# The title previously said "Positive" (copied from the cell above) although
# the y-axis shows Negative_Sentiment.
ex.box(t_data,x='Rating',y='Negative_Sentiment',notched=True,title='Rating Negative Sentiment Distributions')

### And not surprisingly the higher the negative sentiment the lower the rating 

In [None]:
# Pairwise relationship plots across the columns of t_data (seaborn pairplot).
sns.pairplot(t_data)

In [None]:
# Joint KDE plot (20 contour levels) of average word length against positive sentiment.
sns.jointplot(x=t_data['Average_Word_Length'],y=t_data['Positive_Sentiment'],height=15,kind='kde',levels=20)

In [None]:
# Shape statistics (skewness and kurtosis) of the average word length column.
avg_word_len = t_data['Average_Word_Length']
print('Average_Word_Length Skew: ',avg_word_len.skew(),"  Average_Word_Length Kurtosis",avg_word_len.kurt())

In [None]:
# Central-tendency statistics of the average word length column; mode()[0] takes
# the first value returned by Series.mode().
avg_word_len = t_data['Average_Word_Length']
print('Average_Word_Length Mean: ',avg_word_len.mean(),"  Average_Word_Length Median",avg_word_len.median(),' Average_Word_Length Mode : ',avg_word_len.mode()[0])

### The Average_Word_Length Is Approximately Normally Distributed

In [None]:
# describe() summary extended with two extra rows: per-column skewness and kurtosis.
# numeric_only=True keeps the free-text Review column out of the moment
# calculations; without it, recent pandas versions raise on the string column
# instead of silently dropping it.
data_info = t_data.describe()
data_info.loc['skew'] = t_data.skew(numeric_only=True)
data_info.loc['kurt'] = t_data.kurt(numeric_only=True)
data_info

In [None]:
# Work on a copy so the labelling column never leaks into t_data.
tout_l = t_data.copy()

# A row is flagged when it has more than 1000 words or a neutral sentiment
# score below 0.25 or above 0.98.
is_outlier = (
    (tout_l['Word_Count'] > 1000)
    | (tout_l['Neutral_Sentiment'] < 0.25)
    | (tout_l['Neutral_Sentiment'] > 0.98)
)
tout_l['OLL'] = 'Normal'
tout_l.loc[is_outlier, 'OLL'] = 'Outlier'

ex.scatter_3d(tout_l,x='Rating',y='Neutral_Sentiment',z='Word_Count',color='OLL')

### Outlier Removal

In [None]:
# Keep exactly the rows that the outlier plot labels 'Normal': neutral sentiment
# within [0.25, 0.98] and at most 1000 words. The comparisons are inclusive so
# rows sitting exactly on a threshold (shown as 'Normal' in the plot, which
# flags only values strictly beyond the thresholds) are no longer dropped.
keep_rows = (
    (t_data['Neutral_Sentiment'] >= 0.25)
    & (t_data['Neutral_Sentiment'] <= 0.98)
    & (t_data['Word_Count'] <= 1000)
)
t_data = t_data[keep_rows]

In [None]:
# Pearson correlations between the numeric columns only. The text column Review
# must be excluded explicitly: recent pandas versions raise when corr() meets a
# non-numeric column instead of silently dropping it.
cors = t_data.select_dtypes(include='number').corr('pearson')
plt.figure(figsize=(20,13))
sns.heatmap(cors,annot=True,cmap='mako')

In [None]:
# Preview the first three rows of the frame after feature engineering and outlier removal.
t_data.head(3)

<a id="1.5"></a>
<h3 style="background-color:skyblue;font-family:newtimeroman;font-size:200%;text-align:center">Model Selection And Evaluation</h3>


### First Approach

In [None]:
# Split (train_test_split default proportions) using only the two sentiment
# scores and the average word length as features; Rating is the target.
train_x,test_x,train_y,test_y = train_test_split(t_data[['Positive_Sentiment','Negative_Sentiment','Average_Word_Length']],t_data['Rating'])

GN_Pipe = Pipeline(steps=[('model',GaussianNB())])
GN_Pipe.fit(train_x,train_y)
GN_predictions= GN_Pipe.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are the true ratings and
# columns the predicted ratings. The arguments were previously swapped, which
# transposed the matrix relative to that convention.
cfm = confusion_matrix(test_y,GN_predictions)

plt.figure(figsize=(20,13))
plt.title('Naive Bayes Confusion Matrix',fontsize=20)
sns.heatmap(cfm,annot=True,cmap='mako',fmt='d',xticklabels=[1,2,3,4,5],yticklabels=[1,2,3,4,5])
# Label the axes so the orientation of the matrix is unambiguous.
plt.xlabel('Predicted Rating',fontsize=15)
plt.ylabel('True Rating',fontsize=15)
plt.show()

In [None]:
# Accuracy of the Naive Bayes predictions from the previous cell. The cell used
# to reference LR_predictions, a name that is never defined in this notebook.
print('accuracy: ',accuracy_score (test_y,GN_predictions))

In [None]:
# Decision tree trained on the same train/test split as the Naive Bayes model above.
DT_Pipe = Pipeline(steps=[('model',DecisionTreeClassifier())])
DT_Pipe.fit(train_x,train_y)
predictions= DT_Pipe.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are the true ratings and
# columns the predicted ratings. The arguments were previously swapped.
cfm = confusion_matrix(test_y,predictions)

plt.figure(figsize=(20,13))
plt.title('Decision Tree Confusion Matrix',fontsize=20)
sns.heatmap(cfm,annot=True,cmap='mako',fmt='d',xticklabels=[1,2,3,4,5],yticklabels=[1,2,3,4,5])
# Label the axes so the orientation of the matrix is unambiguous.
plt.xlabel('Predicted Rating',fontsize=15)
plt.ylabel('True Rating',fontsize=15)
plt.show()

In [None]:
# Accuracy of the decision-tree predictions computed in the previous cell.
print('accuracy: ',accuracy_score (predictions,test_y))

### So far we see that sentiment scores and basic text features alone give no significant results. We will now try a second approach: vectorize the review text, reduce its dimensionality, and use our Naive Bayes model again to predict the rating from the reduced version of the vectorized text.

In [None]:
# Number of SVD components kept from the bag-of-words matrix.
N_COMPONENTS = 900

# Count-vectorize the reviews, then project the term-count matrix onto
# N_COMPONENTS dimensions with a truncated SVD.
tf_model = CVTZ()
svd_model = TruncatedSVD(n_components = N_COMPONENTS)
desc_matrix = tf_model.fit_transform(t_data.Review)
trunc_matrix = svd_model.fit_transform(desc_matrix)

# Per-component and cumulative explained variance ratios of the fitted SVD.
evr = svd_model.explained_variance_ratio_
evr_cs = np.cumsum(evr)
component_axis = np.arange(0,len(evr_cs))
tr1 = go.Scatter(x=component_axis,y=evr_cs,name='Explained Variance Cumulative')
tr2 = go.Scatter(x=component_axis,y=evr,name='Explained Variance')

plot_layout = dict(
    title='Explained Variance Ratio Using {} Components'.format(N_COMPONENTS),
    xaxis_title='Number Of Components',
    yaxis_title='Explained Variance Ratio',
)
fig = go.Figure(data=[tr1,tr2],layout=plot_layout)

fig.show()


In [None]:
# Wrap the reduced matrix in a frame with one PC_<i> column per component.
# The column count is taken from the matrix itself rather than a hard-coded
# 900, so the cell keeps working when the number of SVD components changes.
dec_df = pd.DataFrame(trunc_matrix,columns=['PC_{}'.format(i) for i in range(trunc_matrix.shape[1])])
dec_df.head(3)

In [None]:
# Split the SVD-reduced features against the ratings. dec_df was built from
# t_data.Review, so its rows are in the same order as t_data's rows.
train_x,test_x,train_y,test_y = train_test_split(dec_df,t_data['Rating'])

GN_Pipe = Pipeline(steps=[('model',GaussianNB())])
GN_Pipe.fit(train_x,train_y)
GN_predictions= GN_Pipe.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are the true ratings and
# columns the predicted ratings. The arguments were previously swapped.
cfm = confusion_matrix(test_y,GN_predictions)

plt.figure(figsize=(20,13))
plt.title('Naive Bayes Confusion Matrix',fontsize=20)
sns.heatmap(cfm,annot=True,cmap='mako',fmt='d',xticklabels=[1,2,3,4,5],yticklabels=[1,2,3,4,5])
# Label the axes so the orientation of the matrix is unambiguous.
plt.xlabel('Predicted Rating',fontsize=15)
plt.ylabel('True Rating',fontsize=15)
plt.show()

In [None]:
# Accuracy of the Naive Bayes model trained on the SVD-reduced text features.
print('accuracy: ',accuracy_score (GN_predictions,test_y))