In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import string as str
import re
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, accuracy_score

In [None]:
# Location of the TripAdvisor hotel reviews CSV inside the Kaggle input directory.
REVIEWS_CSV_PATH = '/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
# Read the raw reviews into a DataFrame.
data = pd.read_csv(REVIEWS_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded DataFrame.
data.head()

**data cleaning**

In [None]:
# Start of data cleaning: pull the raw review text out of the DataFrame.
# `.values` yields a NumPy array of the 'Review' column (not a Python list).
reviews = data['Review'].values

In [None]:
# normalise every review to lower case (result is a plain Python list)
reviews = [
    text.lower()
    for text in reviews
]

In [None]:
# Spot-check: show the second review (a one-element slice) after lower-casing.
reviews[1:2]

In [None]:
# remove urls
# The pattern matches "<scheme>://<non-whitespace>" and replaces it with ''.
# It is written as a raw string because '\w' and '\S' are not valid Python
# string escapes; the non-raw literal triggers an invalid-escape warning on
# modern Python versions. Compiling once avoids re-parsing it per review.
URL_PATTERN = re.compile(r'\w+://\S+')
reviews = [URL_PATTERN.sub('', review) for review in reviews]

**tokenize the data**

In [None]:
# split every review into a list of word/punctuation tokens with NLTK
from nltk.tokenize import word_tokenize
reviews_tokens = list(map(word_tokenize, reviews))

In [None]:
# Build the vocabulary to filter out: NLTK English stop words plus punctuation characters.
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

# These words are taken out of the stop list so they stay in the reviews
# (presumably because they carry negation that matters for the rating).
for keep_word in ("no", "not", "don", "won"):
    stop_nltk.remove(keep_word)

stop_final = stop_nltk + stop_punct

In [None]:
# define a method to remove stop words and punctuations
def del_stop(sent, stop_words=None):
    """Return the tokens of ``sent`` that are not stop words or punctuation.

    Parameters
    ----------
    sent : iterable of hashable tokens
        One tokenized review.
    stop_words : iterable of tokens, optional
        Tokens to drop. When omitted, the module-level ``stop_final``
        collection is used, which keeps existing one-argument calls working.

    Returns
    -------
    list
        The tokens of ``sent`` in their original order, minus the blocked ones.
    """
    # Hash the blocked tokens once per call so each membership test is O(1)
    # instead of a linear scan over the whole stop list for every token.
    blocked = frozenset(stop_final if stop_words is None else stop_words)
    return [term for term in sent if term not in blocked]

In [None]:
# Filter stop words and punctuation out of every tokenized review.
reviews_clean = list(map(del_stop, reviews_tokens))

In [None]:
# joining back the words to sentences
# Each entry of `reviews_clean` (an iterable of tokens) is joined with single
# spaces, and the name is rebound to the resulting list of strings.
# NOTE(review): because the name is reused, re-running this cell on its own
# would join the characters of each already-joined string — confirm the cells
# are run once, in order.
reviews_clean = [" ".join(sent) for sent in reviews_clean]

In [None]:
# Store the cleaned review text as a 'new_reviews' column on `data` (mutates
# the DataFrame in place), then preview the result.
data['new_reviews'] = reviews_clean
data.head()

**split the data into train and test**

In [None]:
# Features are the cleaned review text; the target is the 'Rating' column.
X, y = data['new_reviews'], data['Rating']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Print the shape of each split, in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

**vectorize the data**

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at MAX_TFIDF_FEATURES terms
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

In [None]:
# Fit the TF-IDF vectorizer on the training text only, then apply that same
# fitted vectorizer to the test text (the test split is never used for fitting).
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Print the shapes of the vectorized train and test matrices, in that order.
for feature_matrix in (X_train_vec, X_test_vec):
    print(feature_matrix.shape)

**Model Building**

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Baseline random forest regressor: trees limited to depth 5, fixed seed.
rf = RandomForestRegressor(
    random_state=42,
    max_depth=5,
)

In [None]:

# Train `rf` on the vectorized training reviews and their ratings.
rf.fit(X_train_vec, y_train)

In [None]:
# Predict ratings for the held-out test reviews with the depth-5 forest.
rf_pred = rf.predict(X_test_vec)
print(rf_pred)
# `mean_squared_error(...) ** 0.5` is the ROOT mean squared error, so the label
# says so; arguments follow scikit-learn's (y_true, y_pred) order.
print('root mean squared error for random forest regressor with tree depth=5: ', mean_squared_error(y_test, rf_pred)**0.5)

**increase the depth of tree for random forest regressor**

In [None]:
# Same forest configuration as before, but trees may grow to depth 30.
rf_30 = RandomForestRegressor(
    random_state=42,
    max_depth=30,
)

In [None]:

# Train `rf_30` on the vectorized training reviews and their ratings.
rf_30.fit(X_train_vec, y_train)

In [None]:
# Predict ratings for the held-out test reviews with the depth-30 forest.
rf_30_pred = rf_30.predict(X_test_vec)
print(rf_30_pred)
# The value printed is MSE ** 0.5, i.e. the ROOT mean squared error, so the
# label says so; arguments follow scikit-learn's (y_true, y_pred) order.
print('root mean squared error for random forest regressor with tree depth=30: ', mean_squared_error(y_test, rf_30_pred)**0.5)

In [None]:
# Deeper forest: same seed, trees allowed to grow to depth 50; fit on the training split.
rf_50 = RandomForestRegressor(
    random_state=42,
    max_depth=50,
)
rf_50.fit(X_train_vec, y_train)

In [None]:
# Predict ratings for the held-out test reviews with the depth-50 forest.
rf_50_pred = rf_50.predict(X_test_vec)
print(rf_50_pred)
# The value printed is MSE ** 0.5, i.e. the ROOT mean squared error, so the
# label says so; arguments follow scikit-learn's (y_true, y_pred) order.
print('root mean squared error for random forest regressor with tree depth=50: ', mean_squared_error(y_test, rf_50_pred)**0.5)

In [None]:
# Smaller ensemble: 20 trees, each limited to depth 20, same seed; fit on the training split.
rf_n_est_20 = RandomForestRegressor(
    n_estimators=20,
    max_depth=20,
    random_state=42,
)
rf_n_est_20.fit(X_train_vec, y_train)

**we see that the root mean squared error improves by increasing the tree depth**

In [None]:
# Predict ratings for the held-out test reviews with the 20-tree, depth-20 forest.
rf_n_est_20_pred = rf_n_est_20.predict(X_test_vec)
print(rf_n_est_20_pred)
# The label previously said "tree depth=50" (copied from the cell above), but
# this model uses max_depth=20 and n_estimators=20. The value is MSE ** 0.5,
# i.e. the ROOT mean squared error; arguments are in (y_true, y_pred) order.
print('root mean squared error for random forest regressor with tree depth=20, n_estimators=20: ', mean_squared_error(y_test, rf_n_est_20_pred)**0.5)

**hyper parameter tuning using GridSearchCV**

In [None]:
# Parameter grid for the random forest grid search.
# `max_features`: number of features considered per split; 1.0 means "all
# features". It replaces the former "auto" option, which had the same meaning
# for RandomForestRegressor but is rejected by recent scikit-learn releases,
# so every candidate using it would fail to fit.
# `max_depth`: maximum depth allowed for each tree.
param_grid = {
    'max_features': [500, "sqrt", "log2", 1.0],
    'max_depth': [10, 15, 25]
}

In [None]:
# Base estimator for the grid search: only the seed is fixed here; the
# hyper-parameters under study are supplied by the search grid.
rf_new = RandomForestRegressor(random_state=42)

In [None]:
# Search over `param_grid` with 5-fold cross-validation, running jobs in
# parallel (n_jobs=-1) and scoring candidates by negative mean squared error.
grid_search = GridSearchCV(
    estimator=rf_new,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_vec, y_train)

In [None]:
# Predict test-set ratings through the fitted grid-search object.
# NOTE(review): `grid_search_pred` is not used again in the visible cells; a
# later cell predicts again via `grid_search.best_estimator_` — presumably the
# same values, confirm and consider keeping only one of the two.
grid_search_pred = grid_search.predict(X_test_vec)

In [None]:
# Display the estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Predict test-set ratings with the estimator chosen by the grid search.
grid_best_est_pred = grid_search.best_estimator_.predict(X_test_vec)
# The value printed is MSE ** 0.5, i.e. the ROOT mean squared error, so the
# label says so.
print('root mean squared error with hypertuning parameters: ', mean_squared_error(y_test, grid_best_est_pred ) ** 0.5)

**identify mismatches**

In [None]:
# Put each test review next to its true rating and the tuned model's prediction.
res_df = pd.DataFrame({'review':X_test, 'rating':y_test, 'rating_pred':grid_best_est_pred})
# A mismatch can be an under- OR an over-prediction, so filter on the absolute
# error; the signed difference alone only surfaced predictions that were too low.
res_df[(res_df.rating - res_df.rating_pred).abs() > 2]