# Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
# Load the TripAdvisor hotel reviews CSV from the Kaggle input directory into a DataFrame.
# Later cells read its 'Review' (text) and 'Rating' columns.
data = pd.read_csv('../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics produced by DataFrame.describe().
data.describe()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Number of reviews for each distinct rating value.
data['Rating'].value_counts()

In [None]:
# Histogram of the rating values, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(data['Rating'], color='aqua')
plt.show()


In [None]:
# Preview the first rows of the frame.
data.head()

**Text Length**

In [None]:
# Character count of each review, stored as a new 'text length' column.
data['text length'] = data['Review'].map(len)
data.head()

# EDA

**Histograms: text length by star rating**

In [None]:
# FacetGrid is a figure-level object that creates its own figure, so no
# plt.figure() call is made here: it would only emit an extra, empty figure
# and would not affect the grid's size.
# One histogram of review length per rating value, laid out in columns.
g = sns.FacetGrid(data, col='Rating')
g.map(plt.hist, 'text length')

**Box plot : Rating/Text Length**

In [None]:
# Review length per rating, drawn on an explicitly created 10x6 Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Rating', y='text length', palette='rainbow', ax=ax)

**Count plot: Ratings**

In [None]:
# Bar chart of how many reviews fall under each rating value.
sns.countplot(x='Rating',data=data , palette='rainbow')

In [None]:
# Mean of every numeric column per rating. numeric_only=True skips the free-text
# 'Review' column; without it pandas >= 2.0 tries to average the strings and
# raises a TypeError. groupby().mean() already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
Ratings = data.groupby('Rating').mean(numeric_only=True)

In [None]:
# Display the per-rating means computed in the previous cell.
Ratings

In [None]:
from wordcloud import WordCloud ,STOPWORDS

In [None]:
def wordCloud_generator(data, title=None):
    """Render a word cloud built from a collection of text snippets.

    Parameters
    ----------
    data : pandas.Series or other iterable of str
        Texts to visualise. Missing values are dropped, the remaining entries
        are joined with single spaces and passed to ``WordCloud.generate``.
    title : str, optional
        Heading drawn above the cloud. When ``None`` no title is set.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Drop NaN entries so str.join does not fail on non-string values.
    texts = pd.Series(data).dropna().astype(str)
    wordcloud = WordCloud(width=800, height=800,
                          background_color='black',
                          min_font_size=10
                          ).generate(" ".join(texts))
    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    if title is not None:
        plt.title(title, fontsize=30)
    # Lay out only after the title exists so tight_layout reserves room for it.
    plt.tight_layout(pad=0)
    plt.show()


In [None]:
# Word cloud built from the text of every review in the 'Review' column.
wordCloud_generator(data['Review'], title="Most used words in reviews")

# Data Cleaning and Preprocessing

**Importing Relevant Libraries**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Re-check the frame's (rows, columns) before building the corpus.
data.shape

In [None]:
ps = PorterStemmer()
# Build the stop-word collection once, outside the loop: calling
# stopwords.words('english') for every token rebuilds the list each time,
# and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column itself rather than a hard-coded row count, so the
# corpus always has exactly one entry per row of `data`.
for raw_review in data['Review']:
    # Keep letters only, lowercase, and split on whitespace.
    tokens = non_letters.sub(' ', raw_review).lower().split()
    # Drop stop words and stem what remains.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

# Bag of Words Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features: the vectorizer is capped at 1000 features (max_features),
# and the sparse count matrix it returns is densified into a NumPy array.
cv = CountVectorizer(max_features=1000)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Target vector: the 'Rating' column, one label per review.
y = data['Rating']

# Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

#  Model Building & Evaluation

**You can either chain several classifiers in a single pipeline or, better, train and evaluate each classifier separately to get a clear picture of how each one performs.**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

**Logistic Regression Classifier**

In [None]:
# Fit logistic regression (iteration cap raised to 10005) on the training split
# and predict ratings for the held-out rows.
lr = LogisticRegression(max_iter=10005).fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Confusion matrix followed by the per-class report for the current y_pred.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

**Random Forest Classifier**

In [None]:
# Random forests are stochastic (bootstrap samples and random feature subsets).
# Seed the model with the same value used for the train/test split so the
# reported metrics are reproducible across runs.
rfc = RandomForestClassifier(random_state=666)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix followed by the per-class report for the current y_pred.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

**LGBM Classifier**

In [None]:
# Fit LightGBM with its default hyperparameters and predict the held-out ratings.
lgbm = LGBMClassifier().fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

In [None]:
# Confusion matrix followed by the per-class report for the current y_pred.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

**XGBoost Classifier**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require class labels to be 0..n_classes-1 and reject
# the raw rating values with "Invalid classes inferred from unique values of y".
# Encode the labels for training and decode the predictions, so y_pred is on
# the same scale as y_test.
label_encoder = LabelEncoder()
xgb = XGBClassifier()
xgb.fit(X_train, label_encoder.fit_transform(y_train))
y_pred = label_encoder.inverse_transform(xgb.predict(X_test))

In [None]:
# Confusion matrix followed by the per-class report for the current y_pred.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

# What's Next?


**These are just baseline models whose accuracy is only modest.
Here is what you can do to achieve better performance:**
* 1) Perform over-/under-sampling, since the ratings are heavily imbalanced toward the high end (rating = 5 has the highest count).
* 2) For CountVectorizer, use a larger `max_features` value, or better, add bi-grams and tri-grams (`ngram_range`) and tune accordingly.
* 3) Use TF-IDF weighting instead of raw counts.
* 4) Try a different model, or pipeline multiple models to compare them, and tune their hyperparameters.

# Upvote my work.
**If you came across this notebook and found it useful, please upvote it or, better, drop a comment. I would love to get in touch with y'all beautiful people and talk more about what data can do for us.**

**P.S. Haha, I know my documentation sucks xD**