In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data 

In [None]:
df= pd.read_csv('/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')
df.head()

In [None]:
df.isna().sum()

In [None]:
df.shape

# Some Analysis

In [None]:
sns.countplot(x= df['Rating'])

In [None]:
# Number of whitespace-separated words in each review, computed with the
# vectorised string accessor instead of a per-row Python lambda. A missing
# review yields NaN here rather than raising AttributeError.
df['Word_count'] = df['Review'].str.split().str.len()

In [None]:
df.head()

In [None]:
sns.lineplot(x='Rating', data=df, y='Word_count')

**Insight**
* Higher-rated reviews tend to have fewer words, while lower-rated reviews tend to have a much higher word count.

In [None]:
from textblob import TextBlob

In [None]:
def polarity(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: The review text to analyse.

    Returns:
        The ``polarity`` attribute of ``TextBlob(text).sentiment``.
    """
    return TextBlob(text).sentiment.polarity

def subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``.

    Args:
        text: The review text to analyse.

    Returns:
        The ``subjectivity`` attribute of ``TextBlob(text).sentiment``.
    """
    return TextBlob(text).sentiment.subjectivity

In [None]:
# Analyse each review with TextBlob once and read both scores from the same
# sentiment result, instead of building a second TextBlob per review.
review_sentiments = [TextBlob(review_text).sentiment for review_text in df['Review']]
df['Polarity'] = [review_sentiment.polarity for review_sentiment in review_sentiments]
df['Subjectivity'] = [review_sentiment.subjectivity for review_sentiment in review_sentiments]
df

In [None]:
# Histogram on a density scale with a KDE overlay; this replaces the
# deprecated sns.distplot call with its histplot equivalent.
sns.histplot(df['Polarity'], kde=True, stat='density')

In [None]:
# Histogram on a density scale with a KDE overlay; this replaces the
# deprecated sns.distplot call with its histplot equivalent.
sns.histplot(df['Subjectivity'], kde=True, stat='density')

In [None]:
sns.boxplot(x='Rating',y='Polarity', data=df, whis=2.5, fliersize= 5)

In [None]:
sns.boxplot(x='Rating',y='Subjectivity', data=df, whis=2 )

# Preprocessing the Data using SpaCy

In [None]:
import spacy

In [None]:
# Only lemmas are read from the pipeline in this notebook, so the dependency
# parser and the named-entity recogniser are switched off to avoid running
# them on every review.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
def preprocess(text):
    """Lower-case, lemmatize and filter a review into a cleaned string.

    Args:
        text: Raw review text.

    Returns:
        A single space-separated string of the lemmas that are purely
        alphabetic and are not in the pipeline's stop-word list.
    """
    # Take the stop words from the loaded pipeline itself rather than reaching
    # into ``spacy.lang.en.stop_words``, which is only reachable as an
    # attribute once that submodule happens to have been imported.
    stop_words = nlp.Defaults.stop_words
    doc = nlp(text.lower())
    kept_lemmas = [
        token.lemma_
        for token in doc
        if token.lemma_ not in stop_words and token.lemma_.isalpha()
    ]
    return " ".join(kept_lemmas)

In [None]:
df['Review_new']= df['Review'].apply(preprocess)

In [None]:
df

In [None]:
# Concatenate every cleaned review into one space-separated string
# and preview its first 2000 characters.
rev = " ".join(df['Review_new'])
rev[:2000]

# WordCloud of the 200 Most Frequently Occurring Words

In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(15,10))
# Render a cloud of at most 200 words from the concatenated cleaned reviews.
wc= WordCloud(max_words=200,height= 800, width=1000 ,background_color='black').generate(rev)
plt.imshow(wc)
# The axes would only show pixel coordinates of the image, so hide them.
plt.axis('off')

In [None]:
def sentiment(review):
    """Map a numeric rating to a binary sentiment label.

    Despite its name, ``review`` is the value compared against 3; the
    notebook applies this function to the ``Rating`` column.

    Returns:
        1 when ``review`` is greater than or equal to 3, otherwise 0.
    """
    return 1 if review >= 3 else 0
df['Sentiment']= df['Rating'].apply(sentiment)

In [None]:
df

# Creating our Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk import word_tokenize

In [None]:
X= df['Review_new']
y= df['Sentiment']

# Hold out 25% of the reviews for testing, stratified on the label so both
# splits keep the same class proportions. The fixed random_state makes the
# split, and every score computed from it, identical on each run.
X_train, X_test,y_train, y_test= train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [None]:
X_train

In [None]:
y_train

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1,2)), capped at
# 10000 features and tokenised with NLTK's word_tokenize.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# available in the environment — confirm.
tfidf= TfidfVectorizer(max_features=10000, tokenizer= word_tokenize,ngram_range=(1,2) )
# Fit the vectoriser on the training reviews only ...
X_train_transformed= tfidf.fit_transform(X_train.values)
# ... and only transform the test reviews with that already-fitted vectoriser.
X_test_transformed= tfidf.transform(X_test.values)

In [None]:
X_train_transformed.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its randomised training, and therefore the accuracy
# reported below, is the same on every run.
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_transformed, y_train)
y_pred= rfc.predict(X_test_transformed)

# Mean accuracy on the held-out test features.
rfc.score(X_test_transformed, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Raise the iteration cap above the default of 100: warnings are silenced
# globally at the top of the notebook, so a solver that stopped before
# converging would otherwise go unnoticed.
lr= LogisticRegression(max_iter=1000)
lr.fit(X_train_transformed, y_train)
y_pred= lr.predict(X_test_transformed)

# Mean accuracy on the held-out test features.
lr.score(X_test_transformed, y_test)