In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Preprocessing

### read data

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# First look at the raw reviews file, decoded as latin-1.
data = pd.read_csv(
    "../input/disneyland-reviews/DisneylandReviews.csv",
    encoding="latin-1",
)

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.info()

### Treat 'missing' as NaN and split 'Year_Month' into 'Year' and 'Month'

In [None]:
# Reload the file, this time parsing the literal string 'missing' as NaN.
data = pd.read_csv(
    "../input/disneyland-reviews/DisneylandReviews.csv",
    na_values="missing",
    encoding="latin-1",
)

In [None]:
data.info()

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna()

In [None]:
data.info()

In [None]:
# Split 'Year_Month' at the first '-' into two string columns.
date = data["Year_Month"].str.split("-", n=1, expand=True)
data = data.assign(Year=date[0], Month=date[1])

In [None]:
data

In [None]:
# The combined column is redundant once 'Year' and 'Month' exist.
data = data.drop(columns=["Year_Month"])

In [None]:
data.head()

In [None]:
# Convert the split date parts from strings to integers.
data = data.assign(
    Year=data["Year"].astype(int),
    Month=data["Month"].astype(int),
)

In [None]:
data.info()

#### Remove duplicate reviews

In [None]:
data["Review_ID"].nunique()

In [None]:
data["Review_ID"].value_counts()

In [None]:
# Keep only the first row for each Review_ID.
data = data.drop_duplicates(subset='Review_ID', keep='first')

In [None]:
data.info()

In [None]:
data["Review_ID"].value_counts()

In [None]:
# Review_ID was only needed for de-duplication.
data = data.drop(columns=["Review_ID"])

In [None]:
data.head()

### "Branch": 'Disneyland_HongKong' -> 'HongKong'

In [None]:
data["Branch"].value_counts()

In [None]:
data.head()

In [None]:
# Keep only the text after the first underscore ('Disneyland_HongKong' -> 'HongKong').
# Taking the last piece with `.str[-1]` leaves values that contain no underscore
# unchanged, so re-running this cell is harmless; the previous `expand=True)[1]`
# lookup no longer works once the prefix has already been stripped.
data["Branch"] = data["Branch"].str.split("_", n = 1).str[-1]

In [None]:
data

### Preprocessing 'Review_Text'

In [None]:
# Renumber the rows 0..n-1 after the dropna / drop_duplicates steps so the
# label-based lookups below (e.g. data['Review_Text'][3]) work.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column, and keeps the cell safe to re-run.
data.reset_index(drop=True, inplace=True)

In [None]:
data.shape

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import re

In [None]:
data['Review_Text'][3]

In [None]:
# Replace every character that is not an ASCII letter with a space (new_review1)
new_review1 = [
    re.sub('[^a-zA-Z]', ' ', raw_text)
    for raw_text in data['Review_Text']
]

In [None]:
new_review1[3]

In [None]:
# Trim leading and trailing whitespace (new_review2)
new_review2 = [letters_only.strip() for letters_only in new_review1]

In [None]:
new_review2[3]

In [None]:
# Lower-case every review (new_review3)
new_review3 = [trimmed.lower() for trimmed in new_review2]

In [None]:
new_review3[3]

In [None]:
# Drop stop words, punctuation and the listed park/character names (new_review4)
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
name = ['disney', 'disneyland', 'iron', 'ironman', 'mickey', 'buz', 'hk', 'california', 'paris', 'hongkong']


def _is_kept(token):
    """Return True when the token is none of: stop word, punctuation, listed name."""
    return not (token in stop_words or token in punctuation or token in name)


# Split on whitespace, filter token by token, then glue the survivors back together.
new_review4 = [
    " ".join(filter(_is_kept, lowered.split()))
    for lowered in new_review3
]

In [None]:
new_review4[3]

In [None]:
# stemming (new_review5)
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

# Stem each token on its own and rebuild the review string.
# Previously every whole review string was handed to stem() as if it were a
# single word, and the loop over the three stemmers overwrote new_review5 on
# each pass, so only the Porter pass survived. Porter is therefore kept as
# the stemmer that is actually applied.
stemmer = porter
new_review5 = [
    " ".join(stemmer.stem(token) for token in filtered.split())
    for filtered in new_review4
]

In [None]:
new_review5[3]

In [None]:
# Lemmatize every token, then POS-tag the review (new_review6).
# Each entry is what pos_tag returns for one review: a list of (token, tag) pairs.
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

n = WordNetLemmatizer()
new_review6 = [
    pos_tag([n.lemmatize(token) for token in word_tokenize(stemmed)])
    for stemmed in new_review5
]

In [None]:
new_review6[3]

In [None]:
# Keep only tokens whose POS tag is in the noun / verb / adjective / adverb
# tag list below (new_review7).
kept_tags = {
    'NN', 'NNP', 'NNS', 'NNPS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'VH', 'VHD', 'VHG', 'VHN', 'VHP', 'VHZ',
    'VV', 'VVD', 'VVG', 'VVN', 'VVP', 'VVZ',
    'JJ', 'JJR', 'JJS',
    'RB', 'RBR', 'RBS',
}
new_review7 = [
    [token for token, tag in tagged if tag in kept_tags]
    for tagged in new_review6
]

In [None]:
new_review7[3]

In [None]:
# Join the kept tokens of each review back into one string (text)
text = [" ".join(tokens) for tokens in new_review7]

In [None]:
text[3]

In [None]:
# Attach the cleaned text as a new 'review' column.
data = data.assign(review=text)

In [None]:
data

## EDA & Visualization

### Rating

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
%matplotlib inline

In [None]:
colors = ['#A8A7A8', '#CC527A',  '#363636', '#E8175D', '#474747' ]

In [None]:
# Share of reviews for each star rating
star = data["Rating"].value_counts()

pie_style = {
    'autopct': '%1.1f%%',
    'textprops': {'color': 'white'},
    'colors': colors,
}
plt.pie(star, **pie_style)
plt.legend(labels=star.index)

plt.show()

In [None]:
star

In [None]:
# Load the five rating mask images as NumPy arrays.
rating_mask_paths = (
    "../input/image/1.png",
    "../input/image/2.png",
    "../input/image/3.png",
    "../input/image/4.png",
    "../input/image/5.png",
)
img1, img2, img3, img4, img5 = (
    np.array(Image.open(mask_path)) for mask_path in rating_mask_paths
)

In [None]:
# Word cloud of the cleaned reviews for every star rating (1-5).
# One figure per rating, each drawn inside that rating's mask image. This
# replaces five copy-pasted cells that differed only in the rating and mask.
for rating, rating_mask in zip(range(1, 6), (img1, img2, img3, img4, img5)):
    plt.figure(figsize = (10,10))
    Wc = WordCloud(mask = rating_mask, background_color='white',
                   max_words = 1000 , width = 500 , height = 400, min_word_length = 5,
                   contour_width = 1, contour_color = 'black', colormap = "RdPu"
                   ).generate(" ".join(data.loc[data.Rating == rating, "review"]))
    plt.axis("off")
    plt.imshow(Wc , interpolation = 'bilinear')
    # The title tells the figures apart now that they share one cell.
    plt.title(f"Rating {rating}")
    plt.show()

### Star rating by branch

In [None]:
colors = ['#A8A7A8', '#CC527A',  '#363636', '#E8175D', '#474747' ]

In [None]:
data["Branch"].value_counts()

In [None]:
# Pie chart: share of reviews per branch
branch_counts = data["Branch"].value_counts()

plt.pie(branch_counts, autopct='%1.1f%%', textprops={'color': 'white'}, colors=colors)
plt.legend(branch_counts.index, loc='upper right', fontsize="small")
plt.show()

In [None]:
# Box plot of the rating distribution for each branch
sns.set_palette(colors)
sns.boxplot(data=data, x='Branch', y='Rating')

In [None]:
# Pie charts: rating distribution of each branch, side by side
hong, cal, par = (
    data.loc[data["Branch"] == branch]
    for branch in ("HongKong", "California", "Paris")
)
hong_star, cal_star, par_star = (
    subset['Rating'].value_counts() for subset in (hong, cal, par)
)

fig, ax = plt.subplots(1, 3, figsize=(20, 7))

# One panel per branch: pie slices labelled by rating, plus a legend and title.
panels = zip(ax, (hong_star, cal_star, par_star), ("Hongkong", "California", "Paris"))
for panel, rating_counts, panel_title in panels:
    panel.pie(x=rating_counts, labels=rating_counts.index, autopct='%1.2f%%',
              textprops={'color': 'white'}, colors=colors)
    panel.legend(rating_counts.index, loc='upper right', fontsize="small")
    panel.set_title(panel_title)

In [None]:
# Mean star rating per branch, drawn as a line with every point annotated.
branch_star = data.groupby('Branch', as_index=False).agg({'Rating':'mean'})

x = branch_star['Branch']
y = branch_star['Rating']

plt.plot(x, y, color = 'palevioletred')
plt.ylim(3,5)
# Write the mean (rounded to two decimals) just above each point.
for i, v in enumerate(x):
    plt.text(v, y[i], round(y[i],2),
             fontsize = 11,
             horizontalalignment='center',
             verticalalignment='bottom')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

In [None]:
branch_star

#### wordcloud

In [None]:
# Load the three branch mask images as NumPy arrays.
img_hong, img_par, img_cal = (
    np.array(Image.open(mask_path))
    for mask_path in (
        "../input/image/hong.jpg",
        "../input/image/par.jpg",
        "../input/image/cal.jpg",
    )
)

In [None]:
# Word cloud of the cleaned reviews for each branch.
# One figure per branch, each drawn inside that branch's mask image. This
# replaces three copy-pasted cells that differed only in the branch and mask.
branch_masks = (
    ('HongKong', img_hong),
    ('California', img_cal),
    ('Paris', img_par),
)
for branch, branch_mask in branch_masks:
    plt.figure(figsize = (10,10))
    Wc = WordCloud(mask = branch_mask, background_color='white',
                   max_words = 1000 , width = 500 , height = 400, min_word_length = 5,
                   contour_width = 1, contour_color = 'black', colormap = "RdPu"
                   ).generate(" ".join(data.loc[data.Branch == branch, "review"]))
    plt.axis("off")
    plt.imshow(Wc , interpolation = 'bilinear')
    # The title tells the figures apart now that they share one cell.
    plt.title(branch)
    plt.show()

### Reviewer Location

In [None]:
import plotly.express as px

In [None]:
data["Reviewer_Location"].nunique()

In [None]:
# Mean rating per reviewer location, highest first
star_loc = (
    data.groupby('Reviewer_Location', as_index=False)['Rating']
    .mean()
    .sort_values('Rating', ascending=False)
)

In [None]:
star_loc

In [None]:
# World map coloured by the mean rating given by reviewers from each country
choropleth_options = {
    'locations': 'Reviewer_Location',
    'locationmode': 'country names',
    'color': 'Rating',
    'hover_data': ['Rating'],
    'title': 'Country - Star',
}
fig = px.choropleth(star_loc, **choropleth_options)
fig.show()

In [None]:
# Bar charts: the five most frequent reviewer locations for each branch
fig, ax = plt.subplots(1, 3, figsize=(20, 7))

hong_loc, cal_loc, par_loc = (
    data.loc[data["Branch"] == branch, "Reviewer_Location"].value_counts()[:5]
    for branch in ("HongKong", "California", "Paris")
)

# Only the left-most panel carries the y-axis label.
ax[0].set_ylabel('Number of visits')

panels = zip(ax, (hong_loc, cal_loc, par_loc), ("HongKong", "California", "Paris"))
for panel, top_locations, panel_title in panels:
    panel.bar(top_locations.index, top_locations.values, color=colors)
    panel.set_title(panel_title)
    panel.tick_params(axis='x', rotation=60)

### Year_Month

#### year

In [None]:
# Number of reviews per year (the count ends up in the 'Rating' column)
year_review = data.groupby('Year', as_index=False)['Rating'].count()

In [None]:
year_review

In [None]:
# Bar chart of the number of reviews per year, with the count above each bar.
x = year_review['Year']
y = year_review['Rating']
plt.bar(x, y, color = 'palevioletred')
plt.xticks(x)
for i, v in enumerate(x):
    plt.text(v, y[i], y[i],
             fontsize = 9,
             horizontalalignment='center',
             verticalalignment='bottom')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

In [None]:
# Review counts per year, split by star rating
sns.countplot(x='Year', hue='Rating', data=data)

In [None]:
# Mean star rating per year, drawn as a line with every point annotated.
year_star = data.groupby('Year', as_index=False).agg({'Rating':'mean'})

x = year_star['Year']
y = year_star['Rating']

plt.plot(x, y, color = 'palevioletred')
plt.xticks(x)
# Write the mean (rounded to two decimals) just above each point.
for i, v in enumerate(x):
    plt.text(v, y[i], round(y[i],2),
             fontsize = 9,
             horizontalalignment='center',
             verticalalignment='bottom')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

#### Month

In [None]:
# Number of reviews per month (the count ends up in the 'Rating' column)
month_review = data.groupby('Month', as_index=False)['Rating'].count()

In [None]:
# Bar chart of the number of reviews per month, with the count above each bar.
x = month_review['Month']
y = month_review['Rating']
plt.bar(x, y, color = 'palevioletred')
plt.xticks(x)
for i, v in enumerate(x):
    plt.text(v, y[i], y[i],
             fontsize = 9,
             horizontalalignment='center',
             verticalalignment='bottom')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

In [None]:
# Review counts per month, split by star rating
sns.countplot(x='Month', hue='Rating', data=data)

In [None]:
# Mean star rating per month, drawn as a line with every point annotated.
month_star = data.groupby('Month', as_index=False).agg({'Rating':'mean'})

x = month_star['Month']
y = month_star['Rating']

plt.plot(x, y, color = 'palevioletred')
plt.xticks(x)
# Write the mean (rounded to two decimals) just above each point.
for i, v in enumerate(x):
    plt.text(v, y[i], round(y[i],2),
             fontsize = 9,
             horizontalalignment='center',
             verticalalignment='bottom')
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

## Sentiment Analysis

### wordcloud

In [None]:
# Load the three sentiment mask images as NumPy arrays.
img_bad, img_good, img_soso = (
    np.array(Image.open(mask_path))
    for mask_path in (
        "../input/image/bad.png",
        "../input/image/good.png",
        "../input/image/soso.png",
    )
)

In [None]:
# 1~2: negative
# Word cloud of the reviews rated below 3. The filter used to be `<= 3`,
# which also pulled in the neutral 3-star reviews and contradicted both the
# heading above and the S_Polarity labelling (neg means rating < 3).

plt.figure(figsize = (10,10))
Wc = WordCloud(mask = img_bad, background_color='white',
               max_words = 1000 , width = 500 , height = 400, min_word_length = 5,
               contour_width = 1, contour_color = 'black', colormap = "RdPu").generate(" ".join(data[data.Rating < 3].review))
plt.axis("off")
plt.imshow(Wc , interpolation = 'bilinear')

In [None]:
# 4~5: positive
# Word cloud of the reviews rated above 3. The filter used to be `>= 3`,
# which also pulled in the neutral 3-star reviews and contradicted both the
# heading above and the S_Polarity labelling (pos means rating > 3).

plt.figure(figsize = (10,10))
Wc = WordCloud(mask = img_good, background_color='white',
               max_words = 1000 , width = 500 , height = 400, min_word_length = 5,
               contour_width = 1, contour_color = 'black', colormap = "RdPu").generate(" ".join(data[data.Rating > 3].review))
plt.axis("off")
plt.imshow(Wc , interpolation = 'bilinear')

In [None]:
# 3: neutral
# Word cloud of the 3-star reviews, drawn inside the "soso" mask.

neutral_text = " ".join(data.loc[data.Rating == 3, "review"])
neutral_cloud = WordCloud(
    mask=img_soso,
    background_color='white',
    max_words=1000,
    width=500,
    height=400,
    min_word_length=5,
    contour_width=1,
    contour_color='black',
    colormap="RdPu",
)

plt.figure(figsize=(10, 10))
Wc = neutral_cloud.generate(neutral_text)
plt.axis("off")
plt.imshow(Wc, interpolation='bilinear')

In [None]:
# Label each review from its star rating: above 3 -> 'pos', below 3 -> 'neg', exactly 3 -> 'neu'.
S_Polarity = [
    'pos' if rating > 3 else ('neg' if rating < 3 else 'neu')
    for rating in data["Rating"]
]

In [None]:
# Store the rating-based labels as a new column.
data = data.assign(S_Polarity=S_Polarity)

In [None]:
data

In [None]:
# Pie chart: share of each rating-based polarity label
star = data["S_Polarity"].value_counts()

pie_style = {'autopct': '%1.1f%%', 'textprops': {'color': 'white'}}
plt.pie(star, **pie_style)
plt.legend(labels=star.index)

plt.show()

### lexicon based

#### vader

In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_analyzer=SentimentIntensityAnalyzer()

In [None]:
data['Rating'][6]

In [None]:
data['review'][6]

In [None]:
# VADER compound score for every cleaned review
v_scores = [
    senti_analyzer.polarity_scores(cleaned_review)['compound']
    for cleaned_review in data['review']
]

In [None]:
v_scores[6]

In [None]:
# Store the compound scores as a new column.
data = data.assign(vader=v_scores)

In [None]:
data.groupby("Rating")["vader"].describe()

In [None]:
# Label each review from its compound score: positive -> 'pos', negative -> 'neg', zero -> 'neu'.
V_Polarity = [
    'pos' if compound > 0 else ('neg' if compound < 0 else 'neu')
    for compound in v_scores
]

In [None]:
V_Polarity[6]

In [None]:
# Store the VADER-based labels as a new column.
data = data.assign(V_Polarity=V_Polarity)

In [None]:
data

In [None]:
# Pie chart: share of each VADER-based polarity label
v_pol = data["V_Polarity"].value_counts()

pie_style = {'autopct': '%1.1f%%', 'textprops': {'color': 'white'}}
plt.pie(v_pol, **pie_style)
plt.legend(labels=v_pol.index)

plt.show()

### Decision Tree Classifier, Random Forest Classifier

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#### Star

In [None]:
# Features: cleaned review text. Target: rating-based polarity label.
x, y = data['review'], data['S_Polarity']

In [None]:
# Stratified 80/20 train/test split.
# random_state is fixed (same seed as the classifiers below) so the split, and
# therefore the reported scores, are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

print(x_train.shape, x_test.shape)
# Class counts in the training part.
np.unique(y_train, return_counts=True)

In [None]:
# Learn the TF-IDF vocabulary from the training reviews and encode them in one step.
stop_words = stopwords.words('english')

vect = TfidfVectorizer(stop_words=stop_words)
x_train_vectorized = vect.fit_transform(x_train)

x_train_vectorized

In [None]:
# Decision tree on the TF-IDF features; prints train accuracy, then test accuracy.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=100, random_state=42).fit(x_train_vectorized, y_train)
for features, labels in ((x_train_vectorized, y_train), (vect.transform(x_test), y_test)):
    print(dtc.score(features, labels))

In [None]:
# Random forest on the TF-IDF features; prints train accuracy, then test accuracy.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=100, random_state=42).fit(x_train_vectorized, y_train)
for features, labels in ((x_train_vectorized, y_train), (vect.transform(x_test), y_test)):
    print(rfc.score(features, labels))

#### vader

In [None]:
# Features: cleaned review text. Target: VADER-based polarity label.
x, y = data['review'], data['V_Polarity']

In [None]:
# Stratified 80/20 train/test split.
# random_state is fixed (same seed as the classifiers below) so the split, and
# therefore the reported scores, are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

print(x_train.shape, x_test.shape)
# Class counts in the training part.
np.unique(y_train, return_counts=True)

In [None]:
# Learn the TF-IDF vocabulary from the training reviews and encode them in one step.
stop_words = stopwords.words('english')

vect = TfidfVectorizer(stop_words=stop_words)
x_train_vectorized = vect.fit_transform(x_train)

x_train_vectorized

In [None]:
# Decision tree on the TF-IDF features; prints train accuracy, then test accuracy.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=100, random_state=42).fit(x_train_vectorized, y_train)
for features, labels in ((x_train_vectorized, y_train), (vect.transform(x_test), y_test)):
    print(dtc.score(features, labels))

In [None]:
# Random forest on the TF-IDF features; prints train accuracy, then test accuracy.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=100, random_state=42).fit(x_train_vectorized, y_train)
for features, labels in ((x_train_vectorized, y_train), (vect.transform(x_test), y_test)):
    print(rfc.score(features, labels))