# Import Libraries

In [None]:
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra
#------------------data viz-----------------------
import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec 
%matplotlib inline
import seaborn as sns 
import plotly.express as px # 
import plotly.graph_objects as go # 
from statsmodels.graphics.gofplots import qqplot 
#_________________________________________________
import re # text data
import string

# Lets look at the data...

In [None]:
file_path = '../input/disneyland-reviews/DisneylandReviews.csv'
df = pd.read_csv(file_path, encoding = 'ISO-8859-1')

In [None]:
df.shape

In [None]:
df.isnull().sum()

## First five rows

In [None]:
df.head()

## Last five rows

- Although `isnull()` reported no missing values initially, the last five rows of the dataset show that the `Year_Month` column contains missing values stored as the string `missing`. Therefore, the file is read again with `na_values` set to `missing`.

In [None]:
df.tail()

## Reading data again and specifying na_values as 'missing'

In [None]:
df = pd.read_csv(file_path, encoding = 'ISO-8859-1', na_values = 'missing')
df.tail()

The `Year_Month` column contains about 6 percent missing values. These rows are not dropped, since the goal is to analyse the reviews: removing the rows with missing dates would result in a loss of review data.

In [None]:
df.isnull().sum()/len(df)*100

## Duplicate Reviews

The `Review_ID` column contains duplicate IDs.

In [None]:
df.Review_ID.value_counts().head(21)

Further analysis of the duplicate IDs indicates that the duplicated rows contain the same information in the other columns too.

In [None]:
df[df.Review_ID == 166787525]

In [None]:
df[df.Review_ID == 129231609]

## Dropping duplicate records and keeping the first

In [None]:
# Keep only the first row seen for every Review_ID; later repeats are discarded.
df = df.drop_duplicates(subset='Review_ID', keep='first')

In [None]:
df.shape

# EDA

## Number of reviews for the three Disneyland branches
Disneyland California has the highest percentage of reviews (approx. 46%).

In [None]:
# Number of reviews per branch, ordered from most to fewest reviews
branch_count = df['Branch'].value_counts()
branch_col = ['navy', 'crimson', 'forestgreen']
# strip the first 11 characters of every branch label
# (presumably a 'Disneyland_' prefix -- confirm against the data)
branch_name = [label[11:] for label in branch_count.index]

with plt.style.context('fivethirtyeight'):
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))
    plt.subplots_adjust(wspace=0.3)

    # left panel: absolute number of reviews, with the count written mid-bar
    bar_ax.bar(branch_name, branch_count.values, color=branch_col)
    for name, n_reviews, colour in zip(branch_name, branch_count.values, branch_col):
        badge = dict(facecolor=colour, edgecolor='white', boxstyle='circle')
        bar_ax.text(name, n_reviews/2, n_reviews,
                    ha='center', color='white', bbox=badge)
    bar_ax.set_ylabel('Number of Reviews')

    # right panel: share of reviews per branch (names go in the legend only)
    pie_ax.pie(x=branch_count.values,
               colors=branch_col,
               autopct='%1.1f%%',
               textprops=dict(color='white'))
    pie_ax.legend(labels=branch_name, loc='upper right', fontsize="xx-small")
    plt.show()

## Peak time to visit Disneyland
The peak time is computed by counting the reviews/visits in each quarter for the respective Disneyland branch. **Note:** only data from 2014 - 2019 was included in this analysis, in order to analyse the current trend.

In [None]:
# Rows containing any missing value (e.g. a missing date) are left out
no_missing = df.dropna().reset_index()
# 'Year_Month' is split on '-': first part is the year, second part the month
year_month_parts = no_missing['Year_Month'].str.split('-', expand=True)
no_missing['Year'] = year_month_parts[0].astype(int)
no_missing['Month'] = year_month_parts[1].astype(int)
# months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
no_missing['Quater'] = (no_missing['Month'] - 1) // 3 + 1

Therefore, Quarter 3 is the busiest period to visit Disneyland for all the Disneyland branches.

In [None]:
# Reviews per quarter, one panel per branch
branch_col = ['navy', 'crimson', 'forestgreen']
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, 3, figsize=(12, 5), sharey=True)
    plt.subplots_adjust(top=0.8)
    for i, (branch, col) in enumerate(zip(no_missing.Branch.unique(), branch_col)):
        # A row is counted only when it belongs to this branch AND is dated
        # 2014 or later, so every panel shows its own branch's reviews only.
        in_scope = (no_missing.Branch == branch) & (no_missing.Year >= 2014)
        counts = no_missing[in_scope]['Quater'].value_counts()
        x = counts.index
        y = counts.values
        ax[i].bar(x, y, color=col, label=branch)
        # write each count in the middle of its bar
        for q, val in zip(x, y):
            ax[i].text(q, val/2, val, ha='center',
                       color='white',
                       bbox=dict(facecolor=col, edgecolor='white', boxstyle="circle"))
        ax[i].set_xlabel('Quater')
        ax[i].set_title(branch.upper()[11:], color=col)
    # the y axis is shared, so only the first panel carries the label
    ax[0].set_ylabel('Number of visits')
    fig.suptitle('Peak time to visit Disneyland (from 2014-2019 data)', fontsize=15, fontweight='semibold')
    # plt.show() works with the inline backend; Figure.show() expects a GUI backend
    plt.show()

In [None]:
# Interactive version: reviews per quarter, one bar group per branch
fig = go.Figure()
for branch, col in zip(no_missing.Branch.unique(), branch_col):
    # A row is counted only when it belongs to this branch AND is dated
    # 2014 or later, so every trace holds its own branch's reviews only.
    in_scope = (no_missing.Branch == branch) & (no_missing.Year >= 2014)
    counts = no_missing[in_scope]['Quater'].value_counts()
    x = counts.index
    y = counts.values
    fig.add_trace(go.Bar(x=x, y=y, name=branch[11:], marker_color=col, text=y))
fig.update_traces(textposition='inside')
fig.update_layout(barmode='group', xaxis_tickangle=-45, template='ggplot2',
                  title='Peak time to visit Disneyland (from 2014-2019 data)',
                  xaxis_title='Quater', yaxis_title='Number of visits')
fig.show()

## Major Groups visiting Disneyland branches

In [None]:
# Top-5 reviewer locations for every branch, one panel per branch.
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer matplotlib
# releases; use whichever name this installation provides.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
with plt.style.context(seaborn_style):
    fig, ax = plt.subplots(1, 3, figsize=(12, 5))
    plt.subplots_adjust(top=0.8, wspace=0.3)
    for i, (branch, col) in enumerate(zip(df.Branch.unique(), branch_col)):
        # count the reviews for a particular disneyland branch and keep the
        # five most frequent reviewer locations
        loc_count = df[df.Branch == branch]['Reviewer_Location'].value_counts()[:5]
        # plot
        x = loc_count.index
        y = loc_count.values
        ax[i].bar(x, y, color=col)
        ax[i].set_title(branch.upper()[11:], color=col)
        ax[i].tick_params(axis='x', rotation=90)
        # write each count just above the top of its bar
        for c, val in zip(x, y):
            ax[i].text(c, val*1.01, val, ha='center', color='white',
                       bbox=dict(facecolor=col,
                                 edgecolor='white',
                                 boxstyle="circle", pad=0.5))
    ax[0].set_ylabel('Number of visits')
    fig.suptitle('Major groups visiting disneyland branches',
                 fontsize=15, fontweight='semibold')
    # plt.show() works with the inline backend; Figure.show() expects a GUI backend
    plt.show()

## Distribution of Ratings

In [None]:
# How many reviews gave each rating
rating_count = df['Rating'].value_counts()
# total number of rated reviews, used to express each bar as a percentage
total_ratings = sum(rating_count.values)
with plt.style.context('ggplot'):
    plt.figure(figsize=(8, 6))
    plt.bar(rating_count.index, rating_count.values, color='teal')
    # label the top of each bar with its share of all ratings
    for rating, n_reviews in zip(rating_count.index, rating_count.values):
        share_label = str(round(n_reviews/total_ratings*100, 2))+'%'
        label_box = dict(facecolor='dimgrey', edgecolor='white', boxstyle="round")
        plt.text(rating, n_reviews, share_label,
                 ha='center', color='white', bbox=label_box)
    plt.xlabel('Ratings')
    plt.ylabel('Number of reviewers')
    plt.yticks(np.arange(0, 30001, 5000))

In [None]:
# Number of reviews per rating, one line per branch
markers = ['o', '*', 'D']
with plt.style.context('bmh'):
    plt.figure(figsize=(10, 8))
    for branch, m, col in zip(df.Branch.unique(), markers, branch_col):
        # value_counts() orders by frequency; sort by rating so that the line
        # runs left to right from the lowest to the highest rating.
        counts = df[df.Branch == branch]['Rating'].value_counts().sort_index()
        plt.plot(counts.index, counts.values,
                 marker=m, markersize=9, color=col, label=branch)
    # axis decoration is the same for every branch, so it is set once
    plt.xticks(np.arange(1, 6))
    plt.xlabel('Ratings')
    plt.ylabel('Number of reviews')
    plt.title('Distribution of Ratings across disneyland branches')
    plt.legend()
    plt.show()

In [None]:
# Mean rating of every branch (index: branch name, values: mean rating)
branch_ratings = df.groupby('Branch')['Rating'].mean()
# branch labels without their first 11 characters
short_names = [name[11:] for name in branch_ratings.index]

plt.figure(figsize=(6, 5))
plt.barh(short_names, branch_ratings.values,
         color=['forestgreen', 'navy', 'crimson'])
# print the rounded mean in the middle of each horizontal bar
for avg, name in zip(branch_ratings.values, short_names):
    plt.text(avg/2, name, round(avg, 2), color='white', ha='center',
             bbox=dict(boxstyle='round4', facecolor='black'))
plt.xlabel('Average rating')
plt.xticks(np.arange(0, 6))
plt.show()

## Additional Features

### Length of Review
It is the length of a review minus the spaces

In [None]:
# number of characters in each review, not counting space characters
df['review_len'] = df['Review_Text'].str.len() - df['Review_Text'].str.count(' ')

In [None]:
def univariate_dist(data, col, color=None, theme='ggplot', figsize=(12, 10), hist_bins='auto'):
    """
    Plot the univariate distribution of one column of a pandas dataframe.

    Three panels are drawn on a single figure: a histogram with a KDE curve
    (top left), a QQ plot (bottom left) and a boxplot (right, full height).

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe that holds the column to plot.
    col : str
        Name of the column to plot; its upper-cased name is the figure title.
    color : optional
        Colour passed to all three plots.
    theme : str
        Matplotlib style applied while the figure is built and shown.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    hist_bins : str or int
        ``bins`` argument handed to ``sns.histplot``.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    with plt.style.context(theme):
        fig = plt.figure(figsize=figsize)
        fig.subplots_adjust(wspace=0.5, hspace=0.4)
        # 2 x 3 grid: the two left columns hold histogram and QQ plot,
        # the last column holds the boxplot over both rows
        spec = gridspec.GridSpec(2, 3, figure=fig)
        ax1 = fig.add_subplot(spec[0, :-1]) # first axis
        ax1.set_title('Histogram', color='crimson')
        ax2 = fig.add_subplot(spec[1, :-1]) # second axis
        ax2.set_title('QQ Plot', color='crimson')
        ax3 = fig.add_subplot(spec[:, -1:]) # third axis
        ax3.set_title('Boxplot', color='crimson')
        sns.histplot(data=data, x=col, ax=ax1, color=color, kde=True, bins=hist_bins)
        qqplot(data[col], fit=True, line='45', ax=ax2, color=color)
        sns.boxplot(y=data[col], ax=ax3, color=color)
        fig.suptitle(col.upper())
        # plt.show() renders on the notebook's inline backend, whereas
        # Figure.show() expects a GUI backend and warns otherwise.
        plt.show()

In [None]:
univariate_dist(df, 'review_len', 'goldenrod')

### Punctuations Percentage

In [None]:
punctuations = string.punctuation # characters that count as punctuation
# percentage of punctuations
def count_punc(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Parameters
    ----------
    text : str
        Text to inspect.

    Returns
    -------
    float
        ``punctuation characters / non-space characters * 100`` rounded to
        three decimals, or ``0.0`` when the text has no non-space characters.
    """
    non_space_len = len(text) - text.count(" ")
    # empty or all-space text: nothing to divide by, so report 0 %
    if non_space_len == 0:
        return 0.0
    count = sum(1 for char in text if char in punctuations)
    return round(count/non_space_len*100, 3)

# punctuation percentage of every review
df['punc%'] = df['Review_Text'].apply(count_punc)

In [None]:
univariate_dist(df, 'punc%')

### Word Count

In [None]:
df.head(1)

In [None]:
# Number of pieces obtained by splitting each review on a single space.
# NOTE: split(' ') cuts at every individual space, so a run of consecutive
# spaces yields empty strings that are counted as words too.
df['word_count'] = df['Review_Text'].apply(lambda x: len(x.split(' ')))

In [None]:
univariate_dist(df, 'word_count')

# Check Impurity

It is quite obvious that the relationship between the review length and the word count should be linear. But the plot below indicates some impurity in the dataset: there are some data points with a small review length but an extremely large word count. Further analysis needs to be conducted.

In [None]:
# hard-coded (x, y) positions of the 'Bad Data' labels
# (presumably placed next to the outlying points -- confirm against the plot)
bad_data_label_positions = [(188, 1330), (521, 5600), (168, 793), (145, 1175)]
with plt.style.context('ggplot'):
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='review_len', y='word_count')
    for x_pos, y_pos in bad_data_label_positions:
        plt.text(x_pos, y_pos, 'Bad Data')

## Function to check impurity
- The below function searches for all the characters defined in RE_SUSPICIOUS

In [None]:
# characters regarded as suspicious: & # < > { } [ ] and the backslash
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
def impurity(text, min_len=10):
    """Return the percentage of suspicious characters in a text.

    Parameters
    ----------
    text : str or None
        Text to inspect. Anything that is not a string (``None``, NaN, ...)
        is treated as having no impurity.
    min_len : int
        Texts shorter than this many characters are not scored.

    Returns
    -------
    int or float
        ``0`` for non-string, empty or too-short input; otherwise the number
        of characters matched by ``RE_SUSPICIOUS`` divided by the text length,
        times 100.
    """
    # the emptiness check also prevents a division by zero when min_len is 0
    if not isinstance(text, str) or len(text) == 0 or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text))/len(text)*100

In [None]:
df['impurity'] = df['Review_Text'].apply(impurity)

- After applying the function to the dataframe, the dataframe is sorted by the percentage of impurity in descending order.
- It can be observed that there is at most 3% impurity in the data. 

In [None]:
df.sort_values(by='impurity', ascending=False)