# Universal Studios Reviews - EDA

## 1. Import Libraries

In [None]:
# data manipulation
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# text data
from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator
import re
import string

# sentiment analysis
from textblob import TextBlob
from wordcloud import WordCloud

## 2. Data Wrangling

### Read Data

In [None]:
# location of the raw reviews CSV, relative to the notebook
DATA_PATH = '../input/reviewuniversalstudio/universal_studio_branches.csv'

# load the reviews and preview the first rows
df = pd.read_csv(DATA_PATH, encoding='UTF-8')
df.head()

In [None]:
# number of missing values in each column
df.isnull().sum()

### Change Data Type

In [None]:
# dtype pandas inferred for each column when reading the CSV
df.dtypes

Based on the dtypes above, we should convert `written_date` to `datetime64`.

In [None]:
# parse written_date into datetime values, then re-check the column dtypes
df = df.assign(written_date=pd.to_datetime(df['written_date']))
df.dtypes

In [None]:
# number of distinct values in each column
df.nunique()

We also need to convert `branch` to the `category` data type.

In [None]:
# list the distinct branch labels before converting the column
df['branch'].unique()

In [None]:
# store branch as a categorical column, then re-check the column dtypes
df = df.astype({'branch': 'category'})
df.dtypes

### Removing Duplicate Data

In [None]:
# (rows, columns) before removing duplicate reviews
df.shape

**We have 50,904 rows and 6 columns.**<br><br>
On further analysis, rows that share the same `review_text` also contain the same information in their other columns. So we need to drop the duplicates, keeping the first occurrence of each review.

In [None]:
# keep only the first row for every distinct review text
# (the original row labels are kept; the index is not reset)
df = df.drop_duplicates(subset='review_text', keep='first')
df.shape

### Data Summary

In [None]:
# one-screen overview of the de-duplicated frame
n_rows, n_cols = df.shape
feature_names = list(df.columns)
total_missing = df.isnull().sum().sum()

print("Number of Rows     :", n_rows)
print("Number of Columns  :", n_cols)
print("\nFeatures         :\n", feature_names)
print("\nMissing values   :", total_missing)
print("\nUnique values    :\n", df.nunique())
print("\nBranches Reviews :\n", df['branch'].value_counts())

The data is imbalanced: the review counts are not evenly distributed across the branches.

## 3. Feature Engineering

### Get Period Information of `written_date`

In [None]:
# derive the calendar quarter (pandas Period, frequency 'Q') of every review date
df = df.assign(quarter=df['written_date'].dt.to_period('Q'))
df.head()

### Get Review Length Feature of `review_text`

In [None]:
# peek at one raw review (the fifth row by position) as a one-element array
# NOTE(review): this section is titled as a review-length feature, but this cell
# only previews a review and creates no length column - confirm whether that
# feature is computed elsewhere in the notebook.
df['review_text'].values[4:5]

### Get Sentiment Analysis Feature

Let's clean the `review_text` column first by defining a function:

In [None]:
# Contraction rewrites applied in order by text_clean, as (pattern, replacement).
# Every pattern accepts both the ASCII apostrophe (') and the typographic one (’).
# Suffix patterns require a word character before the apostrophe and a word
# boundary after the suffix, so an opening quote such as 'simpsons' or 'the best'
# is never mistaken for a contraction.
_CONTRACTIONS = (
    # specific forms the general suffix rules would get wrong
    (r"\bwon['’]t\b", "will not"),
    (r"\bcan['’]t\b", "can not"),
    (r"(?<=\w)n?['’]t['’]ve\b", " not have"),   # couldn't've -> could not have
    (r"(?<=\w)['’]d['’]ve\b", " would have"),
    (r"\bo['’]clock\b", "of the clock"),
    (r"(?<!\w)['’]cause\b", " because"),        # here the apostrophe starts the word
    # general suffixes
    (r"(?<=\w)n['’]t\b", " not"),
    (r"(?<=\w)['’]re\b", " are"),
    (r"(?<=\w)['’]s\b", " is"),                 # heuristic: possessives also become " is"
    (r"(?<=\w)['’]d\b", " would"),
    (r"(?<=\w)['’]ll\b", " will"),
    (r"(?<=\w)['’]t\b", " not"),
    (r"(?<=\w)['’]ve\b", " have"),
    (r"(?<=\w)['’]m\b", " am"),
)


def text_clean(text):
    """Normalise a raw review string for sentiment scoring.

    Steps, in order: lower-case the text, expand English contractions,
    drop square-bracketed spans, delete ASCII punctuation, turn every
    remaining non-word character into a space, remove a leading
    stand-alone ``b`` token, collapse whitespace runs and trim the ends.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Lower-case text containing only word characters separated by
        single spaces.
    """
    # case folding
    text = text.lower()

    # decontract text
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # drop square-bracketed spans together with their content
    text = re.sub(r"\[.*?\]", "", text)

    # delete ASCII punctuation outright (no space is inserted in its place)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # every remaining non-word character becomes a space; this already covers
    # curly quotes, ellipses and line breaks, so they need no separate pass
    text = re.sub(r"\W", " ", text)

    # remove a leading stand-alone 'b' token
    # NOTE(review): presumably aimed at stringified bytes literals (b'...');
    # it also strips a genuine leading "b" word - confirm it is still wanted.
    text = re.sub(r"^b\s+", "", text)

    # collapse whitespace runs and trim the ends
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# cleaned copy of every review, kept in its own frame that shares df's index
cleaned_reviews = df['review_text'].apply(text_clean)
df_text = cleaned_reviews.to_frame(name='clean_review_text')

Let's compare a review before and after cleaning.

In [None]:
# raw text of the review with index label 44
# (label-based lookup: this relies on row 44 not having been dropped as a duplicate)
df.loc[44,'review_text']

In [None]:
# the same review (index label 44) after text_clean
df_text.loc[44,'clean_review_text']

**Sentiment Analysis**<br>
Define function to get the subjectivity and polarity of text

In [None]:
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getPolarity(text):
    """Return the TextBlob polarity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

Create the `subjectivity`, `polarity`, and `sentiment` columns:

In [None]:
# score every cleaned review with the two TextBlob helpers
cleaned_reviews = df_text['clean_review_text']
df_text['subjectivity'] = cleaned_reviews.apply(getSubjectivity)
df_text['polarity'] = cleaned_reviews.apply(getPolarity)
def getsentiment(score):
    """Map a polarity score to a sentiment label.

    Returns 'neutral' for a score equal to 0, 'negative' for a score
    below 0 and 'positive' for anything else.
    """
    if score == 0:
        return 'neutral'
    return 'negative' if score < 0 else 'positive'
# label each review from its polarity score and preview the result
df_text['sentiment'] = df_text['polarity'].map(getsentiment)
df_text.head()

In [None]:
# how many reviews received each polarity-based sentiment label
df_text['sentiment'].value_counts()

Besides, assume that any review with a rating below 3.0 is 'negative', regardless of its text polarity.

In [None]:
# A rating below 3.0 overrides the text-based label with 'negative'; every other
# review (including one with a missing rating) keeps the label from df_text.
# Rows are matched on the index that df and df_text share.
low_rating = df['rating'] < 3.0
df['sentiment'] = df_text['sentiment'].mask(low_rating, 'negative')
df.head()

In [None]:
# label distribution after applying the low-rating override
df['sentiment'].value_counts()