In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' name, so prefer the new name
# when this Matplotlib provides it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import nltk
from nltk.corpus import stopwords
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
# Fetch the NLTK resources this notebook relies on: the stop-word lists used
# right below, and the Punkt sentence models that word_tokenize() loads
# internally ('punkt' on older NLTK releases, 'punkt_tab' on newer ones).
# NOTE(review): nltk.download() is expected to report an unknown or
# unreachable package through its return value/log output rather than by
# raising, so the extra names should be harmless where they do not apply.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop words as a set, used for membership tests further down.
stop = set(stopwords.words('english'))

# Seed shared by every randomised step in the notebook.
SEED = 13

In [None]:
# Load the raw reviews. The file is decoded as ISO-8859-1, and the literal
# string 'missing' is passed as an extra NaN marker via na_values.
data = pd.read_csv('/kaggle/input/disneyland-reviews/DisneylandReviews.csv', encoding='iso-8859-1', na_values='missing')

In [None]:
# (rows, columns) of the freshly loaded frame.
data.shape

In [None]:
# Summary of the columns of the freshly loaded frame.
data.info()

In [None]:
# Show every row whose Review_ID appears more than once, ordered by ID so the
# repeated entries sit next to each other.
repeated_id_mask = data['Review_ID'].duplicated(keep=False)
data[repeated_id_mask].sort_values('Review_ID')

In [None]:
# Keep only the first row for each Review_ID; the frame is modified in place
# and its index is not renumbered.
data.drop_duplicates(subset='Review_ID', keep='first', inplace=True)

In [None]:
# (rows, columns) after removing repeated Review_IDs.
data.shape

In [None]:
# Number of reviews per raw Rating value.
data.Rating.value_counts()

In [None]:
def remap_rating(rating):
    """Collapse a numeric rating into a three-way sentiment label.

    Returns 'positive' for ratings above 3, 'negative' for ratings below 3,
    and 'neutral' for anything that is neither above nor below 3.
    """
    if rating > 3:
        return 'positive'
    if rating < 3:
        return 'negative'
    return 'neutral'
    
# Overwrite the numeric Rating column with its sentiment label.
data['Rating'] = data['Rating'].apply(remap_rating)

In [None]:
# Number of reviews per sentiment class after the remapping.
data.Rating.value_counts()

In [None]:
# Make sure every review is a string before any text processing.
data['Review_Text'] = data['Review_Text'].astype(str)

# Reviews per sentiment class, largest class first.
class_data = data.groupby('Rating').count()['Review_Text'].reset_index().sort_values('Review_Text', ascending=False)
percent_rating = class_data.Review_Text
labels = class_data.Rating

# Pick each wedge's colour from its label rather than from its rank in the
# sorted counts, so a class keeps the colour used for it in the per-class
# histograms even if the class sizes change order.
sentiment_colors = {'positive': '#00ff00', 'neutral': '#0000ff', 'negative': '#ff0000'}
colors = [sentiment_colors[label] for label in labels]

chart, _, _ = plt.pie(percent_rating, colors=colors, radius=1.0, labels=labels, autopct="%.1f%%")
plt.setp(chart, width=0.5)  # narrow the wedges so the pie renders as a ring
plt.title('Share of reviews per sentiment class')
plt.show()

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,5))

# One panel per sentiment class. Each panel is titled with its class, because
# nothing else in the figure says which histogram belongs to which class.
for ax, sentiment, color in (
    (ax1, 'negative', '#ff0000'),
    (ax2, 'neutral', '#0000ff'),
    (ax3, 'positive', '#00ff00'),
):
    # Length of each review of this class, in characters.
    review_len = data.loc[data['Rating'] == sentiment, 'Review_Text'].str.len()
    ax.hist(review_len, color=color)
    ax.set_title(sentiment)

fig.suptitle('Number of characters in reviews')
plt.show()

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,5))

# One panel per sentiment class. Each panel is titled with its class, because
# nothing else in the figure says which histogram belongs to which class.
for ax, sentiment, color in (
    (ax1, 'negative', '#ff0000'),
    (ax2, 'neutral', '#0000ff'),
    (ax3, 'positive', '#00ff00'),
):
    # Number of whitespace-separated words in each review of this class.
    review_len = data.loc[data['Rating'] == sentiment, 'Review_Text'].str.split().map(len)
    ax.hist(review_len, color=color)
    ax.set_title(sentiment)

fig.suptitle('Total words in a review')
plt.show()

In [None]:
def create_corpus(target):
    """Return every whitespace-separated token of the reviews rated `target`.

    Reads the module-level `data` frame. Tokens are returned as one flat list,
    in review order, with repeats kept.
    """
    token_lists = data[data['Rating'] == target]['Review_Text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [None]:
# English stop words as a set; the same assignment is also made in the
# notebook's setup cell.
stop = set(stopwords.words('english'))

In [None]:
pos_common_words = create_corpus('positive')
pos_counter = Counter(pos_common_words)
pos_most = pos_counter.most_common()
x = []
y = []
# Look at the 40 most frequent tokens and keep those that are not stop words.
for word, count in pos_most[:40]:
    # The corpus is built from the raw, mixed-case text while NLTK's English
    # stop-word list is lower-case, so compare case-insensitively; otherwise
    # capitalised stop words such as "The" or "I" end up in the chart.
    if word.lower() not in stop:
        x.append(word)
        y.append(count)
sns.barplot(x=y, y=x)
plt.title('Most common words in positive reviews')
plt.show()

In [None]:
neu_common_words = create_corpus('neutral')
neu_counter = Counter(neu_common_words)
neu_most = neu_counter.most_common()
x = []
y = []
# Look at the 40 most frequent tokens and keep those that are not stop words.
for word, count in neu_most[:40]:
    # The corpus is built from the raw, mixed-case text while NLTK's English
    # stop-word list is lower-case, so compare case-insensitively; otherwise
    # capitalised stop words such as "The" or "I" end up in the chart.
    if word.lower() not in stop:
        x.append(word)
        y.append(count)
sns.barplot(x=y, y=x)
plt.title('Most common words in neutral reviews')
plt.show()

In [None]:
neg_common_words = create_corpus('negative')
neg_counter = Counter(neg_common_words)
neg_most = neg_counter.most_common()
x = []
y = []
# Look at the 40 most frequent tokens and keep those that are not stop words.
for word, count in neg_most[:40]:
    # The corpus is built from the raw, mixed-case text while NLTK's English
    # stop-word list is lower-case, so compare case-insensitively; otherwise
    # capitalised stop words such as "The" or "I" end up in the chart.
    if word.lower() not in stop:
        x.append(word)
        y.append(count)
sns.barplot(x=y, y=x)
plt.title('Most common words in negative reviews')
plt.show()

In [None]:
import re
import random

In [None]:
def remove_punctuations(review):
    """Replace every unwanted span of `review` with a single space.

    A span is any one of: an @mention, a #hashtag, a run of non-ASCII
    characters, a single digit, a scheme://... URL, a single character that
    is neither a word character nor whitespace, or a run of whitespace.
    Underscores count as word characters and are therefore left in place.
    """
    unwanted = r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^\x00-\x7F]+)|([0-9])|(\w+:\/\/\S+)|([^\w\s])|(\s+)'
    return re.compile(unwanted).sub(' ', review)

def rep(review):
    """Return `review` with every underscore turned into a space."""
    return ' '.join(review.split('_'))

def whitespace_LT(review):
    """Trim leading and trailing whitespace from `review`."""
    return review.lstrip().rstrip()

def multispace(review):
    """Collapse each run of whitespace in `review` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', review)

In [None]:
# Normalise the review text: lower-case it, blank out unwanted spans, turn
# underscores into spaces, trim both ends, then collapse repeated whitespace.
data['Review_Text'] = (
    data['Review_Text']
    .str.lower()
    .apply(remove_punctuations)
    .apply(rep)
    .apply(whitespace_LT)
    .apply(multispace)
)

In [None]:
# Show one cleaned review picked at random. Select by position: the index has
# gaps after drop_duplicates(), and random.randint() includes its upper bound,
# so using a random integer as an index label could raise KeyError.
data['Review_Text'].iloc[random.randrange(len(data['Review_Text']))]

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def word_tokenize_wrapper(review):
    """Tokenize `review` with NLTK's word_tokenize and return the result."""
    return word_tokenize(review)

def freqDist_wrapper(review):
    """Build an NLTK FreqDist from `review`; the notebook passes in a review's token list."""
    return FreqDist(review)

In [None]:
# Tokenize every cleaned review into a list of words.
data['Review_Text_Token'] = data['Review_Text'].apply(word_tokenize_wrapper)

In [None]:
# Per-review token frequency distribution.
data['Review_Text_Token_FreqDist'] = data['Review_Text_Token'].apply(freqDist_wrapper)

In [None]:
from nltk.corpus import stopwords
# English stop words, held as a set for fast membership tests.
list_stopwords = set(stopwords.words('english'))

In [None]:
def stopwords_removal(words):
    """Return the items of `words` that are not in `list_stopwords`, order preserved."""
    return list(filter(lambda word: word not in list_stopwords, words))

In [None]:
# Token lists with the stop words filtered out.
data['Review_Text_Token_WSW'] = data['Review_Text_Token'].apply(stopwords_removal)

In [None]:
# First few token lists after stop-word removal.
data['Review_Text_Token_WSW'].head()

In [None]:
!pip install swifter

In [None]:
from nltk.stem import PorterStemmer
import swifter

stemmer = PorterStemmer()

def stemmer_wrapper(term):
    """Return the Porter stem of a single token."""
    return stemmer.stem(term)

# Collect every distinct token once, in first-seen order, so that each one is
# stemmed a single time instead of once per occurrence.
term_dict = dict.fromkeys(
    term for document in data['Review_Text_Token_WSW'] for term in document
)
print(len(term_dict))

# Map each distinct token to its stem.
term_dict = {term: stemmer_wrapper(term) for term in term_dict}

# Preview a handful of mappings only: printing every entry, and then the whole
# dict, writes one line per vocabulary term into the notebook output.
print(list(term_dict.items())[:10])

def get_stemmed_term(document):
    """Replace every token of `document` with its stem looked up in `term_dict`."""
    return [term_dict[term] for term in document]

data['Review_Stemmed'] = data['Review_Text_Token_WSW'].swifter.apply(get_stemmed_term)
print(data['Review_Stemmed'])

In [None]:
# Preview the frame now that the token, stop-word-filtered and stemmed columns exist.
data.head()

In [None]:
# Keep only the sentiment and the stemmed tokens, renamed to label / review.
cols = ['Rating', 'Review_Stemmed']
data = data[cols].rename(columns=dict(zip(cols, ['label', 'review'])))

In [None]:
import ast
def join(reviews):
    """Concatenate the strings in `reviews` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(reviews)

# Turn each list of stemmed tokens back into one space-separated string.
data['review'] = data['review'].apply(join)

In [None]:
# Share of reviews in each sentiment class (normalize=True gives fractions).
data.label.value_counts(normalize=True)

In [None]:
# Summary of the reduced label / review frame.
data.info()

In [None]:
def remap_label(label):
    """Encode a sentiment label as an integer.

    'positive' maps to 1, 'negative' maps to -1, and any other value maps to 0.
    """
    if label == 'positive':
        return 1
    return -1 if label == 'negative' else 0

In [None]:
# Replace the text labels with their integer codes.
data['label'] = data['label'].apply(remap_label)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Word-level TF-IDF over the stemmed reviews.
# NOTE(review): the vectorizer is fit on every review, including the ones that
# later serve as validation folds - confirm that this is acceptable.
tfidf = TfidfVectorizer(analyzer='word')
features = tfidf.fit_transform(data['review'])

# toarray() allocates a dense reviews x vocabulary matrix. Densify a float32
# copy rather than the matrix as returned, so that allocation is half the size;
# `features` itself is not modified here.
# NOTE(review): the only later use of features_array visible in this notebook
# is a float32 re-cast - consider dropping the dense copy altogether.
features_array = features.astype(np.float32).toarray()

In [None]:
# Store the TF-IDF values as float32, for both the sparse matrix and its dense copy.
features = features.astype(np.float32)
features_array = features_array.astype(np.float32)

In [None]:
# Target vector, plus a one-line summary of the feature matrix dimensions.
labels = data['label']
n_reviews, n_features = features.shape
print('%d reviews, %d feature' % (n_reviews, n_features))

In [None]:
# Model inputs: float32 feature matrix and int8 class labels.
X = features.astype(np.float32)
y = labels.astype(np.int8)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 10 stratified folds, shuffled with the notebook-wide SEED.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

In [None]:
# Per-fold scores of the random forest under stratified cross-validation.
acc_score = []
f1s_score = []
for train_idx, val_idx in skf.split(X, y):
    # X is indexed positionally as a matrix; y is a Series, hence .iloc.
    Xt, Xv = X[train_idx, :], X[val_idx, :]
    yt, yv = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh model per fold, seeded identically.
    model = RandomForestClassifier(
        criterion='entropy',
        random_state=SEED,
        n_estimators=500,
        max_features='sqrt',
        n_jobs=-1
    )

    model.fit(Xt, yt)
    y_pred = model.predict(Xv)
    # No predict_proba() call here: its result was never read by any metric,
    # so it only added a second pass over the forest for each fold.
    acc_score.append(accuracy_score(yv, y_pred))
    # Macro average: every class counts equally, whatever its size.
    f1s_score.append(f1_score(yv, y_pred, average='macro'))

# Average the fold scores into a one-row summary table.
acc_mean = np.mean(acc_score)
f1s_mean = np.mean(f1s_score)
df_result = pd.DataFrame({
    'Accuracy': [acc_mean],
    'F1 Score': [f1s_mean],
})

In [None]:
# Mean accuracy and mean macro-F1 across the folds.
df_result