In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Path of the review dataset that the loading cell passes to pd.read_csv.
# It stays '' when /kaggle/input contains no files at all.
importdata = ''
fallback_path = ''  # last file visited; used only when no CSV is found

# os.walk also visits the word-list files (positive.txt / nega.txt) that a later
# cell reads from /kaggle/input, and its visiting order is not guaranteed, so
# "the last file walked" is not necessarily the review data.
# NOTE(review): assumes the review dataset is the only .csv under /kaggle/input — confirm.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        found_path = os.path.join(dirname, filename)
        print(found_path)
        fallback_path = found_path
        if filename.lower().endswith('.csv'):
            importdata = found_path

# Keep the original behaviour (last file seen) when there is no CSV to prefer.
if not importdata:
    importdata = fallback_path

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import Counter, deque
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Import Data

In [None]:
# Read the file located by the path-discovery cell into a DataFrame.
data = pd.read_csv(filepath_or_buffer=importdata)
# A bare expression on the last line makes the notebook render the frame.
data

# Data Analysis

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.
data.info()  # dtypes and non-null counts per column (reveals NaN values)
print(data.describe())  # summary statistics of the columns
pd.set_option('display.max_columns', 6)  # let pandas show up to 6 columns without truncating
print(data.head())
# Number of rows for each distinct bottle name.
Bottle = data.groupby(['Bottle_name']).size()
print(Bottle)

# Data Visualization

In [None]:
def plot_brand_bottles(frame, brand, ax, color=None):
    """Draw a bar chart of the ``Bottle_name`` counts for a single brand.

    Parameters
    ----------
    frame : DataFrame holding 'Brand' and 'Bottle_name' columns.
    brand : value of the 'Brand' column whose rows are plotted.
    ax : matplotlib Axes that receives the chart.
    color : optional bar colour; the plotting default is used when None.
    """
    # isin() selects every row whose brand is in the given list; pass more
    # names in the list to plot several brands together.
    brand_rows = frame.loc[frame['Brand'].isin([brand])]
    plot_options = {'kind': 'bar', 'ax': ax}
    if color is not None:
        plot_options['color'] = color
    brand_rows['Bottle_name'].value_counts().plot(**plot_options)
    ax.set_title(f"{brand} and Bottle_Name")


# Number of rows per brand.
data['Brand'].value_counts().plot(kind='barh')
plt.show()

# One figure per pair of brands, drawn side by side. The third title used to
# read "Kikka" although the plotted brand is "Nikka"; deriving the title from
# the brand name keeps the two in sync.
brand_figures = (
    (("Yamazaki", None), ("Hibiki", 'green')),
    (("Nikka", 'orange'), ("Hakushu", 'red')),
)
for brand_pair in brand_figures:
    fig, axes = plt.subplots(1, 2)
    for ax, (brand, color) in zip(axes, brand_pair):
        plot_brand_bottles(data, brand, ax, color)
    plt.show()

# Cleaning the Texts

In [None]:
# The stemmer and the stop-word set do not change between reviews, so they are
# built once instead of on every loop iteration.
ps = PorterStemmer()  # e.g. "loved" -> "love"
stop = set(stopwords.words('english'))
# Reviews are lower-cased before filtering, so the extra stop words must be
# lower-case as well ('Yamazaki' with a capital Y could never match a token).
stop.update(('yamazaki', 'hibiki', 'it', 'whisky', 'whiskey', 'bottle'))

corpus = []  # one cleaned, stemmed string per review
for text in data['Review_Content']:
    # Replace everything except letters, digits and '_' with a space.
    review = re.sub('[^a-zA-Z0-9_]', ' ', text)
    review = review.lower()
    words = word_tokenize(review)
    review = [ps.stem(word) for word in words if word not in stop]
    corpus.append(' '.join(review))

# Frequency of every stemmed word over the whole corpus.
count = dict(Counter(word for sentence in corpus for word in sentence.split()))
df = list(count.items())  # list of (word, frequency) pairs

array = np.array(df)  # 2-D array with one (word, frequency) row per word
print(array)

# Create Positive, Negative, Neutral word list

In [None]:
sid = SentimentIntensityAnalyzer()
pos_word_list = []
neu_word_list = []
neg_word_list = []

# Sort every word of the corpus vocabulary into one of three buckets by its
# VADER compound score.
# NOTE(review): the cut-offs are +/-0.5; the VADER authors suggest +/-0.05 for
# classifying text — confirm the stricter value is intentional.
for word in count:
    compound = sid.polarity_scores(word)['compound']  # score each word only once
    if compound >= 0.5:
        pos_word_list.append(word)
    elif compound <= -0.5:
        neg_word_list.append(word)
    else:
        neu_word_list.append(word)

print('Positive :', pos_word_list)
print('Neutral :', neu_word_list)
print('Negative :', neg_word_list)

# NLP

In [None]:
def prev_and_next(input_list):
    """Pair every element with its left and right neighbour.

    Parameters
    ----------
    input_list : iterable of items (e.g. the tokens of one review).

    Returns
    -------
    zip
        Iterator of ``(previous, current, next)`` tuples, one per element, in
        the original order. ``previous`` is None for the first element and
        ``next`` is None for the last one.

    Notes
    -----
    The earlier deque-based version rotated in the wrong direction
    (``rotate(-1)`` yields the *following* element), so the tuples came out as
    ``(next, current, previous)``. The rotation also wrapped around, making the
    last element the "neighbour" of the first one.
    """
    items = list(input_list)
    previous_items = [None] + items[:-1]
    next_items = items[1:] + [None]
    return zip(previous_items, items, next_items)

# Adding a column ('Sentiment') for NLP

In [None]:
# Word lists used to label the reviews.
# NOTE(review): read_csv treats the first line of each file as a header, so that
# line becomes the column name and is not part of pos/neg — confirm the files
# really start with a header line.
positiveList = pd.read_csv('/kaggle/input/positive-words/positive.txt')
negativelist = pd.read_csv('/kaggle/input/negative-words/nega.txt')

pos = positiveList.iloc[:, 0].values  # positive words, e.g. 'awesome', 'good'
neg = negativelist.iloc[:, 0].values  # negative words, e.g. 'bad', 'hate'
# Sets give constant-time membership tests inside the per-token loops below.
pos_lookup = set(pos)
neg_lookup = set(neg)

lemmatizer = WordNetLemmatizer()  # e.g. plural -> singular ("feet" -> "foot")

ps = PorterStemmer()  # e.g. "loved" -> "love"
stop = set(stopwords.words('english'))
stop.remove('not')  # keep 'not' so that "not like" can be detected
# Stemmed word lists, comparable with the stemmed review tokens.
stempos = [ps.stem(word) for word in pos if word not in stop]
stemnega = [ps.stem(word) for word in neg if word not in stop]
stempos_lookup = set(stempos)

corpus = []      # one lemmatized + stemmed string per review
sentiments = []  # one label per review: 1 = positive, 0 = negative / undecided
for text in data['Review_Content']:
    review = re.sub('[^a-zA-Z]', ' ', text)  # replace every non-letter with a space
    review = review.lower()
    words = word_tokenize(review)

    lemWords = [lemmatizer.lemmatize(w) for w in words]

    # The first sentiment word decides the label: a positive word gives 1, a
    # negative word (or no sentiment word at all) leaves it at 0.
    label = 0
    for itr in lemWords:
        if itr in pos_lookup:
            label = 1
            break
        if itr in neg_lookup:
            break

    review = [ps.stem(word) for word in lemWords if word not in stop]

    # A review that starts out positive is flipped to negative when it contains
    # "not" directly followed by a positive word ("I do not like it"). Each
    # token is paired with the token right after it, so the check really looks
    # at the word that follows "not".
    if label == 1:
        for current, following in zip(review, review[1:]):
            if current == 'not' and following in stempos_lookup:
                label = 0
                break

    sentiments.append(label)
    corpus.append(' '.join(review))

# Assigning the whole column at once works for any DataFrame index; it does not
# rely on row labels running from 0 to n-1.
data['Sentiment'] = sentiments

sentiment = data['Sentiment'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
print(sentiment)

# Data Analysis for NLP

In [None]:
# Tally how often every token occurs across the labelled corpus.
word_totals = Counter()
for sentence in corpus:
    word_totals.update(sentence.split())
count = dict(word_totals)
df = list(count.items())  # (word, frequency) pairs

array = np.array(df)  # 2-D array built from the pairs
frequency_column = array[:, -1]  # last column holds the frequencies
array1d = np.array(frequency_column, dtype='i')  # 'i' = C int

# Summary statistics of the word frequencies.
print(np.median(array1d))
print(np.mean(array1d))
print(np.std(array1d))
# Frequencies lying strictly between the 10th and the 90th percentile.
lower_bound = np.quantile(array1d, 0.1)
upper_bound = np.quantile(array1d, 0.9)
inside_band = (array1d > lower_bound) & (array1d < upper_bound)
print(array1d[inside_band].tolist())

# Box plot of the frequencies to make outliers visible.
plt.boxplot(array1d)
plt.show()

# Creating the List of Words to Remove Based on the Steps Above

In [None]:
# Words that occur more than 50 times and are neither positive nor negative
# sentiment words.
removewords = [
    word for word in count
    if count[word] > 50 and word not in stempos and word not in stemnega
]

# Drop every review whose whole cleaned text is exactly one of those words, and
# drop the matching row of `data` so texts and labels stay aligned.
# The earlier loop looked positions up with corpus.index() *after* previous
# corpus.remove() calls had shifted them, and removed only the first duplicate,
# so the rows dropped from `data` could differ from the reviews removed.
# NOTE(review): if the intent was to strip these words from *inside* every
# review rather than to drop single-word reviews, this cell needs a redesign.
assert len(corpus) == len(data), "corpus and data must have one entry per review"
remove_lookup = set(removewords)
keep_flags = [review not in remove_lookup for review in corpus]
removeIndex = [label for label, keep in zip(data.index, keep_flags) if not keep]
corpus = [review for review, keep in zip(corpus, keep_flags) if keep]
data = data.drop(removeIndex)

# Creating the Bag of Words model

In [None]:
# First pass: learn the full vocabulary only to find out how many distinct
# tokens the corpus contains (no dense matrix is needed for that).
cv = CountVectorizer()
cv.fit(corpus)
vocabulary_size = len(cv.vocabulary_)

# Labels, selected by name instead of relying on 'Sentiment' being the last column.
y = data['Sentiment'].values

# Keep the 90% most frequent tokens. The earlier expression
# int(len(x[0] * 0.9)) scaled the array *values* rather than its length, so it
# always equalled the full vocabulary size and nothing was filtered.
# max_features must be at least 1, hence the lower bound.
max_words = max(1, int(vocabulary_size * 0.9))
cv = CountVectorizer(max_features=max_words)
x = cv.fit_transform(corpus).toarray()

# Applying NLP

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)

# Fit a Gaussian Naive Bayes classifier on the training part.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predict the held-out samples and print predictions next to the true labels
# (left column: predicted, right column: actual).
y_pred = classifier.predict(x_test)
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.reshape(len(y_test), 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

# Confusion matrix and overall accuracy on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
ac = accuracy_score(y_test, y_pred)
print(ac)