In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Labeled reviews: tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so quote characters inside the reviews are kept as literal text.
train = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# Unlabeled reviews to predict on, read with the same options as the training file
# (tab-separated, header row, quoting=3 i.e. csv.QUOTE_NONE).
test = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip',
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the raw text of the first training review.
train['review'][0]

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Report the shape of the training frame, then of the test frame.
for frame in (train, test):
    print(frame.shape)

## EDA

In [None]:
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk 
import warnings 
# Suppress every warning raised from this point on.
warnings.filterwarnings(action='ignore')
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
# Apply the 'ggplot' style sheet to the matplotlib figures drawn below.
plt.style.use('ggplot')

# Download the NLTK stopword corpus and keep the English entries as a set.
nltk.download('stopwords')
stop=set(stopwords.words('english'))

In [None]:
# Count how many reviews carry each sentiment label, displayed as a frame.
train['sentiment'].value_counts().to_frame()

In [None]:
# Print a summary of the training frame's columns.
train.info()

### stopwords 분포

In [None]:
def plot_top_stopwords_barchart(text):
    """Draw a bar chart of the ten most frequent English stopwords in `text`.

    Parameters
    ----------
    text : pandas.Series of str
        One document per element; each document is split on whitespace.

    Notes
    -----
    A token counts only if it is an exact (case-sensitive) member of the NLTK
    English stopword set. Nothing is drawn when no token matches.
    """
    from collections import Counter

    stop = set(stopwords.words('english'))

    # Tally only the whitespace-separated tokens that are stopwords.
    counts = Counter(
        word
        for tokens in text.str.split()
        for word in tokens
        if word in stop
    )
    if not counts:
        # Without this guard, zip(*[]) yields nothing to unpack into x, y
        # and raises ValueError.
        return

    x, y = zip(*counts.most_common(10))
    plt.bar(x, y)


plot_top_stopwords_barchart(train['review'])

### corpus 말뭉치

In [None]:
from nltk.corpus import stopwords
import seaborn as sns
from collections import Counter

def plot_top_non_stopwords_barchart(text, top_n=40):
    """Draw a horizontal bar chart of frequent tokens that are not stopwords.

    The `top_n` most frequent whitespace-separated tokens are taken first, and
    the English stopwords among them are then dropped, so the chart can show
    fewer than `top_n` bars.

    Parameters
    ----------
    text : pandas.Series of str
        One document per element.
    top_n : int, default 40
        How many of the most frequent tokens to consider before filtering.

    Notes
    -----
    Stopword matching is exact (case-sensitive). Nothing is drawn when every
    one of the `top_n` tokens is a stopword.
    """
    stop = set(stopwords.words('english'))

    counter = Counter(word for tokens in text.str.split() for word in tokens)

    x, y = [], []
    # most_common(top_n) selects only the top entries instead of sorting the
    # whole vocabulary.
    for word, count in counter.most_common(top_n):
        if word not in stop:
            x.append(word)
            y.append(count)

    if not x:
        return

    # Counts on the horizontal axis, tokens on the vertical axis.
    sns.barplot(x=y, y=x)


plot_top_non_stopwords_barchart(train['review'])

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

# Side-by-side charts of the frequent non-stopword tokens, one panel per
# sentiment label; each panel is titled with its label.
plt.figure(figsize=(12, 5))
for i in (0, 1):
    plt.subplot(1, 2, i + 1)
    reviews_for_label = train.loc[train['sentiment'] == i, 'review']
    plot_top_non_stopwords_barchart(reviews_for_label)
    plt.title(i)
plt.tight_layout()
plt.show()

### Wordcloud

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
import pyLDAvis.gensim

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud,STOPWORDS
import nltk

# Download the NLTK stopword corpus and keep the English entries as a set;
# plot_wordcloud below reads this module-level `stop`.
nltk.download('stopwords')
stop=set(stopwords.words('english'))

def plot_wordcloud(text):
    """Build a word cloud from an iterable of documents.

    Each document is tokenized with `word_tokenize`. Tokens that are in the
    module-level `stop` set or shorter than three characters are dropped, and
    the rest are lemmatized with `WordNetLemmatizer`.

    Parameters
    ----------
    text : iterable of str
        The documents to summarize.

    Returns
    -------
    wordcloud.WordCloud
        The generated cloud (it can be passed to `imshow`).
    """
    lemmatizer = WordNetLemmatizer()

    def _preprocess_text(documents):
        """Return one list of filtered, lemmatized tokens per document."""
        return [
            [
                lemmatizer.lemmatize(token)
                for token in word_tokenize(document)
                if token not in stop and len(token) > 2
            ]
            for document in documents
        ]

    corpus = _preprocess_text(text)

    cloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1
    )
    # Feed plain space-separated tokens. str(corpus) would pass the list repr,
    # so its quotes, commas and brackets would end up in the text that
    # WordCloud tokenizes.
    joined_tokens = " ".join(token for tokens in corpus for token in tokens)
    return cloud.generate(joined_tokens)

In [None]:
import nltk
# Download the 'punkt' NLTK resource.
# NOTE(review): presumably needed by word_tokenize in plot_wordcloud — confirm.
nltk.download('punkt')

In [None]:
import nltk
# Download the 'wordnet' NLTK resource.
# NOTE(review): presumably needed by WordNetLemmatizer in plot_wordcloud — confirm.
nltk.download('wordnet')

In [None]:
# One word cloud per sentiment label, stacked in a rows x cols grid.
rows, cols = 2, 1
fig = plt.figure(figsize=(20, 20))

for i in (0, 1):
    wordcloud = plot_wordcloud(train.loc[train['sentiment'] == i, 'review'])
    # Create the (i+1)-th subplot of the rows x cols grid.
    ax = fig.add_subplot(rows, cols, i + 1)

    ax.axis('off')
    ax.set_title(i)
    ax.imshow(wordcloud)

## 데이터 전처리

In [None]:
# Download the NLTK stopword corpus (already requested by earlier cells).
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words("english")
# Set copy for constant-time membership tests inside clean(). `stop_words`
# itself stays a list in case other cells rely on it.
_STOP_WORD_SET = frozenset(stop_words)
# Compiled once instead of on every call to clean().
_NON_LETTERS = re.compile("[^a-zA-Z]")


def clean(review):
    """Normalize one review to lower-case words with stopwords removed.

    Every character outside a-z / A-Z is replaced by a space. The text is then
    lower-cased and split on whitespace, and words found in the English
    stopword list are dropped.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces (an empty string if none
        remain).
    """
    letters_only = _NON_LETTERS.sub(" ", review)
    return " ".join(
        word for word in letters_only.lower().split() if word not in _STOP_WORD_SET
    )


train["cleaned_review"] = train["review"].apply(clean)
# Show a preview rather than the whole frame.
train.head()

In [None]:
# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit a tokenizer on the cleaned reviews and display the size of its word
# index plus one.
# NOTE: `tokenizer` is re-created and re-fitted in a later cell, so this
# instance is only used here, to compute `total_words`.
tokenizer=Tokenizer(num_words=25000)
tokenizer.fit_on_texts(train['cleaned_review'])
total_words=len(tokenizer.word_index)+1
total_words

In [None]:
# Positional split: the first 80% of the labeled rows are used for training
# and the remaining rows are held out. The rows are not shuffled here.
split_size = int(len(train) * 0.8)

# NOTE: these names do not follow the usual X/y convention:
#   X_train -> review texts of the first part    X_test -> their sentiment labels
#   y_train -> review texts of the held-out part y_test -> their sentiment labels
X_train, X_test = train['review'].iloc[:split_size], train['sentiment'].iloc[:split_size]
y_train, y_test = train['review'].iloc[split_size:], train['sentiment'].iloc[split_size:]

In [None]:
# Hyperparameters for the vocabulary, the embedding and the sequence length.
vocab_size=10000
embedding_dim=32
max_length=200
# NOTE: trunc_type is not passed to the pad_sequences calls below, so over-long
# reviews are truncated with the pad_sequences default. It is left unwired so
# that padding stays identical wherever pad_sequences is called with only
# `maxlen`.
trunc_type='post'
oov_tok="<OOV>"

# Pass the declared OOV token so words outside the fitted vocabulary map to a
# dedicated index instead of being silently dropped from the sequences.
tokenizer=Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# X_train holds the training review texts.
train_sequences=tokenizer.texts_to_sequences(X_train)
padded_train=pad_sequences(train_sequences,maxlen=max_length)

# y_train holds the held-out review texts (not labels), so these are the
# validation inputs.
test_sequences=tokenizer.texts_to_sequences(y_train)
padded_test=pad_sequences(test_sequences,maxlen=max_length)



In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as validation accuracy reaches a threshold.

    Parameters
    ----------
    accuracy : float, default 0.85
        Training stops at the end of the first epoch whose `val_accuracy`
        is greater than or equal to this value.
    """

    def __init__(self, accuracy=0.85):
        # Let the base Callback set up its own attributes first.
        super().__init__()
        self.accuracy = accuracy

    def on_epoch_end(self, epoch, logs=None):
        """Check `val_accuracy` in `logs` and request a stop once it is reached."""
        val_accuracy = (logs or {}).get('val_accuracy')
        if val_accuracy is None:
            # No validation metric was reported for this epoch (for example
            # when fit() ran without validation data); comparing None with a
            # float would raise TypeError.
            return
        if val_accuracy >= self.accuracy:
            print(f"\nReached {self.accuracy} accuracy so cancelling training!")
            self.model.stop_training = True

In [None]:
# Building the Neural Network
# Building the Neural Network
# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(32, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

# Stops training once val_accuracy reaches the callback's default threshold.
callbacks = CustomCallback()

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Despite their names, X_test holds the training labels and
# (padded_test, y_test) is the held-out validation pair.
model.fit(
    padded_train,
    X_test,
    epochs=50,
    callbacks=[callbacks],
    validation_data=(padded_test, y_test),
)

In [None]:
# Evaluate on the held-out split. Index 1 of the result is the 'accuracy'
# metric requested in model.compile (index 0 is the loss).
accuracy_score = model.evaluate(padded_test, y_test, verbose=0)[1]
print(f"Accuracy Score: {round(accuracy_score*100, 2)}%")

In [None]:
# Tokenize and pad the unlabeled test reviews with the already-fitted tokenizer.
# Dedicated names are used so that `test_sequences` and `padded_test` (the
# held-out validation arrays) are not overwritten; the evaluation cell can then
# still be re-run after this one.
submission_sequences = tokenizer.texts_to_sequences(test['review'])
padded_submission = pad_sequences(submission_sequences, maxlen=max_length)

# The model ends in a single sigmoid unit, so predict() gives one probability
# per review. Threshold at 0.5 in one vectorized step to get 0/1 labels.
predicted_probabilities = model.predict(padded_submission)
sentiment_predictions = (predicted_probabilities.ravel() > 0.5).astype(int).tolist()
pd.DataFrame({'Predictions':sentiment_predictions}).head(10)

In [None]:
# Submitting the results
# Submitting the results
# Pair each test id with its predicted label and write the submission file.
# quoting=3 is csv.QUOTE_NONE, so values are written without added quotes.
submission_columns = {"id": test["id"], "sentiment": sentiment_predictions}
output = pd.DataFrame(data=submission_columns)

output.to_csv("submission.csv", index=False, quoting=3)