In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data Load and Check

In [None]:
# Load the competition files: the zipped, tab-separated train/test phrase
# tables and the sample submission CSV.
train = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/train.tsv.zip', sep='\t')
test = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep='\t')
sub = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')

In [None]:
# Preview the first rows of the training table.
train.head(n=5)

In [None]:
# Class balance: number of training rows per sentiment label, most frequent first.
train['Sentiment'].value_counts(sort=True)

In [None]:
# Preview the sample submission to see the expected output columns.
sub.head(n=5)

# 2.1 Combining Data

In [None]:
# Stack train on top of test so the tokenizer and padding below see every phrase.
# Train rows come first, which is what the later positional split relies on.
all_data = pd.concat([train, test], axis=0)

all_data.head(n=5)

# 2.2 Phrase Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on every phrase (train and test combined).
tk = Tokenizer()
tk.fit_on_texts(all_data['Phrase'])

# Vocabulary size: number of distinct words the tokenizer indexed.
len(tk.word_index)

In [None]:
# word indexing
for i, word in enumerate(tk.word_index.items()):
    if i > 5:
        break
        
    print(word)

In [None]:
# word counting
for i, word in enumerate(tk.word_counts.items()):
    if i > 5:
        break
        
    print(word)

In [None]:
# Convert every phrase into its sequence of vocabulary indices.
all_text = tk.texts_to_sequences(all_data['Phrase'])

# Sanity check: first phrase as text, then as an index sequence.
print(all_data['Phrase'].iloc[0])
print(all_text[0])

# 2.3 Padding Sequences

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Pad every index sequence to a common length so they form one 2-D array.
# 'pre' is the library default: padding is added at the front of each sequence.
all_pad = pad_sequences(all_text, padding='pre')

all_pad.shape

In [None]:
# Inspect the first padded sequence.
all_pad[0, :]

# 2.4 Separating Data 

In [None]:
# Undo the earlier concatenation: the first len(train) rows are the training
# phrases, the remainder are the test phrases.
n_train = len(train)
train2, test2 = all_pad[:n_train], all_pad[n_train:]

print(train2.shape)
print(test2.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded training sequences for validation; the fixed
# random_state keeps the split the same across runs.
train_x, val_x, train_y, val_y = train_test_split(
    train2,
    train['Sentiment'],
    test_size=0.2,
    random_state=42,
)

print(train_x.shape, train_y.shape)
print(val_x.shape, val_y.shape)

# 3. Modeling

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN

# Baseline classifier: learned word embeddings -> SimpleRNN -> 5-way softmax.
model = Sequential()
# Vocabulary size is len(word_index) + 1 because Tokenizer indices start at 1
# and index 0 is the padding value. The sequence length is read from the
# padded matrix instead of being hard-coded, so it always matches the data.
model.add(Embedding(len(tk.word_index) + 1, 300, input_length=all_pad.shape[1]))
model.add(SimpleRNN(10))
model.add(Dense(5, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(metrics=['acc'], loss='sparse_categorical_crossentropy', optimizer='adam')

model.summary()

# 4. Model Training

In [None]:
# Train the RNN baseline for two epochs, scoring the held-out split after each
# epoch, then predict class probabilities for the padded test sequences.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    epochs=2,
    batch_size=512,
)
result = model.predict(test2)

In [None]:
# Softmax outputs for the first five test phrases.
result[0:5]

In [None]:
# Predicted class (highest-probability column) for the first five test phrases.
result.argmax(axis=1)[:5]

# 5. Submission

In [None]:
# Submission from the SimpleRNN baseline, left disabled: the notebook's active
# submission cell writes the `result2` predictions to submission.csv instead.
#sub['Sentiment'] = result.argmax(1)
#sub.to_csv('sent.csv', index=False)

------------

# 3A. Modeling - SentenceBERT

In [None]:
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence encoder used for the rest of this section.
sbert = SentenceTransformer('sentence-transformers/sentence-t5-base')

# Smoke test: encode two example sentences and check the embedding shape.
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = sbert.encode(sentences)

print(embeddings.shape)

In [None]:
# Embed every training phrase with the sentence encoder.
# encode() is documented for a string or a list of strings, so pass a plain
# list rather than a pandas Series. This avoids depending on how integer
# lookups on the Series index happen to resolve inside the library.
encoded_phrases = sbert.encode(train['Phrase'].tolist())

print(encoded_phrases.shape)

In [None]:
# Embed every test phrase with the sentence encoder.
# encode() is documented for a string or a list of strings, so pass a plain
# list rather than a pandas Series. This avoids depending on how integer
# lookups on the Series index happen to resolve inside the library.
encoded_phrases_test = sbert.encode(test['Phrase'].tolist())

print(encoded_phrases_test.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split as before, now over the sentence embeddings.
# Note: this rebinds train_x / val_x / train_y / val_y, which previously held
# the split of the padded index sequences.
train_x, val_x, train_y, val_y = train_test_split(
    encoded_phrases,
    train['Sentiment'],
    test_size=0.2,
    random_state=42,
)

print(train_x.shape, train_y.shape)
print(val_x.shape, val_y.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Feed-forward classifier on top of the fixed sentence embeddings:
# four ReLU layers (dropout after the first two) and a 5-way softmax.
model2 = Sequential([
    # Input width is read from the embedding matrix rather than hard-coded,
    # so it stays correct if a different encoder is used.
    Dense(512, input_shape=(encoded_phrases.shape[1],), activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax'),
])

# `metrics` is passed as a list (not a bare string), matching the compile call
# of the RNN model above and the form accepted by every Keras version.
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

model2.summary()

In [None]:
# Train the dense classifier on the embedding split, then predict class
# probabilities for the embedded test phrases.
history2 = model2.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    epochs=30,
    batch_size=1024,
)
result2 = model2.predict(encoded_phrases_test)

In [None]:
# Fill the sample submission with the most probable class per test phrase and
# write it out without the DataFrame index.
sub['Sentiment'] = result2.argmax(axis=1)
sub.to_csv('submission.csv', index=False)