In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the tab-separated train and test files from the working directory.
train_data, test_data = (
    pd.read_csv(path, sep="\t") for path in ("train.tsv", "test.tsv")
)

In [4]:
# Show the first rows of both frames, train first, one frame after the other.
print(train_data.head(), test_data.head(), sep="\n")

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
   PhraseId  SentenceId                                             Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routine effort
4    156065        8545         intermittently pleasing but mostly

In [5]:
# (rows, columns) of the train frame followed by the test frame.
print(train_data.shape, test_data.shape)

(156060, 4) (66292, 3)


In [6]:
import nltk
from nltk.tokenize import word_tokenize
import random
import pickle
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re

# Module-level WordNet lemmatizer instance; process_sentences reads it as a global.
lemmatizer = WordNetLemmatizer()

In [7]:
def process_sentences(df):
    """Tokenize and lemmatize every entry of ``df['Phrase']``.

    For each phrase, every run of non-word characters that is followed by
    whitespace or by the end of the phrase is replaced with a space, the
    result is split with ``word_tokenize`` and each token is passed through
    the module-level ``lemmatizer``.

    Parameters
    ----------
    df : object supporting ``df['Phrase']`` (e.g. a pandas DataFrame)
        Iterating ``df['Phrase']`` must yield one phrase per row.

    Returns
    -------
    list of list of str
        One token list per row, in row order. A row whose phrase is not a
        string (for example a missing value) yields an empty token list so
        the output stays aligned with the rows of ``df``.
    """
    sentences = []

    for sent in tqdm(df['Phrase']):
        # re.sub raises TypeError on non-string cells (e.g. NaN for a missing
        # phrase); treat those as empty text instead of aborting the whole run.
        text = sent if isinstance(sent, str) else ''

        # Strip punctuation runs that stand apart from words. The lookahead
        # also accepts end-of-string, so a trailing " ." is removed too
        # (a pattern requiring a literal trailing space would miss it).
        replaced = re.sub(r'\W+(?=\s|$)', ' ', text)

        # tokenize words
        words = word_tokenize(replaced)

        # lemmatize words
        sentences.append([lemmatizer.lemmatize(word) for word in words])

    return sentences

        

In [None]:
# Preprocess the training phrases into one list of lemmatized tokens per row.
sentences = process_sentences(train_data)

 81%|██████████████████████████████████████████████████████████              | 125956/156060 [00:17<00:04, 7134.24it/s]

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer sentiment labels.
train_y = to_categorical(train_data['Sentiment'].values)
print(train_y)

# Run the test phrases through the same preprocessing as the training phrases.
test_sentences = process_sentences(test_data)

In [None]:
# Number of columns in the one-hot label matrix, used as the output layer width.
num_classes = train_y.shape[1]

In [None]:
def encode_words(sentences):
    """Collect the distinct lower-cased tokens found in ``sentences``.

    ``sentences`` is an iterable of token sequences. The result is a list
    holding each unique lower-cased token once, in arbitrary (set) order.
    """
    vocabulary = set()
    for tokens in tqdm(sentences):
        vocabulary.update(token.lower() for token in tokens)
    return list(vocabulary)

# Distinct lower-cased tokens of the training phrases; only its length is used below.
unique_words = encode_words(sentences)

In [None]:
# Token count of the longest training phrase; reused as the padded sequence length.
max_sentence_length = max(map(len, sentences))
print(max_sentence_length)

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer on the tokenized training phrases. num_words is set to
# the number of distinct lower-cased training tokens held in unique_words.
tokenizer = Tokenizer(num_words = len(unique_words))
tokenizer.fit_on_texts(sentences)

In [None]:
# Convert each token list to integer ids with the fitted tokenizer, then bring
# every sequence to max_sentence_length with pad_sequences.
train_set, test_set = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=max_sentence_length)
    for corpus in (sentences, test_sentences)
)

print(len(train_set), len(test_set))

In [None]:
def split_train_test(train_set, classification, test_size = 0.2, seed=None):
    """Shuffle the examples and split them into training and validation parts.

    Parameters
    ----------
    train_set : array-like
        Examples, indexed along the first axis (one row per example).
    classification : array-like
        Labels, one entry per example and in the same order as ``train_set``.
    test_size : float, default 0.2
        Fraction of the examples placed in the validation part; the count is
        ``int(test_size * n_examples)``.
    seed : hashable, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When omitted, the module-level ``random``
        generator is used.

    Returns
    -------
    X_train, X_val, y_train, y_val : numpy.ndarray
        Shuffled examples and their labels; rows of X and y stay aligned.

    Raises
    ------
    ValueError
        If ``train_set`` and ``classification`` differ in length.
    """
    features = np.asarray(train_set)
    labels = np.asarray(classification)
    if len(features) != len(labels):
        raise ValueError(
            "train_set and classification must have the same length, got "
            "%d and %d" % (len(features), len(labels))
        )

    # Shuffle row indices instead of [example, label] pairs: stacking pairs of
    # different-length arrays with np.array builds a ragged array, which
    # NumPy >= 1.24 rejects with ValueError.
    order = list(range(len(features)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(order)

    # The first `split` shuffled rows form the validation part, the rest train.
    split = int(test_size * len(order))
    val_idx = np.asarray(order[:split], dtype=np.intp)
    train_idx = np.asarray(order[split:], dtype=np.intp)

    return features[train_idx], features[val_idx], labels[train_idx], labels[val_idx]
        

In [None]:
# Shuffle and split the padded training sequences and one-hot labels using the
# function's default test_size.
X_train, X_val, y_train, y_val = split_train_test(train_set, train_y)

In [None]:
# Sanity check: shapes of the training and validation arrays.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

In [None]:
# Serialize the four split arrays, in this order, to sentiment_set.pickle.
with open('sentiment_set.pickle','wb') as f:
    pickle.dump([X_train,X_val,y_train,y_val],f)

In [None]:
# Sequence classifier: embedding -> two stacked LSTMs -> dense head with a
# softmax over num_classes outputs.
model = Sequential()
model.add(Embedding(len(unique_words), 300, input_length=max_sentence_length))

# NOTE(review): relu is not the Keras default LSTM activation (tanh) --
# confirm this choice is intentional.
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(128, activation="relu", return_sequences=True))

model.add(LSTM(64, activation="relu", return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(num_classes, activation="softmax"))

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# releases no longer accept.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

model.summary()

In [None]:
# Train for 2 epochs with batches of 256 examples, passing the held-out split
# as validation data; the returned History object is kept in `history`.
history = model.fit(X_train, y_train, epochs=2, batch_size=256,
                    validation_data=(X_val, y_val))