In [2]:
import pandas as pd
import numpy as np

In [3]:
train_data = pd.read_csv("train.tsv", sep="\t")
test_data = pd.read_csv("test.tsv", sep="\t")

In [4]:
print(train_data.head())
print(test_data.head())

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
   PhraseId  SentenceId                                             Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routine effort
4    156065        8545         intermittently pleasing but mostly

In [5]:
print(train_data.shape, test_data.shape)

(156060, 4) (66292, 3)


In [6]:
import nltk
from nltk.tokenize import word_tokenize
import random
import pickle
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re

lemmatizer = WordNetLemmatizer()

In [7]:
def process_sentences(df):
    
    sentences = []
    
    for sent in tqdm(df['Phrase']):
        
        #remove non alphanumeric characters
        replaced = re.sub(r'\W+ ', ' ', sent)
        
        #tokenize words
        words = word_tokenize(replaced)
        
        #lemmatize words
        lexicon = [lemmatizer.lemmatize(i) for i in words]
        
        sentences.append(lexicon)
        
    return sentences

        

In [8]:
sentences = process_sentences(train_data)

100%|████████████████████████████████████████████████████████████████████████| 156060/156060 [00:21<00:00, 7100.44it/s]


In [9]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

train_y = train_data['Sentiment'].values
train_y = to_categorical(train_y)
print(train_y)

test_sentences = process_sentences(test_data)

[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]


100%|██████████████████████████████████████████████████████████████████████████| 66292/66292 [00:09<00:00, 6633.35it/s]


In [10]:
num_classes = train_y.shape[1]

In [11]:
def encode_words(sentences):
    all_words = []
    for words in tqdm(sentences):
        for word in words:
            all_words.append(word.lower())
    all_words = set(all_words)
    all_words = list(all_words)
    return all_words

unique_words = encode_words(sentences)

100%|██████████████████████████████████████████████████████████████████████| 156060/156060 [00:00<00:00, 289242.15it/s]


In [12]:
max_sentence_length = max([len(i) for i in sentences])
print(max_sentence_length)

49


In [13]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

In [14]:
tokenizer = Tokenizer(num_words = len(unique_words))
tokenizer.fit_on_texts(sentences)

In [15]:
train_set = tokenizer.texts_to_sequences(sentences)
test_set = tokenizer.texts_to_sequences(test_sentences)

train_set = sequence.pad_sequences(train_set, maxlen=max_sentence_length)
test_set = sequence.pad_sequences(test_set, maxlen=max_sentence_length)

print(len(train_set), len(test_set))

156060 66292


In [16]:
def split_train_test(train_set, classification, test_size = 0.2):
    train = []
    for example, sentiment in tqdm(zip(train_set, classification)):
        train.append([(example), (sentiment)])
    random.shuffle(train)
    split = int(test_size*len(train))
    train_set = np.array(train[split:])
    val_set = np.array(train[:split])
    print(train_set[0])
    X_train = list(train_set[:,0])
    y_train = list(train_set[:,1])
    X_val = list(val_set[:,0])
    y_val = list(val_set[:,1])
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)    
        

In [17]:
X_train, X_val, y_train, y_val = split_train_test(train_set, train_y)

156060it [00:00, 841275.84it/s]


[array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    3,  570,    4, 1134])
 array([0., 0., 1., 0., 0.], dtype=float32)]


In [18]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(124848, 49) (124848, 5) (31212, 49) (31212, 5)


In [19]:
with open('sentiment_set.pickle','wb') as f:
    pickle.dump([X_train,X_val,y_train,y_val],f)

In [20]:
model = Sequential()
model.add(Embedding(len(unique_words), 300, input_length=max_sentence_length))

model.add(LSTM(128, activation="relu", return_sequences=True))

model.add(LSTM(64, activation="relu", return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(num_classes, activation="softmax"))

model.compile(loss='categorical_crossentropy',
             optimizer=Adam(lr=0.001),
             metrics=['accuracy'])



model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 49, 300)           4629300   
_________________________________________________________________
lstm (LSTM)                  (None, 49, 128)           219648    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1

In [21]:
history = model.fit(X_train, y_train, epochs=2, batch_size=256,
                    validation_data=(X_val, y_val))

W0731 23:54:02.826111 13132 deprecation.py:323] From c:\users\user\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 124848 samples, validate on 31212 samples
Epoch 1/2






KeyboardInterrupt: 