In [1]:

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import re
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def preprocess_text(text):
    text= re.sub(r"^\s+|(@[A-Za-z]+)|([^A-Za-z \t])|(,\w+:\/\/\S+)"," ",text)
    text=" ".join(text.split())
    text= text.lower()
    lemmatizer = WordNetLemmatizer()
    split = text.split(' ')
    text = ' '.join([lemmatizer.lemmatize(w,'v') for w in split])    
    return text
    
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop]
    return ' '.join(no_stopword_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train = pd.read_table("/content/train.tsv")
print(train.columns)
print(train.shape)
test = pd.read_table("/content/test.tsv")
print(test.columns)

  """Entry point for launching an IPython kernel.


Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')
(156060, 4)
Index(['PhraseId', 'SentenceId', 'Phrase'], dtype='object')


  after removing the cwd from sys.path.


In [0]:
train['Sent'] = train['Phrase'].apply(lambda x: preprocess_text(x))
test['Sent'] = test['Phrase'].apply(lambda x: preprocess_text(x))

In [0]:
stop = set(stopwords.words('english'))
extra_stopwords = set(['none','high','pow','us','whatever','n','lrb','rrb','b'])
stop = stop.union(extra_stopwords)
train['Sent'] = train['Sent'].apply(lambda x: remove_stopwords(x))
test['Sent'] =  test['Sent'].apply(lambda x: remove_stopwords(x))

In [0]:
updated_labels=[]
for i in range(train.shape[0]):
  if train['Sentiment'][i]==0:
    # 1 for negative
    updated_labels.append(1)
  elif train['Sentiment'][i]==4:
    # 3 for positive
    updated_labels.append(3)
  else:
    # 2 for for neutral
    updated_labels.append(train['Sentiment'][i])
train['Sentiment']=updated_labels

In [0]:
for i in range(len(train['Sent'])):
  if train['Sent'][i]=='':
    train.drop([i],axis=0,inplace=True)
train.reset_index(inplace=True)
for i in range(len(test['Sent'])):
  if test['Sent'][i]=='':
    test.drop([i],axis=0,inplace=True)
test.reset_index(inplace=True)
    

In [8]:
from keras.preprocessing.text import Tokenizer
token=Tokenizer()
token.fit_on_texts(train['Sent'].values)
train['vectors']=token.texts_to_sequences(train['Sent'])
test['vectors']=token.texts_to_sequences(test['Sent'])
#print(train['vectors'][0:5])
from tensorflow.keras.preprocessing.sequence import pad_sequences
len_train = max([len(s.split()) for s in train['Sent']])
len_test = max([len(s.split()) for s in test['Sent']])
if len_train>len_test:
  max_length = len_train
else:
  max_length = len_test
train_vectors = pad_sequences(train['vectors'], max_length)
test_vectors = pad_sequences(test['vectors'], max_length)
#print(train_vectors.shape)


Using TensorFlow backend.


In [9]:
index_of_words = token.word_index
print(len(index_of_words))

12380


In [10]:
from keras.utils import to_categorical
target=train.Sentiment.values
labels=to_categorical(target-1)
num_classes=labels.shape[1]
print(labels)
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val= train_test_split(train_vectors,labels,test_size=0.2,random_state=4)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [11]:
f = open('glove.6B.100d.txt')
embedd_index = {}
for line in f:
    val = line.split()
    word = val[0]
    coff = np.asarray(val[1:],dtype = 'float')
    embedd_index[word] = coff

f.close()
print('Found %s word vectors.' % len(embedd_index))

Found 400000 word vectors.


In [0]:
embed_num_dims = 100
embedding_matrix = np.zeros((len(index_of_words) + 1, embed_num_dims))

tokens = []
labels = []

for word,i in index_of_words.items():
    temp = embedd_index.get(word)
    if temp is not None:
        embedding_matrix[i] = temp
        
#for plotting
        tokens.append(embedding_matrix[i])
        labels.append(word)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,GlobalAveragePooling1D
# LSTM
from tensorflow.keras.layers import LSTM

#EMBEDDING_DIM = 100
unknown = len(token.word_index)+1
lstm_model = Sequential()
#lstm_model.add(Embedding(unknown, EMBEDDING_DIM, input_length = max_length))
lstm_model.add(Embedding(len(index_of_words) + 1 , embed_num_dims , input_length = max_length , weights = [embedding_matrix]))
lstm_model.add(LSTM(52,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
lstm_model.add(GlobalAveragePooling1D())
lstm_model.add(Dense(3, activation = 'softmax'))
lstm_model.summary()

lstm_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 29, 100)           1238100   
_________________________________________________________________
lstm_1 (LSTM)                (None, 29, 52)            31824     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 52)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 159       
Total params: 1,270,083
Trainable params: 1,270,083
Non-trainable params: 0
_________________________________________________________________


In [15]:
history = lstm_model.fit(x_train,
                    y_train,
                    epochs = 5,
                    batch_size = 512,
                    validation_data = (x_val,y_val),
                    verbose = 1)


W0904 07:42:09.782897 140059901138816 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 123544 samples, validate on 30886 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
results = lstm_model.evaluate(x_train,y_train)
print("Training Accuracy",results[1])


Training Accuracy 0.82132286


In [0]:
predict_labels = lstm_model.predict_classes(train_vectors)


In [0]:
import csv
lstm_output = pd.DataFrame(columns=['Phrases','Actual','Predicted'])
sent=[]
actual_label=[]
predicted_label=[]
for i in range(train.shape[0]):
  sent.append(train['Sent'][i])
  actual_label.append(train['Sentiment'][i])
  predicted_label.append(predict_labels[i]+1)
lstm_output['Phrases'] = sent
lstm_output['Actual'] = actual_label
lstm_output['Predicted'] = predicted_label
#lstm_output.to_csv("/content/lstm_sa_test.csv")
