In [1]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [2]:
#data = pd.read_table('yelp_labelled.txt', names=('text', 'sentiment'))
#data = data.applymap(str)
data = pd.read_csv('Tweets.csv') #read data
list(data.columns) #list of features
print(data)
data = data[['text','airline_sentiment']] #keep only neccessary features

                 tweet_id airline_sentiment  airline_sentiment_confidence  \
0      570306133677760513           neutral                        1.0000   
1      570301130888122368          positive                        0.3486   
2      570301083672813571           neutral                        0.6837   
3      570301031407624196          negative                        1.0000   
4      570300817074462722          negative                        1.0000   
5      570300767074181121          negative                        1.0000   
6      570300616901320704          positive                        0.6745   
7      570300248553349120           neutral                        0.6340   
8      570299953286942721          positive                        0.6559   
9      570295459631263746          positive                        1.0000   
10     570294189143031808           neutral                        0.6769   
11     570289724453216256          positive                        1.0000   

In [3]:
data = data[data.airline_sentiment != "neutral"] #remove samples with label neutral as we want only positive or negative
data['text'] = data['text'].apply(lambda x: x.lower()) #convert text to lowecase
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) #remove punctuations, special symbols and emojis

#data["sentiment"]= data["sentiment"].replace("1", "positive") 
#data["sentiment"]= data["sentiment"].replace("0", "negative") 

print(data[ data['airline_sentiment'] == 'positive'].size) #counts number of positive example
print(data[ data['airline_sentiment'] == 'negative'].size)
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_features = 5000 #maximum number of entries in dictionary built by tokenizer
tokenizer = Tokenizer(num_words=max_features, split=' ') #split data
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values) #convert sentences to vector form
X = pad_sequences(X) #make all vectors of equal length

4726
18356


In [4]:
print(X.shape)

(11541, 33)


In [5]:
#build model
embed_dim = 128
lstm_out = 196

model = Sequential() #create model
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4)) #drops 40% nodes
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #LSTM layer
model.add(Dense(2,activation='softmax'))  # for classification
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #compile model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 128)           640000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 33, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 895,194
Trainable params: 895,194
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
Y = pd.get_dummies(data['airline_sentiment']).values #connvert categorical labels to vector form
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42) #split train and test data
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7732, 33) (7732, 2)
(3809, 33) (3809, 2)


In [7]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1) #fit model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb775cb40b8>

In [8]:
#Evaluate model
validation_size = 500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.4f" % (score))
print("Accuracy: %.4f" % (acc))

score: 0.4628
Accuracy: 0.9178


In [9]:
#compute specificity, sencitivity, precision, recall and F1-score
pos_cnt, neg_cnt, pos_correct, neg_correct, pos_incorrect, neg_incorrect = 0, 0, 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]): #checks whether example is correctly classified
        if np.argmax(Y_validate[x]) == 0: #checks if examples was classified as negative
            neg_correct += 1
        else:
            pos_correct += 1
            
    else:
        if np.argmax(Y_validate[x]) == 0:
            neg_incorrect += 1
        else:
            neg_incorrect += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1

sensitivity = pos_correct/pos_cnt
specificity = neg_correct/neg_cnt
precision = pos_correct/(pos_correct + pos_incorrect)
recall = sensitivity
f1 = (2 * precision * recall) / (precision + recall)

print("true positive rate: %.4f" %(sensitivity) )
print("true negative rate: %.4f" %(specificity) )
print("precision: %.4f" %(precision))
print("recall: %.4f" %(recall))
print("f1-score: %.4f" %(f1))

true positive rate: 0.7627
true negative rate: 0.9712
precision: 1.0000
recall: 0.7627
f1-score: 0.8654


In [10]:
#predict sentiment of random example
twt = ['Meetings: Because none of us is as dumb as all of us.']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=33, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0 1816  110  937   17   56   14   87
  1860   87   58   17   56]]
negative
