In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [2]:
data = pd.read_csv('data/trainPosNeg.csv')

# Keeping only the neccessary columns
data = data[['text','sentiment']]

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

84192
68424


In [3]:
try:
    data['text'] = data['text'].apply(lambda x: x.lower())
    data['text'] = data['text'].apply((lambda x: re.sub("[^a-zA-z0-9'\s]",'',x)))
except AttributeError:
    print('error')

In [4]:
for idx,row in data.iterrows():
    try:
        row[0] = row[0].replace('rt',' ')
    except:
        print('error '+ str(row[0]))

In [5]:
X1 = data['text'].values

max_features = 20000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X1)
X = tokenizer.texts_to_sequences(X1)
X = pad_sequences(X)
Y = []
for y in data['sentiment'].values:
    if y == 'Positive':
        Y.append(1)
    else:
        Y.append(0)

In [6]:
embed_dim = 128
lstm_out = 196
print('input length = '+str(X.shape[1]))
intputLen = X.shape[1]

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

input length = 49
Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,len(Y_train))
print(X_test.shape,len(Y_test))

batch_size = 32
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = [Y_test[-validation_size:]]
X_test = X_test[:-validation_size]
Y_test = [Y_test[:-validation_size]]

model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1,validation_data=(X_validate, Y_validate))


score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

(51126, 49) 51126
(25182, 49) 25182
Train on 51126 samples, validate on 1500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
score: 0.50
acc: 0.91


In [8]:
def analyseSentiment(data):
    twt = [data]
    #vectorizing the tweet by the pre-fitted tokenizer instance
    twt = tokenizer.texts_to_sequences(twt)
    #padding the tweet to have exactly the same shape as `embedding_2` input
    twt = pad_sequences(twt, maxlen=intputLen, dtype='int32', value=0)
    sentiment = model.predict(twt,batch_size=1,verbose = 2)
    if(sentiment[0] < 0.5):
        return "Negative with "+ "{0:.2f}".format((1 - sentiment[0][0]) * 100)+ "% Confidence."
    else:
        return "Positive with "+"{0:.2f}".format(sentiment[0][0] * 100)+"% Confidence."

In [14]:
print(analyseSentiment("You are great."))

Positive with 100.00% Confidence.


In [12]:
model.save('sentiment_model.h5')

In [13]:
from keras.models import load_model
model = load_model('sentiment_model.h5')