# Sentiment Analysis

In [9]:
import keras
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [94]:
# Load the labelled texts from disk.
data = pd.read_csv('./data/Sentiment.csv')
# Keep only the text/label columns and drop the "Neutral" rows so the task is
# binary. The explicit .copy() makes `data` an independent frame: later cells
# assign to data['text'], which on a filtered slice triggers pandas'
# SettingWithCopyWarning.
data = data.loc[data['sentiment'] != "Neutral", ['text', 'sentiment']].copy()
# Report the class balance.
for label in ('Positive', 'Negative'):
    print('Number of {} reviews: {}'.format(label.lower(), (data['sentiment'] == label).sum()))

Number of positive reviews: 2236
Number of negative reviews: 8493


### Data cleaning

In [96]:
## arguments
padding_type = 'pre'
truncating_type = 'post'
padding_value = 0
max_features = 2000

# Everything that is not a letter, a digit or whitespace is removed.
# (The class must be A-Z: the range A-z also spans the ASCII characters
# [ \ ] ^ _ ` and would let them through. Raw string so \s is not treated
# as a string escape.)
non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
# Stand-alone "rt" token only; a plain substring replace would also cut the
# letters "rt" out of ordinary words such as "party".
rt_token = re.compile(r'\brt\b')

# Lower-case, strip punctuation, then blank out the "rt" token.
# The result is assigned back to the column: mutating the rows yielded by
# DataFrame.iterrows() only changes per-row copies and never reaches `data`.
data['text'] = data['text'].apply(
    lambda raw_text: rt_token.sub(' ', non_alphanumeric.sub('', raw_text.lower()))
)

# Fit the vocabulary on the cleaned texts; num_words is set to max_features.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

# No maxlen is passed here, so the padded width is decided by pad_sequences
# (per the Keras docs: the length of the longest sequence).
X = pad_sequences(X, padding=padding_type, truncating=truncating_type, value=padding_value)
# Remember the padded width so inference can pad new texts to the same length.
token_maxlen = X.shape[1]

In [54]:
# Layer sizes.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> LSTM -> 2-unit softmax.
# The embedding's input dimension equals max_features (the tokenizer's
# num_words) and its input length is the padded sequence width of X.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [60]:
# One-hot encode the sentiment labels (one column per class).
# NOTE(review): predict_sentiment treats column index 1 as 'Positive' --
# presumably get_dummies orders the columns alphabetically; confirm.
Y = pd.get_dummies(data['sentiment']).values

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# Show the shapes of both partitions (features, then labels).
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(8583, 28) (8583, 2)
(2146, 28) (2146, 2)


In [65]:
# Mini-batch size used for training.
batch_size = 32
# Train for 7 epochs. The held-out test split is also passed as the
# validation set, so the per-epoch validation metrics are computed on it.
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=7,
    validation_data=(X_test, Y_test),
    verbose=1,
)

Train on 8583 samples, validate on 2146 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7ff3792e40b8>

In [131]:
## save model
import os
import pickle

# Make sure the output directory exists before writing into it.
os.makedirs('./cfg', exist_ok=True)

# Persist the trained network at the exact path the inference cell loads
# from ('./cfg/model_sentiment_v1.h5'); without this a fresh run has no model
# file to load.
model.save('./cfg/model_sentiment_v1.h5')

# Everything inference needs to preprocess text the same way as training:
# the fitted tokenizer plus the padding settings and padded width.
config = {'tokenizer': {'tokenizer': tokenizer, 'token_maxlen': token_maxlen, 'padding_type': padding_type,
                       'padding_value': padding_value, 'truncating_type': truncating_type}}
# Context manager so the file is flushed and closed even if pickling fails.
with open('./cfg/config.pkl', 'wb') as config_file:
    pickle.dump(config, config_file)

In [18]:
import pickle
import numpy as np
import keras
from keras.preprocessing.sequence import pad_sequences

# Restore the trained network from disk.
model = keras.models.load_model('./cfg/model_sentiment_v1.h5')
# Restore the preprocessing configuration (fitted tokenizer + padding settings).
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
# Context manager so the file handle is closed right after reading.
with open('./cfg/config.pkl', 'rb') as config_file:
    config = pickle.load(config_file)

In [19]:
def predict_sentiment(text, model, config):
    """Predict the sentiment label and score for a piece of text.

    Parameters
    ----------
    text : str or list of str
        A single string, or a list of strings. A single string is wrapped
        in a one-element list. Only the prediction for the first text is
        returned.
    model : object
        Object exposing ``predict(x, batch_size=..., verbose=...)`` that
        returns one row of class scores per input.
    config : dict
        ``config['tokenizer']`` must hold the keys ``'tokenizer'`` (object
        with ``texts_to_sequences``), ``'token_maxlen'``, ``'padding_type'``,
        ``'truncating_type'`` and ``'padding_value'``.

    Returns
    -------
    tuple
        ``(label, score)`` where ``label`` is ``'Positive'`` when the
        highest-scoring column index is 1 and ``'Negative'`` otherwise, and
        ``score`` is the value of that highest-scoring column.
    """
    # np.str was only an alias of the built-in str and has been removed from
    # NumPy; isinstance also accepts str subclasses.
    texts = [text] if isinstance(text, str) else text

    tokenizer_cfg = config['tokenizer']
    sequences = tokenizer_cfg['tokenizer'].texts_to_sequences(texts)
    # Pad/truncate with the stored settings and width.
    padded = pad_sequences(sequences, maxlen     = tokenizer_cfg['token_maxlen'],
                                      padding    = tokenizer_cfg['padding_type'],
                                      truncating = tokenizer_cfg['truncating_type'],
                                      value      = tokenizer_cfg['padding_value'])

    # Scores for the first text only.
    sentiment = model.predict(padded, batch_size=1, verbose=0)[0]
    argmax_sent = np.argmax(sentiment)
    sentiment_text = 'Positive' if argmax_sent == 1 else 'Negative'
    sentiment_score = sentiment[argmax_sent]
    return (sentiment_text, sentiment_score)

In [22]:
# Quick check of the inference helper on a single example sentence.
text = [
    "I can't love someone more than you",
]
predict_sentiment(text=text, model=model, config=config)

('Positive', 0.82680482)