In [1]:
# Imports
from keras.datasets import imdb
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Concatenate, GRU
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense, Conv1D, Flatten,MaxPooling1D

In [2]:
# Paths to the ACSA train/test CSV files (plain path strings, not open file descriptors)
train_file = "acsa_train.csv"
test_file = "acsa_test.csv"

In [3]:
# Load the test split and normalise the review text to lowercase strings
test_data = pd.read_csv(test_file)
test_data['review'] = test_data['review'].astype(str).str.lower()
test_data

Unnamed: 0,review,aspect,sentiment
0,the bread is top notch as well.,food,positive
1,i have to say they have one of the fastest del...,service,positive
2,food is always fresh and hot- ready to eat!,food,positive
3,did i mention that the coffee is outstanding?,food,positive
4,"certainly not the best sushi in new york, howe...",ambience,positive
...,...,...,...
874,"i have never in my life sent back food before,...",food,negative
875,"i have never in my life sent back food before,...",service,negative
876,"although the restaurant itself is nice, i pref...",ambience,positive
877,"although the restaurant itself is nice, i pref...",food,negative


In [4]:
# Load the train split and normalise the review text to lowercase strings
train_data = pd.read_csv(train_file)
train_data['review'] = train_data['review'].astype(str).str.lower()
train_data

Unnamed: 0,review,aspect,sentiment
0,but the waitstaff was so horrible to us.,service,negative
1,"to be completely fair, the only redeeming fact...",food,positive
2,"to be completely fair, the only redeeming fact...",misc,negative
3,"the food is uniformly exceptional, with a very...",food,positive
4,where gabriela personally greets you and recom...,service,positive
...,...,...,...
3013,i'm partial to the gnocchi.,food,positive
3014,"note that they do not serve beer, you must bri...",service,negative
3015,"better than the bagel shop on the corner, but ...",misc,negative
3016,but that is highly forgivable.,misc,positive


In [5]:
# List of stopwords
# Lowercase English function words that remove_stopwords() filters out of each review.
# Negations such as "not" and "no" are not in the list, so they stay in the text.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

In [6]:
#Functions to process data using the stopwords list
def remove_stopwords(data, stopword_list=None):
    """Add a 'review without stopwords' column holding each review minus its stopwords.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column named 'review'. The frame is modified in
        place (the new column is added to it) and the same object is returned.
    stopword_list : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list is used,
        which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the extra column.

    Notes
    -----
    Reviews are split on whitespace only, so a token with attached punctuation
    (for example ``"the,"``) is not recognised as a stopword.
    """
    # A frozenset gives constant-time membership tests instead of scanning the list for every word.
    excluded = frozenset(stopwords if stopword_list is None else stopword_list)
    data['review without stopwords'] = data['review'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in excluded))
    return data

def remove_tags(string):
    """Return `string` with every '<...>' span removed (shortest match, so tags are stripped one at a time)."""
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', string)

In [7]:
# Processing train data: remove stopwords, strip '<...>' tags, then replace punctuation with spaces
train_data_without_stopwords = remove_stopwords(train_data)
train_data_without_stopwords['clean_review'] = train_data_without_stopwords['review without stopwords'].apply(remove_tags)
# regex=True is explicit because the pattern is a character class; relying on the default raises a
# FutureWarning on older pandas and is treated as a literal string on newer pandas.
# re.escape makes every punctuation character (including the backslash) match literally inside the class.
train_data_without_stopwords['clean_review'] = train_data_without_stopwords['clean_review'].str.replace(
    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True)

  train_data_without_stopwords['clean_review'] = train_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [8]:
# Inspect the training frame after cleaning; it now carries 'review without stopwords' and 'clean_review'
train_data_without_stopwords

Unnamed: 0,review,aspect,sentiment,review without stopwords,clean_review
0,but the waitstaff was so horrible to us.,service,negative,waitstaff horrible us.,waitstaff horrible us
1,"to be completely fair, the only redeeming fact...",food,positive,"completely fair, redeeming factor food, averag...",completely fair redeeming factor food averag...
2,"to be completely fair, the only redeeming fact...",misc,negative,"completely fair, redeeming factor food, averag...",completely fair redeeming factor food averag...
3,"the food is uniformly exceptional, with a very...",food,positive,"food uniformly exceptional, capable kitchen wi...",food uniformly exceptional capable kitchen wi...
4,where gabriela personally greets you and recom...,service,positive,gabriela personally greets recommends eat.,gabriela personally greets recommends eat
...,...,...,...,...,...
3013,i'm partial to the gnocchi.,food,positive,partial gnocchi.,partial gnocchi
3014,"note that they do not serve beer, you must bri...",service,negative,"note not serve beer, must bring own.",note not serve beer must bring own
3015,"better than the bagel shop on the corner, but ...",misc,negative,"better bagel shop corner, not worth going way ...",better bagel shop corner not worth going way ...
3016,but that is highly forgivable.,misc,positive,highly forgivable.,highly forgivable


In [9]:
# Processing test data: remove stopwords, strip '<...>' tags, then replace punctuation with spaces
test_data_without_stopwords = remove_stopwords(test_data)
test_data_without_stopwords['clean_review'] = test_data_without_stopwords['review without stopwords'].apply(remove_tags)
# regex=True is explicit because the pattern is a character class; relying on the default raises a
# FutureWarning on older pandas and is treated as a literal string on newer pandas.
# re.escape makes every punctuation character (including the backslash) match literally inside the class.
test_data_without_stopwords['clean_review'] = test_data_without_stopwords['clean_review'].str.replace(
    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True)

  test_data_without_stopwords['clean_review'] = test_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [10]:
# Inspect the test frame after cleaning; it now carries 'review without stopwords' and 'clean_review'
test_data_without_stopwords

Unnamed: 0,review,aspect,sentiment,review without stopwords,clean_review
0,the bread is top notch as well.,food,positive,bread top notch well.,bread top notch well
1,i have to say they have one of the fastest del...,service,positive,say one fastest delivery times city.,say one fastest delivery times city
2,food is always fresh and hot- ready to eat!,food,positive,food always fresh hot- ready eat!,food always fresh hot ready eat
3,did i mention that the coffee is outstanding?,food,positive,mention coffee outstanding?,mention coffee outstanding
4,"certainly not the best sushi in new york, howe...",ambience,positive,"certainly not best sushi new york, however, al...",certainly not best sushi new york however al...
...,...,...,...,...,...
874,"i have never in my life sent back food before,...",food,negative,"never life sent back food before, simply to, w...",never life sent back food before simply to w...
875,"i have never in my life sent back food before,...",service,negative,"never life sent back food before, simply to, w...",never life sent back food before simply to w...
876,"although the restaurant itself is nice, i pref...",ambience,positive,"although restaurant nice, prefer not go food.",although restaurant nice prefer not go food
877,"although the restaurant itself is nice, i pref...",food,negative,"although restaurant nice, prefer not go food.",although restaurant nice prefer not go food


In [11]:
# Converting the training frame into plain lists: review text, sentiment label and aspect.
# Columns are selected by name instead of by position or row label, so the result no longer
# depends on column order or on the frame having a default 0..n-1 index.
# NOTE: the review text used is the stopword-stripped column (punctuation still present), not 'clean_review'.
train_reviews_list = train_data_without_stopwords['review without stopwords'].tolist()
train_sentiment = train_data_without_stopwords['sentiment'].tolist()
train_aspect = train_data_without_stopwords['aspect'].tolist()

In [12]:
# Converting the test frame into plain lists: review text, sentiment label and aspect.
# Columns are selected by name instead of by position or row label, so the result no longer
# depends on column order or on the frame having a default 0..n-1 index.
# NOTE: the review text used is the stopword-stripped column (punctuation still present), not 'clean_review'.
test_reviews_list = test_data_without_stopwords['review without stopwords'].tolist()
test_sentiment = test_data_without_stopwords['sentiment'].tolist()
test_aspect = test_data_without_stopwords['aspect'].tolist()

In [13]:
# Defining train x and y values: the label is 1 for "positive" and 0 for every other sentiment value
Y_train = np.array([1 if label == "positive" else 0 for label in train_sentiment])
X_aspect_train = train_aspect
X_train = train_reviews_list

In [14]:
# Quick look at the encoded training labels
Y_train

array([0, 1, 0, ..., 0, 1, 1])

In [15]:
# Defining test x and y values: the label is 1 for "positive" and 0 for every other sentiment value
Y_test = np.array([1 if label == "positive" else 0 for label in test_sentiment])
X_aspect_test = test_aspect
X_test = test_reviews_list

In [16]:
# Quick look at the encoded test labels
Y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,

In [17]:
# Tokenizing data
# Two independent tokenizers, both fitted on the training split only:
# tokenizer1 learns the review vocabulary, tokenizer2 the aspect vocabulary.
tokenizer1 = Tokenizer(num_words=5000)
tokenizer1.fit_on_texts(X_train)
# word -> integer index mapping; used below to size and fill the embedding matrices
words_to_index = tokenizer1.word_index
tokenizer2 = Tokenizer(num_words=5000)
tokenizer2.fit_on_texts(X_aspect_train)
aspect_to_index = tokenizer2.word_index

In [18]:
# Function to read the GloVe vectors for embedding
def read_glove_vector(glove_vec):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Parameters
    ----------
    glove_vec : str
        Path to a UTF-8 text file with one entry per line: the word followed by
        its whitespace-separated vector components.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.float64`` array of its components.
        If a word occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = line.split()
            # Blank lines (for example a trailing newline at end of file) carry no entry; skip them
            # instead of failing on fields[0].
            if not fields:
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)
    return word_to_vec_map

In [19]:
# Shared size constant: the cells below use it both as the padded sequence length
# and as the embedding width, so it has to match the 300-d GloVe file loaded here.
maxLen = 300

# Loading the GloVe vectors
word_to_vec_map = read_glove_vector('./glove.6B.300d.txt')

In [20]:
# Embedding
# Review vocabulary: one matrix row per tokenizer index, filled from GloVe where the word is known.
# Row 0 and rows of unknown words stay all-zero.
embed_vector_len = maxLen
vocab_len = len(words_to_index) + 1
embed_matrix = np.zeros((vocab_len, embed_vector_len))

count = len(words_to_index)
hits = 0
for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is None:
        continue
    hits += 1
    embed_matrix[index, :] = embedding_vector
print("hits : ", hits," misses : ",count-hits)

# Frozen embedding layer initialised with the GloVe rows built above
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights=[embed_matrix],
    trainable=False,
)

# Aspect vocabulary: same procedure, on a fresh matrix so the review matrix is left untouched
vocab_asp_len = len(aspect_to_index) + 1
embed_vector_len = maxLen
embed_matrix = np.zeros((vocab_asp_len, embed_vector_len))

for word, index in aspect_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is None:
        continue
    embed_matrix[index, :] = embedding_vector

aspect_embedding_layer = Embedding(
    input_dim=vocab_asp_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights=[embed_matrix],
    trainable=False,
)

hits :  3611  misses :  158


In [21]:
# Show the review embedding layer object (repr only)
embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fc02c585df0>

In [22]:
# Show the aspect embedding layer object (repr only)
aspect_embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fbfdf9f4b20>

In [23]:
# Turn the training reviews and aspects into integer sequences, zero-padded at the end to maxLen
X_train_indices = pad_sequences(tokenizer1.texts_to_sequences(X_train), maxlen=maxLen, padding='post')

X_aspect_indices = pad_sequences(tokenizer2.texts_to_sequences(X_aspect_train), maxlen=maxLen, padding='post')

In [24]:
def CNN_LSTM_model(emb):
    """Build and compile a Conv1D -> MaxPooling1D -> LSTM binary classifier on top of `emb`.

    Parameters
    ----------
    emb : Keras layer
        Used as the first layer of the Sequential model (presumably one of the
        frozen GloVe Embedding layers defined in this notebook — confirm at the call site).

    Returns
    -------
    The Sequential model, compiled with binary cross-entropy, the Adam optimizer
    and an accuracy metric. The model summary is printed as a side effect.
    """
    model = Sequential()
    model.add(emb)
    model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(256))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None"
    model.summary()
    return model

In [25]:
# Model definition for a LSTM based model
def LSTM_model(input_shape):
    """Build (without compiling) a three-layer stacked LSTM binary classifier.

    The notebook-level ``embedding_layer`` is applied to the input, followed by two
    sequence-returning LSTM(128) layers, each followed by Dropout(0.6), a final
    LSTM(128) and a single sigmoid unit.

    Parameters
    ----------
    input_shape : int or tuple
        Shape of one input sample, excluding the batch dimension. A bare integer
        ``n`` is accepted and treated as ``(n,)``.

    Returns
    -------
    The uncompiled Keras Model; the caller is expected to compile it.
    The model summary is printed as a side effect.
    """
    # Normalise a bare int to a 1-tuple so the shape is always passed in the documented tuple form.
    shape = (input_shape,) if isinstance(input_shape, int) else input_shape
    X_indices = Input(shape=shape)
    embeddings = embedding_layer(X_indices)
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.6)(X)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.6)(X)
    X = LSTM(128)(X)
    X = Dense(1, activation='sigmoid')(X)
    model = Model(inputs=X_indices, outputs=X)
    # summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None"
    model.summary()
    return model

In [26]:
# Model definition for a simple CNN
def simple_cnn(emb):
    """Build (without compiling) a single-branch CNN binary classifier on top of `emb`.

    Layers: `emb` -> Conv1D(32 filters, kernel 8, relu) -> MaxPooling1D(2) -> Flatten
    -> Dense(10, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    emb : Keras layer
        Used as the first layer of the Sequential model.

    Returns
    -------
    The uncompiled Sequential model; the caller is expected to compile it.
    The model summary is printed as a side effect.
    """
    model = Sequential([
        emb,
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    # summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None"
    model.summary()
    return model

In [27]:
def cnn(words,aspects):
    """Build (without compiling) the two-input CNN over review tokens and aspect tokens.

    Each input goes through its own frozen GloVe-initialised Embedding,
    Conv1D(32, 8, relu) and MaxPooling1D(4). The two feature maps are concatenated
    and passed through Dense(64, relu), Dense(32, tanh), MaxPooling1D(2), Flatten
    and a single sigmoid unit.

    Parameters
    ----------
    words : dict
        Review word -> integer index mapping used to build the review embedding matrix.
    aspects : dict
        Aspect word -> integer index mapping used to build the aspect embedding matrix.

    Returns
    -------
    The uncompiled Keras Model taking ``[review_indices, aspect_indices]``, each of
    length ``maxLen``. The model summary is printed as a side effect.

    Notes
    -----
    Reads the notebook-level ``word_to_vec_map`` and ``maxLen``. The mappings are now
    taken from the arguments; previously the arguments were ignored in favour of the
    notebook-level ``words_to_index`` / ``aspect_to_index``.
    """
    # The notebook uses one constant for both the padded sequence length and the GloVe vector width.
    embed_vector_len = maxLen

    def _embedded_conv_branch(word_index):
        """Return (input tensor, pooled convolution features) for one vocabulary."""
        vocab_size = len(word_index) + 1
        embed_matrix = np.zeros((vocab_size, embed_vector_len))
        for word, index in word_index.items():
            embedding_vector = word_to_vec_map.get(word)
            # Words without a GloVe vector keep an all-zero row
            if embedding_vector is not None:
                embed_matrix[index, :] = embedding_vector
        branch_input = Input(shape=(maxLen,))
        embedded = Embedding(input_dim=vocab_size,
                             output_dim=embed_vector_len,
                             input_length=maxLen, weights = [embed_matrix],
                             trainable=False)(branch_input)
        features = Conv1D(32,8,activation='relu')(embedded)
        features = MaxPooling1D(pool_size=4)(features)
        return branch_input, features

    input1, x1 = _embedded_conv_branch(words)
    input2, x2 = _embedded_conv_branch(aspects)

    concat = Concatenate()([x1,x2])
    concat = Dense(64,activation='relu')(concat)
    concat = Dense(32,activation='tanh')(concat)
    #concat = GRU(16,activation='tanh',recurrent_activation='relu')(concat)
    concat = MaxPooling1D(pool_size=2)(concat)
    concat = Flatten()(concat)
    concat = Dense(1,activation='sigmoid')(concat)
    model = Model(inputs=[input1,input2],outputs=[concat])
    # summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None"
    model.summary()
    return model

In [28]:
# Build the two-input (review + aspect) CNN; it is compiled in the next cell
model = cnn(words_to_index,aspect_to_index)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 300)     1131000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 300, 300)     1800        input_2[0][0]                    
______________________________________________________________________________________________

In [29]:
# Train the two-input CNN with Adagrad on binary cross-entropy, tracking accuracy
adagrad = keras.optimizers.Adagrad(learning_rate = 0.01)

model.compile(optimizer=adagrad, loss='binary_crossentropy', metrics=['accuracy'])

# Inputs are given in the same order as the model's inputs: review indices first, aspect indices second
model.fit(x=[X_train_indices,X_aspect_indices], y=Y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbfdf8eb610>

In [52]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall on the given tensors: rounded true positives over rounded actual positives.

    K.epsilon() in the denominator guards against division by zero.
    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, which differs from recall over the whole dataset — confirm.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision on the given tensors: rounded true positives over rounded predicted positives.

    K.epsilon() in the denominator guards against division by zero.
    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, which differs from precision over the whole dataset — confirm.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m, with K.epsilon() guarding the denominator."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2*(harmonic_half)
# Two-input CNN again, this time tracking F1 / precision / recall alongside accuracy
model1 = cnn(words_to_index,aspect_to_index)
# compile the model with its own optimizer instance rather than one already bound to another model
model1.compile(optimizer=keras.optimizers.Adagrad(learning_rate = 0.01), loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
model1.fit(x=[X_train_indices,X_aspect_indices], y=Y_train, batch_size=32, epochs=10)

# Build the padded test inputs here so this cell runs top-to-bottom on a fresh kernel
# instead of relying on a cell further down having been executed first.
X_test_indices = pad_sequences(tokenizer1.texts_to_sequences(X_test), maxlen=maxLen, padding='post')
X_test_aspect_indices = pad_sequences(tokenizer2.texts_to_sequences(X_aspect_test), maxlen=maxLen, padding='post')

# evaluate the model
loss, accuracy, f1_score, precision, recall = model1.evaluate([X_test_indices,X_test_aspect_indices], Y_test, verbose=0)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 300, 300)     1131000     input_6[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 300, 300)     1800        input_7[0][0]                    
____________________________________________________________________________________________

In [53]:
# Report every value unpacked from the evaluation above, including recall
print("loss : " ,loss)
print("accuracy : ",accuracy)
print("f1_score : ",f1_score)
print("precision : ",precision)
print("recall : ",recall)

loss :  0.3269771933555603
accuracy :  0.8623435497283936
f1_score :  0.9085442423820496
precision :  0.873866856098175


In [30]:
# Formatting test data: tokenise with the tokenizers fitted on the training split, then zero-pad at the end to maxLen
X_test_indices = pad_sequences(tokenizer1.texts_to_sequences(X_test), maxlen=maxLen, padding='post')

X_test_aspect_indices = pad_sequences(tokenizer2.texts_to_sequences(X_aspect_test), maxlen=maxLen, padding='post')

In [31]:
# Evaluate the two-input CNN on the test split; the result is [loss, accuracy]
model.evaluate([X_test_indices,X_test_aspect_indices], Y_test)



[0.3683435320854187, 0.8407281041145325]

In [32]:
# Same evaluation on the training split, for comparison with the test figures
model.evaluate([X_train_indices,X_aspect_indices], Y_train)



[0.29890453815460205, 0.890656054019928]

In [41]:
''' Uncommenting out one line would run the function and here, the model is based on LSTM and CNN '''
# Exactly one of the three builders below should be active; as written it is the stacked-LSTM model.
# NOTE: this rebinds `model`, so the two-input CNN built earlier is no longer reachable under that name.
# model = simple_cnn(embedding_layer)
model = LSTM_model(300)
# model = CNN_LSTM_model(embedding_layer)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 300)          1131000   
_________________________________________________________________
lstm (LSTM)                  (None, 300, 128)          219648    
_________________________________________________________________
dropout (Dropout)            (None, 300, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               1315

In [42]:
# Running the model
# The triple-quoted block below is a bare string expression holding an earlier optimizer
# variant; it has no effect when the cell runs.
'''
adam = keras.optimizers.Adam(learning_rate = 0.0001)

'''
# Adam with a small learning rate plus learning-rate decay.
# NOTE(review): epsilon=None presumably falls back to the Keras default, and `decay`
# may not be accepted by newer optimizer implementations — confirm for the installed version.
adam = keras.optimizers.Adam(learning_rate = 0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# Only the review indices are fed here; this model has a single input (no aspect branch).
# model.fit(x = X_train,y = Y_train, batch_size=64, epochs=5)
model.fit(X_train_indices, Y_train, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fbfdf8eba60>

In [43]:
# Formatting test data: tokenise the test reviews with the training tokenizer, then zero-pad at the end to maxLen
X_test_indices = pad_sequences(tokenizer1.texts_to_sequences(X_test), maxlen=maxLen, padding='post')

X_test_indices

array([[ 264,  192,  902, ...,    0,    0,    0],
       [ 122,   11, 1864, ...,    0,    0,    0],
       [   1,   27,   49, ...,    0,    0,    0],
       ...,
       [ 360,    7,   25, ...,    0,    0,    0],
       [ 360,    7,   25, ...,    0,    0,    0],
       [2452,  380,  830, ...,    0,    0,    0]], dtype=int32)

In [44]:
# Model evaluation for test data (single-input model); the result is [loss, accuracy]
model.evaluate(X_test_indices, Y_test)



[0.5665537118911743, 0.7474402785301208]

In [45]:
# Model evaluation for train data (single-input model); the result is [loss, accuracy]
model.evaluate(X_train_indices, Y_train)



[0.5910611152648926, 0.722001314163208]

In [50]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall on the given tensors: rounded true positives over rounded actual positives.

    K.epsilon() in the denominator guards against division by zero.
    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, which differs from recall over the whole dataset — confirm.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision on the given tensors: rounded true positives over rounded predicted positives.

    K.epsilon() in the denominator guards against division by zero.
    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, which differs from precision over the whole dataset — confirm.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m, with K.epsilon() guarding the denominator."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2*(harmonic_half)
# Single-input CNN on the review embedding, tracking F1 / precision / recall alongside accuracy
model2 = simple_cnn(embedding_layer)
# model2 = LSTM_model(300)
# model2 = CNN_LSTM_model(embedding_layer)
# compile the model with its own optimizer instance rather than one already bound to another model
model2.compile(optimizer=keras.optimizers.Adagrad(learning_rate = 0.01), loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
model2.fit(x=X_train_indices, y=Y_train, batch_size=32, epochs=10)

# evaluate the model
loss, accuracy, f1_score, precision, recall = model2.evaluate(X_test_indices, Y_test, verbose=0)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          1131000   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 293, 32)           76832     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 146, 32)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 4672)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                46730     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 1,254,573
Trainable params: 123,573
Non-trainable params: 1,131,000
______________________________________

In [51]:
# Report every value unpacked from the evaluation above, including recall
print("loss : " ,loss)
print("accuracy : ",accuracy)
print("f1_score : ",f1_score)
print("precision : ",precision)
print("recall : ",recall)

loss :  0.3304230272769928
accuracy :  0.8577929735183716
f1_score :  0.9024096727371216
precision :  0.8718743324279785


In [59]:
%matplotlib inline

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('stopwords')
# Imported under an alias so it does not rebind the notebook's own `stopwords` list,
# which remove_stopwords() reads; without the alias, re-running the cleaning cells
# after this one would no longer filter against that list.
from nltk.corpus import stopwords as nltk_stopwords
stop_words = set(nltk_stopwords.words("english"))
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

[nltk_data] Downloading package stopwords to /home/aswin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TF-IDF feature vectors: the vectorizer is fitted on the training reviews only,
# then the fitted vectorizer is applied unchanged to the test reviews
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [91]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with an SVM using the sigmoid kernel on the TF-IDF features.
# The *_linear names do not reflect the kernel; they are kept because a later cell reads prediction_linear.
classifier_linear = svm.SVC(kernel='sigmoid')
# perf_counter is a monotonic clock, so the measured intervals are not affected by system clock adjustments
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, Y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Per-class precision / recall / F1 as a nested dict
report = classification_report(Y_test, prediction_linear, output_dict=True)

Training time: 0.271527s; Prediction time: 0.059065s


In [92]:
# print('positive: ', report['1'])
# print('negative: ', report['0'])

In [93]:
# Overall test accuracy of the SVM predictions made on the TF-IDF features
accuracy_score(Y_test,prediction_linear)

0.8361774744027304

In [94]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name
# NOTE(review): presumably downloaded on first use — confirm network access and caching
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [95]:
# Encode the training review strings into sentence embeddings (features for the SVM below)
sentence_embeddings = sbert_model.encode(X_train)

In [96]:
# Encode the test review strings with the same model
query_vec = sbert_model.encode(X_test)

In [103]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with an SVM using the RBF kernel on the sentence embeddings.
# The *_linear names do not reflect the kernel; they are kept because a later cell reads prediction_linear.
classifier_linear = svm.SVC(kernel='rbf')
# perf_counter is a monotonic clock, so the measured intervals are not affected by system clock adjustments
t0 = time.perf_counter()
classifier_linear.fit(sentence_embeddings, Y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(query_vec)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Per-class precision / recall / F1 as a nested dict
report = classification_report(Y_test, prediction_linear, output_dict=True)

Training time: 1.238427s; Prediction time: 0.398030s


In [104]:
# Overall test accuracy of the SVM predictions made on the sentence embeddings
accuracy_score(Y_test,prediction_linear)

0.906712172923777