In [1]:
# Imports
from keras.datasets import imdb
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Concatenate, GRU
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense, Conv1D, Flatten,MaxPooling1D

In [2]:
# File descriptors
test_file = "atsa_test.csv"
train_file = "atsa_train.csv"

In [3]:
# Pre processing test data
test_data = pd.read_csv(test_file)
test_data['review'] = test_data['review'].astype(str)
test_data['review'] = test_data['review'].str.lower()
test_data

Unnamed: 0,review,aspect,sentiment
0,the bread is top notch as well.,bread,positive
1,i have to say they have one of the fastest del...,delivery times,positive
2,food is always fresh and hot- ready to eat!,food,positive
3,did i mention that the coffee is outstanding?,coffee,positive
4,"certainly not the best sushi in new york, howe...",sushi,conflict
...,...,...,...
1129,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers,positive
1130,"creamy appetizers--taramasalata, eggplant sala...",warm pitas,neutral
1131,"creamy appetizers--taramasalata, eggplant sala...",taramasalata,positive
1132,"creamy appetizers--taramasalata, eggplant sala...",eggplant salad,positive


In [4]:
# Pre processing train data
train_data = pd.read_csv(train_file)
train_data['review'] = train_data['review'].astype(str)
train_data['review'] = train_data['review'].str.lower()
train_data

Unnamed: 0,review,aspect,sentiment
0,but the waitstaff was so horrible to us.,waitstaff,negative
1,"to be completely fair, the only redeeming fact...",food,positive
2,"the food is uniformly exceptional, with a very...",food,positive
3,"the food is uniformly exceptional, with a very...",kitchen,positive
4,"the food is uniformly exceptional, with a very...",menu,neutral
...,...,...,...
3688,each table has a pot of boiling water sunken i...,pot of boiling water,neutral
3689,each table has a pot of boiling water sunken i...,meats,neutral
3690,each table has a pot of boiling water sunken i...,vegetables,neutral
3691,each table has a pot of boiling water sunken i...,rice,neutral


In [5]:
# List of stopwords
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

In [6]:
#Functions to process data using the stopwords list
def remove_stopwords(data):
    data['review without stopwords'] = data['review'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
    return data

def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result

In [7]:
# Processing train data by removing stop words from reviews
train_data_without_stopwords = remove_stopwords(train_data)
train_data_without_stopwords['clean_review'] = train_data_without_stopwords['review without stopwords'].apply(lambda cw : remove_tags(cw))
train_data_without_stopwords['clean_review'] = train_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')

  train_data_without_stopwords['clean_review'] = train_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [8]:
train_data_without_stopwords

Unnamed: 0,review,aspect,sentiment,review without stopwords,clean_review
0,but the waitstaff was so horrible to us.,waitstaff,negative,waitstaff horrible us.,waitstaff horrible us
1,"to be completely fair, the only redeeming fact...",food,positive,"completely fair, redeeming factor food, averag...",completely fair redeeming factor food averag...
2,"the food is uniformly exceptional, with a very...",food,positive,"food uniformly exceptional, capable kitchen wi...",food uniformly exceptional capable kitchen wi...
3,"the food is uniformly exceptional, with a very...",kitchen,positive,"food uniformly exceptional, capable kitchen wi...",food uniformly exceptional capable kitchen wi...
4,"the food is uniformly exceptional, with a very...",menu,neutral,"food uniformly exceptional, capable kitchen wi...",food uniformly exceptional capable kitchen wi...
...,...,...,...,...,...
3688,each table has a pot of boiling water sunken i...,pot of boiling water,neutral,"table pot boiling water sunken surface, get pl...",table pot boiling water sunken surface get pl...
3689,each table has a pot of boiling water sunken i...,meats,neutral,"table pot boiling water sunken surface, get pl...",table pot boiling water sunken surface get pl...
3690,each table has a pot of boiling water sunken i...,vegetables,neutral,"table pot boiling water sunken surface, get pl...",table pot boiling water sunken surface get pl...
3691,each table has a pot of boiling water sunken i...,rice,neutral,"table pot boiling water sunken surface, get pl...",table pot boiling water sunken surface get pl...


In [9]:
# Processing test data by removing stop words from reviews
test_data_without_stopwords = remove_stopwords(test_data)
test_data_without_stopwords['clean_review']= test_data_without_stopwords['review without stopwords'].apply(lambda cw : remove_tags(cw))
test_data_without_stopwords['clean_review'] = test_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')

  test_data_without_stopwords['clean_review'] = test_data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [10]:
test_data_without_stopwords

Unnamed: 0,review,aspect,sentiment,review without stopwords,clean_review
0,the bread is top notch as well.,bread,positive,bread top notch well.,bread top notch well
1,i have to say they have one of the fastest del...,delivery times,positive,say one fastest delivery times city.,say one fastest delivery times city
2,food is always fresh and hot- ready to eat!,food,positive,food always fresh hot- ready eat!,food always fresh hot ready eat
3,did i mention that the coffee is outstanding?,coffee,positive,mention coffee outstanding?,mention coffee outstanding
4,"certainly not the best sushi in new york, howe...",sushi,conflict,"certainly not best sushi new york, however, al...",certainly not best sushi new york however al...
...,...,...,...,...,...
1129,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers,positive,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers taramasalata eggplant sala...
1130,"creamy appetizers--taramasalata, eggplant sala...",warm pitas,neutral,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers taramasalata eggplant sala...
1131,"creamy appetizers--taramasalata, eggplant sala...",taramasalata,positive,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers taramasalata eggplant sala...
1132,"creamy appetizers--taramasalata, eggplant sala...",eggplant salad,positive,"creamy appetizers--taramasalata, eggplant sala...",creamy appetizers taramasalata eggplant sala...


In [11]:
# Converting data into a list, seperating reviews from sentiment polarity
train_reviews_list = []
train_sentiment = []
train_aspect = []
for i in range(len(train_data_without_stopwords)):
    train_reviews_list.append(train_data_without_stopwords.iloc[i,3])
    train_sentiment.append(train_data_without_stopwords.loc[i,'sentiment'])
    train_aspect.append(train_data_without_stopwords.loc[i,'aspect'])

In [12]:
# Converting data into a list, seperating reviews from sentiment polarity
test_reviews_list = []
test_sentiment = []
test_aspect = []
for i in range(len(test_data_without_stopwords)):
    test_reviews_list.append(test_data_without_stopwords.iloc[i,3])
    test_sentiment.append(test_data_without_stopwords.loc[i,'sentiment'])
    test_aspect.append(test_data_without_stopwords.loc[i,'aspect'])

In [None]:
#Defining train x and y values
Y_train = np.array(list(map(lambda x: 1 if x=="positive" else 0, train_sentiment)))
X_train = train_reviews_list
X_aspect_train = train_aspect

In [14]:
Y_train

array([0, 1, 1, ..., 0, 0, 0])

In [15]:
#Defining train x and y values
Y_test = np.array(list(map(lambda x: 1 if x=="positive" else 0, test_sentiment)))
X_test = test_reviews_list
X_aspect_test = test_aspect

In [16]:
Y_test

array([1, 1, 1, ..., 1, 1, 1])

In [17]:
# Tokenizing data
tokenizer1 = Tokenizer(num_words=5000)
tokenizer1.fit_on_texts(X_train)
words_to_index = tokenizer1.word_index
tokenizer2 = Tokenizer(num_words=5000)
tokenizer2.fit_on_texts(X_aspect_train)
aspect_to_index = tokenizer2.word_index

In [18]:
# Function to read the GloVe vectors for embedding
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            words_in_the_line = line.split()
            current_word = words_in_the_line[0]
            word_to_vec_map[current_word] = np.array(words_in_the_line[1:], dtype=np.float64)
    return word_to_vec_map

In [19]:
# Loading the GloVe vectors
word_to_vec_map = read_glove_vector('glove.6B.300d.txt')

maxLen = 300

In [20]:
# Embedding
vocab_len = len(words_to_index)+1
embed_vector_len = maxLen

embed_matrix = np.zeros((vocab_len, embed_vector_len))
hits = 0
count = 0
for word, index in words_to_index.items():
    count+=1
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        hits+=1
        embed_matrix[index, :] = embedding_vector
print("hits : ", hits," misses : ",count-hits)
embedding_layer = Embedding(input_dim=vocab_len,
                            output_dim=embed_vector_len,
                            input_length=maxLen, weights = [embed_matrix],
                            trainable=False)

vocab_asp_len = len(aspect_to_index)+1
embed_vector_len = maxLen

embed_matrix = np.zeros((vocab_asp_len, embed_vector_len))
hits = 0
count = 0
for word, index in aspect_to_index.items():
    count+=1
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        hits+=1
        embed_matrix[index, :] = embedding_vector
print("hits : ", hits," misses : ",count-hits)
aspect_embedding_layer = Embedding(input_dim=vocab_asp_len,
                            output_dim=embed_vector_len,
                            input_length=maxLen, weights = [embed_matrix],
                            trainable=False)

hits :  3478  misses :  141
hits :  1062  misses :  32


In [21]:
embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fe9abd10a30>

In [22]:
aspect_embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fe9abd107c0>

In [23]:
X_train_indices = tokenizer1.texts_to_sequences(X_train)

X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

X_aspect_indices = tokenizer2.texts_to_sequences(X_aspect_train)

X_aspect_indices = pad_sequences(X_aspect_indices, maxlen=maxLen, padding='post')

In [24]:
def cnn(words,aspects):
    vocab_len = len(words_to_index)+1
    embed_vector_len = maxLen

    embed_matrix = np.zeros((vocab_len, embed_vector_len))

    for word, index in words_to_index.items():
        embedding_vector = word_to_vec_map.get(word)
        if embedding_vector is not None:
            embed_matrix[index, :] = embedding_vector
    input1 = Input(shape=(300,))
    embedding_layer = Embedding(input_dim=vocab_len,
                                output_dim=embed_vector_len,
                                input_length=maxLen, weights = [embed_matrix],
                                trainable=False)(input1)
    
    x1 = Conv1D(32,8,activation='relu')(embedding_layer)
    x1 = MaxPooling1D(pool_size=4)(x1)
    #x1 = Flatten()(x1)
    
    vocab_asp_len = len(aspect_to_index)+1
    embed_vector_len = maxLen

    embed_matrix = np.zeros((vocab_asp_len, embed_vector_len))

    for word, index in aspect_to_index.items():
        embedding_vector = word_to_vec_map.get(word)
        if embedding_vector is not None:
            embed_matrix[index, :] = embedding_vector
    input2 = Input(shape=(300,))
    aspect_embedding_layer = Embedding(input_dim=vocab_asp_len,
                                output_dim=embed_vector_len,
                                input_length=maxLen, weights = [embed_matrix],
                                trainable=False)(input2)
    
    x2 = Conv1D(32,8,activation='relu')(aspect_embedding_layer)
    x2 = MaxPooling1D(pool_size=4)(x2)
    #x2 = Flatten()(x2)
    
    concat = Concatenate()([x1,x2])
    concat = Dense(64,activation='relu')(concat)
    concat = Dense(32,activation='tanh')(concat)
    #concat = GRU(16,activation='tanh',recurrent_activation='relu')(concat)
    concat = MaxPooling1D(pool_size=2)(concat)
    concat = Flatten()(concat)
    concat = Dense(1,activation='sigmoid')(concat)
    model = Model(inputs=[input1,input2],outputs=[concat])
    print(model.summary())
    return model

In [25]:
model = cnn(words_to_index,aspect_to_index)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 300)     1086000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 300, 300)     328500      input_2[0][0]                    
______________________________________________________________________________________________

In [26]:
adagrad = keras.optimizers.Adagrad(learning_rate = 0.01)

model.compile(optimizer=adagrad, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x=[X_train_indices,X_aspect_indices], y=Y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe9a9602a00>

In [27]:
# Formating test data so that we can use it
X_test_indices = tokenizer1.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

X_test_aspect_indices = tokenizer2.texts_to_sequences(X_aspect_test)

X_test_aspect_indices = pad_sequences(X_test_aspect_indices, maxlen=maxLen, padding='post')

In [28]:
model.evaluate([X_test_indices,X_test_aspect_indices], Y_test)



[0.48963725566864014, 0.7795414328575134]

In [29]:
model.evaluate([X_train_indices,X_aspect_indices], Y_train)



[0.24655933678150177, 0.913078784942627]

In [48]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
model1 = cnn(words_to_index,aspect_to_index)
# compile the model
model1.compile(optimizer=adagrad, loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
model1.fit(x=[X_train_indices,X_aspect_indices], y=Y_train, batch_size=32, epochs=10)

# evaluate the model
loss, accuracy, f1_score, precision, recall = model1.evaluate([X_test_indices,X_test_aspect_indices], Y_test, verbose=0)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 300, 300)     1086000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 300, 300)     328500      input_6[0][0]                    
____________________________________________________________________________________________

In [49]:
print("loss : " ,loss)
print("accuracy : ",accuracy)
print("f1_score : ",f1_score)
print("precision : ",precision)

loss :  0.4991653561592102
accuracy :  0.7751322984695435
f1_score :  0.8206244707107544
precision :  0.7658053636550903


In [None]:
# Model definition for a simple CNN
# def test_cnn(emb,asp_emb):
#     embedding_layer = Embedding(input_dim=vocab_len,
#                             output_dim=embed_vector_len,
#                             input_length=maxLen, weights = [embed_matrix],
#                             trainable=False)
#     model1 = Sequential()
#     model1.add(emb)
#     model1.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
#     model1.add(MaxPooling1D(pool_size=4))
#     model1.add(Flatten())
#     print(model1.summary())
#     ##
#     model2 = Sequential()
#     model2.add(asp_emb)
#     model2.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
#     model2.add(MaxPooling1D(pool_size=4))
#     model2.add(Flatten())
#     print(model2.summary())
#     concat = Concatenate([model1,model2])
#     # ADD GTRU
#     ##
#     '''
#     model3.add(MaxPooling1D(pool_size=2))
#     model3.add(Flatten())
#     model3.add(Dense(10, activation='relu'))
#     model3.add(Dense(1, activation='sigmoid'))
#     print(model3.summary())
#     return model3
#     x = activation("Relu")
    
#     x = activation("Tanh")
#     concat = MaxPooling1D(pool_size=2)(concat)
#     concat = Flatten()(concat)
#     concat = Dense(10,activation='relu')(concat)
#     concat = Dense(1,activation='relu')(concat)
#     '''
#     model = Model(inputs=[concat],outputs=[concat])
#     print(model.summary())
#     return model

In [None]:
# Model definition for a LSTM based model
# def LSTM_model(input_shape):
#     X_indices = Input(input_shape)
#     embeddings = embedding_layer(X_indices)
#     X = LSTM(128, return_sequences=True)(embeddings)
#     X = Dropout(0.6)(X)
#     X = LSTM(128, return_sequences=True)(X)
#     X = Dropout(0.6)(X)
#     X = LSTM(128)(X)
#     X = Dense(1, activation='sigmoid')(X)
#     model = Model(inputs=X_indices, outputs=X)
#     print(model.summary())
#     return model

In [None]:
# def CNN_LSTM_model(emb):
#     embedding_vecor_length = maxLen
#     model = Sequential()
#     model.add(emb)
#     model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(LSTM(256))
#     model.add(Dense(128, activation='sigmoid'))
#     model.add(Dense(64, activation='sigmoid'))
#     model.add(Dense(1, activation='softmax'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     print(model.summary())
#     return model

In [None]:
# ''' Uncommenting out one line would run the function and here, the model is based on LSTM and CNN '''
# # model = test_cnn(embedding_layer, aspect_embedding_layer)
# model = LSTM_model(768)
# # model = CNN_LSTM_model(embedding_layer)

In [None]:
# # Running the model
# '''
# adam = keras.optimizers.Adam(learning_rate = 0.0001)

# '''
# adam = keras.optimizers.Adam(learning_rate = 0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)

# model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# # model.fit(X_train_indices, Y_train, batch_size=64, epochs=5)
# model.fit(sentence_embeddings, Y_train, batch_size=64, epochs=5)

In [None]:
# # Formating test data so that we can use it
# X_test_indices = tokenizer1.texts_to_sequences(X_test)

# X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

# X_test_indices

In [None]:
# Model evaluation for test data
# model.evaluate(X_test_indices, Y_test)

In [None]:
# Model evaluation for train data
# model.evaluate(X_train_indices, Y_train)

In [33]:
%matplotlib inline

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

[nltk_data] Downloading package stopwords to /home/aswin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [35]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, Y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(Y_test, prediction_linear, output_dict=True)

Training time: 1.632158s; Prediction time: 0.321239s


In [36]:
print('positive: ', report['1'])
print('negative: ', report['0'])

positive:  {'precision': 0.8021248339973439, 'recall': 0.8296703296703297, 'f1-score': 0.8156650911546253, 'support': 728}
negative:  {'precision': 0.6745406824146981, 'recall': 0.6330049261083743, 'f1-score': 0.653113087674714, 'support': 406}


In [37]:
len(X_train)

3693

In [38]:
accuracy_score(Y_test,prediction_linear)

0.7592592592592593

In [39]:
!pip install sentence-transformers



In [40]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [41]:
sentence_embeddings = sbert_model.encode(X_train)

In [42]:
len(sentence_embeddings)

3693

In [43]:
sentence_embeddings[0]

array([ 4.34444696e-01,  6.47295833e-01,  1.86367583e+00,  1.16021030e-01,
       -1.98262960e-01,  7.22987771e-01,  1.37537980e+00,  5.59267998e-01,
        3.41259331e-01, -2.56377518e-01, -1.29588282e+00,  3.47438246e-01,
        4.10151482e-01,  2.38022372e-01, -1.63080722e-01,  5.38577139e-01,
       -3.50794792e-01, -6.85997307e-01,  2.90022790e-01, -9.21145916e-01,
        4.12595004e-01, -3.63073796e-01,  1.18249321e+00, -6.41812980e-01,
       -1.45813480e-01,  5.96415885e-02,  1.23967035e-02, -1.40196776e+00,
       -1.20230460e+00,  1.45145491e-01,  1.60908565e-01, -6.85921133e-01,
        6.51957810e-01, -2.02278748e-01, -3.32574606e-01,  5.61232686e-01,
       -9.10916179e-03,  6.21988289e-02,  1.46808401e-01,  3.51528913e-01,
        9.72895086e-01,  4.83499199e-01,  6.67923510e-01,  4.43095148e-01,
       -1.51802540e-01,  5.94743602e-02,  1.21916282e+00,  2.90922940e-01,
        8.32786143e-01, -1.08085716e+00, -9.73090172e-01,  8.80242661e-02,
        1.77330136e+00,  

In [44]:
query_vec = sbert_model.encode(X_test)

In [45]:
len(query_vec[0])

768

In [46]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(sentence_embeddings, Y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(query_vec)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(Y_test, prediction_linear, output_dict=True)

Training time: 7.230906s; Prediction time: 0.466834s


In [47]:
accuracy_score(Y_test,prediction_linear)

0.7839506172839507