In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, MaxPooling1D, Flatten
import pandas as pd
from preprocessing import *
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import numpy as np

In [2]:
glove_twitter_25 = []

In [2]:
print("loading pre trained embeddings, this can take some minutes...")
glove_twitter_25 = KeyedVectors.load_word2vec_format('glove-twitter-25.txt', binary=False)
print("loading complete.")

loading pre trained embeddings, this can take some minutes...
loading complete.


In [11]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2322, 5)


In [12]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [13]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 442, 0: 221})


In [14]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [15]:
new_text, new_sent_tok = tokenize_list_of_text(X_temp, custom_stopwords, False, pos_list)

total number of types extracted is: 1810


In [16]:
def frequency_cleaning(new_sent_tok, n):
    
    tot_tokens = []

    for sent in new_sent_tok:
        for tok in sent:
            tot_tokens.append(tok)

    freqs = nltk.FreqDist(tot_tokens)
    cleaned_reviews = []

    for sent in new_sent_tok:
        clean_sent = []
        for tok in sent:
            if freqs[tok] > n:
                clean_sent.append(tok)
        cleaned_reviews.append(clean_sent)

    return cleaned_reviews

In [17]:
cleaned_reviews = frequency_cleaning(new_sent_tok, 2)

In [18]:
bigrams = Phrases(cleaned_reviews, scoring="npmi", threshold=0.60) #estrae le collocazioni tramite PMI
bigrams[cleaned_reviews][2]

['little', 'feature']

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(bigrams[cleaned_reviews], y, test_size=0.20, random_state=10)

In [21]:
t = Tokenizer()
t.fit_on_texts(X_train)
X_train_encoded = t.texts_to_sequences(X_train)
max_length = len(max(bigrams[cleaned_reviews], key = len))
Xtrain = pad_sequences(X_train_encoded, maxlen=max_length, padding='post')

In [22]:
X_test_encoded = t.texts_to_sequences(X_test)
Xtest = pad_sequences(X_test_encoded, maxlen=max_length, padding='post')

In [23]:
vocab_size = len(t.word_index) + 1

In [24]:
from imblearn.under_sampling import TomekLinks

undersampler_nn = TomekLinks(sampling_strategy="all") # use tomeLinks to remove ambiguous data through nearest neighbours


# Fit and transform the X and y data
X_train_resampled, y_train_resampled = undersampler_nn.fit_resample(Xtrain, Y_train)


print('Resampled dataset shape %s' % Counter(y_train_resampled))

Resampled dataset shape Counter({1: 325, 0: 153})


In [25]:
from keras.utils import to_categorical
Y_train_hot = to_categorical(y_train_resampled)
Y_test_hot = to_categorical(Y_test)
Y_train_hot

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.

In [27]:
from keras.callbacks import EarlyStopping

# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='binary_accuracy', patience=25)

model = Sequential()
model.add(Embedding(vocab_size, 300, input_length=max_length, trainable = True))
model.add(Dropout(0.2))
model.add(Conv1D(filters=16, kernel_size=10, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(150, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='sigmoid'))
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 98, 300)           185700    
                                                                 
 dropout_5 (Dropout)         (None, 98, 300)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 89, 16)            48016     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 44, 16)           0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 704)               0         
                                                                 
 dropout_6 (Dropout)         (None, 704)               0         
                                                      

In [28]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
# fit network
model.fit(X_train_resampled, Y_train_hot, epochs=100, verbose=2, validation_split=0.1, callbacks = [early_stopping])

Epoch 1/100
14/14 - 3s - loss: 0.6574 - binary_accuracy: 0.6674 - val_loss: 0.6714 - val_binary_accuracy: 0.6979 - 3s/epoch - 182ms/step
Epoch 2/100
14/14 - 1s - loss: 0.6406 - binary_accuracy: 0.6802 - val_loss: 0.6004 - val_binary_accuracy: 0.6875 - 756ms/epoch - 54ms/step
Epoch 3/100
14/14 - 1s - loss: 0.6008 - binary_accuracy: 0.6837 - val_loss: 0.5958 - val_binary_accuracy: 0.7396 - 782ms/epoch - 56ms/step
Epoch 4/100
14/14 - 1s - loss: 0.5495 - binary_accuracy: 0.7605 - val_loss: 0.6360 - val_binary_accuracy: 0.7292 - 715ms/epoch - 51ms/step
Epoch 5/100
14/14 - 1s - loss: 0.4586 - binary_accuracy: 0.8326 - val_loss: 0.6518 - val_binary_accuracy: 0.7083 - 636ms/epoch - 45ms/step
Epoch 6/100
14/14 - 1s - loss: 0.3188 - binary_accuracy: 0.8837 - val_loss: 0.7454 - val_binary_accuracy: 0.7500 - 642ms/epoch - 46ms/step
Epoch 7/100
14/14 - 1s - loss: 0.2263 - binary_accuracy: 0.9163 - val_loss: 0.7030 - val_binary_accuracy: 0.7708 - 624ms/epoch - 45ms/step
Epoch 8/100
14/14 - 1s - loss

<keras.callbacks.History at 0x1a5e52a32e0>

In [29]:
predictions = model.predict(Xtest)



In [30]:
predictions = np.round(predictions)

In [31]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_hot, predictions))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        42
           1       0.93      0.87      0.90        91

   micro avg       0.86      0.86      0.86       133
   macro avg       0.83      0.86      0.84       133
weighted avg       0.87      0.86      0.86       133
 samples avg       0.86      0.86      0.86       133



In [89]:
import gensim
print(gensim.__version__)

4.2.0
