### Questo notebook contiene l'implementazione di un Convolutional Neural Network che usa come vettori delle recensioni i token id. Questi vettori vengono addestrati nel layer "Embeddings" di keras col parametro trainable = True

In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, MaxPooling1D, Flatten
import pandas as pd
from preprocessing import *
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import numpy as np

In [2]:
w2v_pretrained = Word2Vec.load("w2vPreTrained")

In [3]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 6)
(3150, 6)
(2998, 6)
(2196, 6)


In [4]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [5]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [6]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [7]:
new_text, new_sent_tok = tokenize_list_of_text(X_temp, custom_stopwords, False, 2)

total number of types extracted is: 1788


In [8]:
cleaned_reviews = frequency_cleaning(new_sent_tok, 2)

In [9]:
bigrams = Phrases(cleaned_reviews, scoring="npmi", threshold=0.60) #estrae le collocazioni tramite PMI
bigrams[cleaned_reviews][2]

['little', 'feature']

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(bigrams[cleaned_reviews], y, test_size=0.20, random_state=10)

In [11]:
negative_reviews = []
for rev, label in zip(X_train, Y_train):
    if label == 0:
        negative_reviews.append(rev)
    
generated_reviews = generate_samples(negative_reviews, int(len(negative_reviews)/2), w2v_pretrained)

# run this only one time
X_train.extend(generated_reviews)
Y_train.extend([0 for x in generated_reviews])

['joke']
-
['laugh']
--------------------------
['price', 'product', 'nice', 'quality', 'nice', 'feature', 'definitely', 'reason', 'give', 'think', 'may', 'buyer', 'error', 'first', 'ignore', 'product', 'plug', 'time', 'work', 'really', 'unlike', 'not_a', 'stand', 'device', 'also', 'speaker', 'not_very', 'loud', 'buy', 'bluetooth', 'speaker', 'sure', 'lot', 'figure', 'use', 'kind', 'seem', 'like', 'device', 'plus', 'set', 'awful', 'would', 'definitely', 'money', 'buy', 'one', 'actually']
-
['cost', 'product', 'decent', 'quality', 'decent', 'feature', 'definitely', 'reason', 'present', 'imagine', 'may', 'buyer', 'error', 'start', 'ignore', 'product', 'plug', 'sentence', 'run', 'truly', 'different', 'not_a', 'base', 'device', 'also', 'speaker', 'not_very', 'loudly', 'purchase', 'bluetooth', 'speaker', 'sure', 'plenty', 'figure', 'utilize', 'kind', 'seem', 'wish', 'device', 'plus', 'adjust', 'terrible', 'would', 'definitely', 'money', 'purchase', 'one', 'really']
-------------------------

In [12]:
t = Tokenizer()
t.fit_on_texts(X_train)
X_train_encoded = t.texts_to_sequences(X_train)
max_length = len(max(bigrams[cleaned_reviews], key = len))
Xtrain = pad_sequences(X_train_encoded, maxlen=max_length, padding='post')

In [13]:
X_test_encoded = t.texts_to_sequences(X_test)
Xtest = pad_sequences(X_test_encoded, maxlen=max_length, padding='post')

In [14]:
vocab_size = len(t.word_index) + 1
vocab_size

701

In [15]:
from keras.utils import to_categorical
Y_train_hot = to_categorical(Y_train)
Y_test_hot = to_categorical(Y_test)

In [38]:
from keras.callbacks import EarlyStopping

# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='binary_accuracy', patience=25)

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length, trainable = True))
model.add(Dropout(0.2))
model.add(Conv1D(filters=16, kernel_size=10, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='sigmoid'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          70100     
                                                                 
 dropout_10 (Dropout)        (None, 100, 100)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 91, 16)            16016     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 45, 16)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 720)               0         
                                                                 
 dropout_11 (Dropout)        (None, 720)               0         
                                                      

In [39]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
# fit network
model.fit(Xtrain, Y_train_hot, epochs=100, verbose=2, validation_split=0.1, callbacks = [early_stopping], batch_size=16)

Epoch 1/100
33/33 - 3s - loss: 0.6606 - binary_accuracy: 0.6238 - val_loss: 0.9939 - val_binary_accuracy: 0.0000e+00 - 3s/epoch - 105ms/step
Epoch 2/100
33/33 - 1s - loss: 0.6449 - binary_accuracy: 0.6423 - val_loss: 0.9911 - val_binary_accuracy: 0.0000e+00 - 589ms/epoch - 18ms/step
Epoch 3/100
33/33 - 1s - loss: 0.6124 - binary_accuracy: 0.6793 - val_loss: 0.7919 - val_binary_accuracy: 0.3684 - 598ms/epoch - 18ms/step
Epoch 4/100
33/33 - 1s - loss: 0.4678 - binary_accuracy: 0.8012 - val_loss: 0.3938 - val_binary_accuracy: 0.7193 - 661ms/epoch - 20ms/step
Epoch 5/100
33/33 - 1s - loss: 0.2267 - binary_accuracy: 0.9035 - val_loss: 0.2879 - val_binary_accuracy: 0.8772 - 602ms/epoch - 18ms/step
Epoch 6/100
33/33 - 1s - loss: 0.1343 - binary_accuracy: 0.9483 - val_loss: 0.2560 - val_binary_accuracy: 0.8772 - 583ms/epoch - 18ms/step
Epoch 7/100
33/33 - 1s - loss: 0.0532 - binary_accuracy: 0.9815 - val_loss: 0.2221 - val_binary_accuracy: 0.9035 - 586ms/epoch - 18ms/step
Epoch 8/100
33/33 - 1

<keras.callbacks.History at 0x226baf4a350>

In [40]:
predictions = model.predict(Xtest)



In [41]:
predictions = np.round(predictions)

In [32]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_hot, predictions))

              precision    recall  f1-score   support

           0       0.74      0.71      0.72        41
           1       0.86      0.88      0.87        83

   micro avg       0.82      0.82      0.82       124
   macro avg       0.80      0.79      0.80       124
weighted avg       0.82      0.82      0.82       124
 samples avg       0.82      0.82      0.82       124



In [33]:
import gensim
print(gensim.__version__)

4.2.0
