In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import re
import gensim
from gensim.models import KeyedVectors
import numpy as np
from collections import Counter
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases
from nltk import sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
from preprocessing import *

In [13]:
w2v_pretrained = Word2Vec.load("w2vPreTrained")

In [2]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2196, 5)


In [3]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [5]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [8]:
new_text, new_sent_tok = tokenize_list_of_text(X_temp, custom_stopwords, False, 2)

total number of types extracted is: 1798


In [9]:
cleaned_reviews = frequency_cleaning(new_sent_tok, 2)

In [10]:
bigrams = Phrases(cleaned_reviews, scoring="npmi", threshold=0.60) #estrae le collocazioni tramite PMI
bigrams[cleaned_reviews][0]

['like',
 'fact',
 'answer',
 'not_see',
 'real',
 'need',
 'household',
 'though',
 'good',
 'day',
 'deal']

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(bigrams[cleaned_reviews], y, test_size=0.20, random_state=10)

In [14]:
negative_reviews = []
for rev, label in zip(X_train, Y_train):
    if label == 0:
        negative_reviews.append(rev)
    
generated_reviews = generate_samples(negative_reviews, int(len(negative_reviews)/2), w2v_pretrained)

# run this only one time
X_train.extend(generated_reviews)
Y_train.extend([0 for x in generated_reviews])

['joke']
-
['laugh']
--------------------------
['price', 'product', 'nice', 'quality', 'nice', 'feature', 'definitely', 'reason', 'give', 'think', 'may', 'buyer', 'error', 'first', 'ignore', 'product', 'plug', 'time', 'work', 'really', 'unlike', 'not_a', 'stand', 'device', 'also', 'speaker', 'not_very', 'loud', 'buy', 'bluetooth', 'speaker', 'sure', 'lot', 'figure', 'use', 'kind', 'seem', 'like', 'device', 'plus', 'set', 'awful', 'would', 'definitely', 'money', 'buy', 'one', 'actually']
-
['cost', 'product', 'decent', 'quality', 'decent', 'feature', 'definitely', 'reason', 'present', 'imagine', 'may', 'buyer', 'error', 'start', 'ignore', 'product', 'plug', 'sentence', 'run', 'truly', 'different', 'not_a', 'base', 'device', 'also', 'speaker', 'not_very', 'loudly', 'purchase', 'bluetooth', 'speaker', 'sure', 'plenty', 'figure', 'utilize', 'kind', 'seem', 'wish', 'device', 'plus', 'adjust', 'terrible', 'would', 'definitely', 'money', 'purchase', 'one', 'really']
-------------------------

In [15]:
tfidf = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False, min_df = 0)
tfidf_model = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
review = 0
for score, feature in zip(tfidf_model.toarray()[review], tfidf.get_feature_names_out()):
    if score > 0.0:
        print(feature, score)

box 0.2839771637431346
even_though 0.27762788039582936
far 0.21272579636892072
happy 0.5156101732954667
like 0.16405526505032533
look 0.22704895144905052
new 0.214553313857349
original 0.2668104048749314
perfectly 0.2994851270228816
purchase 0.20289746058223077
refurbish 0.2294451949681321
two 0.23734809025484485
week 0.27194825290277225
work 0.14728732380665022


In [16]:
w2v_model = Word2Vec(X_train, vector_size=100, window = 10, min_count = 0, sg=1, hs = 1, seed = 5, epochs=100)

In [19]:
w2v_model.wv.most_similar("love", topn = 10)

[('great', 0.5367417931556702),
 ('get', 0.49619752168655396),
 ('use', 0.4509217441082001),
 ('kitchen', 0.44389069080352783),
 ('cook', 0.43931204080581665),
 ('show', 0.4102255403995514),
 ('fun', 0.39854246377944946),
 ('like', 0.3985327482223511),
 ('recognition', 0.39414140582084656),
 ('trailer', 0.387723833322525)]

In [20]:
w2v_model.wv.most_similar("love", topn = 2000)[-10:]

[('free', -0.07402048259973526),
 ('sluggish', -0.07937180995941162),
 ('place', -0.0795297846198082),
 ('spend', -0.08012556284666061),
 ('mood', -0.08532152324914932),
 ('blue', -0.09613428264856339),
 ('laugh', -0.10050835460424423),
 ('reboot', -0.1030278429389),
 ('unplug', -0.10371576994657516),
 ('also', -0.12003546953201294)]

In [57]:
def review_vectors(tokens, size = 300, weights = [], pretrained = False, pretrained_embeddings = None):
    
    """Genera un vettore per ogni recensione: questo vettore
    è calcolato come la media ponderata (t * w: token vettore * peso tfidf) dei vettori dei token nella recensione.
    Il vettore risultante è normalizzato alla fine."""

    vec = np.zeros(size).reshape((1, size))
    count = 0

    for word, weight in zip(tokens, weights):
        try:
            if pretrained:
                vec += pretrained_embeddings[word] 
                count +=1
            else:
                vec += w2v_model.wv[word] * weight
                count +=1
        except KeyError:
            # print("non trovo", word)
            continue

    if count!= 0:
        vec = vec / norm(vec)
        
    return vec

In [58]:
w2v_X_train = np.zeros((len(X_train), w2v_model.vector_size))
for i in range(len(X_train)):
    w2v_X_train[i,:] = review_vectors(tfidf.inverse_transform(tfidf_model[i, :])[0], w2v_model.vector_size, tfidf_model[i,:].data, False)
w2v_X_train.shape

(570, 100)

In [59]:
w2v_X_test = np.zeros((len(X_test), w2v_model.vector_size))
for i in range(len(X_test)):
    w2v_X_test[i,:] = review_vectors(tfidf.inverse_transform(X_test_tf[i, :])[0], w2v_model.vector_size, X_test_tf[i,:].data, False)
w2v_X_test.shape

(124, 100)

In [60]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

# # Create the SMOTE object
# sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=10)

# # Apply SMOTE to the training data
# X_train_resampled, y_train_resampled = sm.fit_resample(w2v_X_train, Y_train)

# print('Resampled dataset shape %s' % Counter(y_train_resampled))

undersampler_nn = TomekLinks(sampling_strategy="all") # use tomeLinks to remove ambiguous data through nearest neighbours


# Fit and transform the X and y data
X_train_resampled, y_train_resampled = undersampler_nn.fit_resample(w2v_X_train, Y_train)

# X_train_resampled, y_train_resampled = w2v_X_train, Y_train


print('Resampled dataset shape %s' % Counter(y_train_resampled))

Resampled dataset shape Counter({1: 317, 0: 229})


In [61]:
from keras.layers import Flatten

In [62]:
from keras.utils import to_categorical
Y_train_hot = to_categorical(y_train_resampled)

In [63]:
Y_test_hot = to_categorical(Y_test)

In [64]:
from keras.callbacks import EarlyStopping

# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='binary_accuracy', patience=25)

# Fit the model with early stopping
model = Sequential()
model.add(Dense(300, activation='relu', input_dim=w2v_model.vector_size))
model.add(Dropout(0.2))
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='sigmoid'))

# Compile your model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

In [85]:
model.fit(X_train_resampled, Y_train_hot, epochs=100, batch_size=32, verbose=2, validation_split=0.1, callbacks = [early_stopping])

Epoch 1/100
16/16 - 0s - loss: 0.0226 - binary_accuracy: 0.9847 - val_loss: 1.2443e-04 - val_binary_accuracy: 1.0000 - 229ms/epoch - 14ms/step
Epoch 2/100
16/16 - 0s - loss: 0.0201 - binary_accuracy: 0.9847 - val_loss: 1.3524e-04 - val_binary_accuracy: 1.0000 - 160ms/epoch - 10ms/step
Epoch 3/100
16/16 - 0s - loss: 0.0149 - binary_accuracy: 0.9878 - val_loss: 5.5833e-05 - val_binary_accuracy: 1.0000 - 203ms/epoch - 13ms/step
Epoch 4/100
16/16 - 0s - loss: 0.0149 - binary_accuracy: 0.9898 - val_loss: 5.9827e-05 - val_binary_accuracy: 1.0000 - 148ms/epoch - 9ms/step
Epoch 5/100
16/16 - 0s - loss: 0.0150 - binary_accuracy: 0.9908 - val_loss: 7.9236e-05 - val_binary_accuracy: 1.0000 - 153ms/epoch - 10ms/step
Epoch 6/100
16/16 - 0s - loss: 0.0142 - binary_accuracy: 0.9929 - val_loss: 9.0047e-05 - val_binary_accuracy: 1.0000 - 168ms/epoch - 10ms/step
Epoch 7/100
16/16 - 0s - loss: 0.0143 - binary_accuracy: 0.9908 - val_loss: 9.2151e-05 - val_binary_accuracy: 1.0000 - 148ms/epoch - 9ms/step
E

<keras.callbacks.History at 0x158ec54ccd0>

In [86]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 300)               30300     
                                                                 
 dropout_9 (Dropout)         (None, 300)               0         
                                                                 
 dense_13 (Dense)            (None, 300)               90300     
                                                                 
 dropout_10 (Dropout)        (None, 300)               0         
                                                                 
 dense_14 (Dense)            (None, 100)               30100     
                                                                 
 dropout_11 (Dropout)        (None, 100)               0         
                                                                 
 dense_15 (Dense)            (None, 2)                

In [87]:
predictions = model.predict(w2v_X_test) 



In [88]:
predictions = np.round(predictions)

In [89]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_hot, predictions))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70        41
           1       0.85      0.84      0.85        83

   micro avg       0.80      0.80      0.80       124
   macro avg       0.77      0.78      0.77       124
weighted avg       0.80      0.80      0.80       124
 samples avg       0.80      0.80      0.80       124

