In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')

from mi_helper import *

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

from numpy import array 

In [3]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
# data_test = pd.read_csv("test_set.csv", sep = "\t")
# data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
# dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True) #droppa recensioni con 3-4 stelle
print(dataset["feedback"].value_counts())
dataset.info()

1    2286
0     257
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2543 entries, 0 to 3148
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            2543 non-null   int64 
 1   date              2543 non-null   object
 2   variation         2543 non-null   object
 3   verified_reviews  2543 non-null   object
 4   feedback          2543 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.2+ KB


In [6]:
# levare i numeri

def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
    """
    if treebank_tag.startswith('J'):
        return "a"
    elif treebank_tag.startswith('V'):
        return "v"
    elif treebank_tag.startswith('N'):
        return "n"
    elif treebank_tag.startswith('R'):
        return "r"
    else:
        return "n"
        
custom_stopwords = ["echo", "alexa", "dot", "amzon", "prime", "2nd", "generation", "1st", "3rd", "4th", "5th"]
pos_list = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

def tokenize_list_of_text(list_of_text, custom_stopwords = [], pos_filter = False, pos_list = []):
    """Tokenizza tutte le recensioni, pulisce da stopwords, elimina token <= 2 caratteri e lemmatizza. Ritorna la colonna delle recensioni."""

    tokenizer = nltk.tokenize.TweetTokenizer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokenized_reviews = []
    sent_tokenized_reviews = []
    for review in list_of_text: #pulisce le recensioni
        clean_text = ""
        tokens = tokenizer.tokenize(review)
        tokens = [w.lower() for w in tokens]
        tokens_pos = pos_tag(tokens)
        if pos_filter:
            clean_tokens = [(w, get_wordnet_pos(pos)) for w, pos in tokens_pos if w not in string.punctuation and len(w)>2 and w not in stopwords.words("english") and w not in custom_stopwords and pos in pos_list]
        else:
            clean_tokens = [(w, get_wordnet_pos(pos)) for w, pos in tokens_pos if w not in string.punctuation and len(w)>2 and w not in stopwords.words("english") and w not in custom_stopwords]
        lemmatized_tokens = [lemmatizer.lemmatize(w, pos) for w, pos in clean_tokens]
        sent_tokenized_reviews.append(lemmatized_tokens)
        for t in lemmatized_tokens:
            clean_text += " " + t
        tokenized_reviews.append(clean_text)

    return tokenized_reviews,  sent_tokenized_reviews # ritorna una tupla!

In [16]:
tokenized_reviews, sentences = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords)

dataset["verified_reviews"] = tokenized_reviews

In [230]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
def create_balanced_dataset(dataset):
    """Bilancia il dataset uniformando il numero di recensioni negative e positive. Prende in input il dataset"""
    reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
    reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
    reviews_1.sort(key=len, reverse = True)
    sample_1 = reviews_1[:len(reviews_0)]
    verified_reviews_sample = []
    feedback_sample = []
    verified_reviews_sample.extend(sample_1)
    verified_reviews_sample.extend(reviews_0)
    feedback_sample.extend([1 for i in range(len(sample_1))])
    feedback_sample.extend([0 for i in range(len(reviews_0))])
    dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
    print(dataset["feedback"].value_counts())
    return dataset


dataset = create_balanced_dataset(dataset)

1    257
0    257
Name: feedback, dtype: int64


In [17]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [18]:
text_counts

<2543x9082 sparse matrix of type '<class 'numpy.int64'>'
	with 40104 stored elements in Compressed Sparse Row format>

In [19]:
len(cv.vocabulary_)

9082

In [20]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'love': 4443,
 'lot': 4404,
 'fun': 2654,
 'thing': 7800,
 'old': 5347,
 'learns': 3975,
 'control': 1450,
 'light': 4006,
 'play': 5632,
 'game': 2734,
 'like': 4051,
 'nice': 5247,
 'sound': 7172,
 'music': 4937,
 'lot fun': 4412,
 'control light': 1459,
 'play game': 5649,
 'game like': 2737,
 'nice sound': 5271,
 'sound play': 7228,
 'play music': 5664,
 'play game like': 5650,
 'sound play music': 7229,
 'receive': 6322,
 'gift': 2769,
 'need': 5132,
 'bluetooth': 846,
 'easily': 1999,
 'accessible': 135,
 'smart': 7064,
 'speaker': 7303,
 'wait': 8592,
 'receive gift': 6323,
 'gift need': 2779,
 'need bluetooth': 5137,
 'bluetooth play': 854,
 'music easily': 4983,
 'easily accessible': 2000,
 'accessible smart': 136,
 'smart speaker': 7103,
 'speaker wait': 7393,
 'receive gift need': 6324,
 'gift need bluetooth': 2780,
 'need bluetooth play': 5138,
 'bluetooth play music': 855,
 'play music easily': 5673,
 'music easily accessible': 4984,
 'easily accessible smart': 2001,
 'ac

In [21]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

9082

In [22]:
X_train[0,:] # prima recensione

<1x9082 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [23]:
print(X_train[0,:])

  (0, 7800)	2
  (0, 6020)	1
  (0, 4227)	1
  (0, 2801)	1
  (0, 2572)	1
  (0, 921)	1
  (0, 4208)	1
  (0, 2573)	1
  (0, 619)	1
  (0, 8624)	1
  (0, 1716)	1
  (0, 3082)	1
  (0, 3083)	1
  (0, 441)	1
  (0, 9030)	1
  (0, 1730)	1
  (0, 4376)	1
  (0, 1344)	1
  (0, 7810)	1
  (0, 8642)	1
  (0, 5461)	1
  (0, 2598)	1
  (0, 2816)	1


In [24]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['thing', 'purchase', 'listen', 'good', 'flash', 'briefing', 'list',
        'flash briefing', 'awesome', 'want', 'definitely', 'grocery',
        'grocery list', 'app', 'worth', 'definitely worth', 'longer',
        'complaint', 'thing awesome', 'want listen', 'pair', 'forever',
        'good complaint'], dtype='<U34')]

In [25]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

thing 2
purchase 1
listen 1
good 1
flash 1
briefing 1
list 1
flash briefing 1
awesome 1
want 1
definitely 1
grocery 1
grocery list 1
app 1
worth 1
definitely worth 1
longer 1
complaint 1
thing awesome 1
want listen 1
pair 1
forever 1
good complaint 1


In [26]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=5000)  # feature selection
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [27]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

5000

In [28]:
X_train_sel

<2034x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 24191 stored elements in Compressed Sparse Row format>

In [29]:
print(X_test_sel[0,:])

  (0, 1611)	1
  (0, 889)	1
  (0, 649)	1
  (0, 0)	2
  (0, 455)	1
  (0, 925)	1
  (0, 1675)	1
  (0, 3639)	1
  (0, 652)	1
  (0, 1678)	1
  (0, 926)	1


In [30]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['app', 'awesome', 'briefing', 'complaint', 'definitely',
       'definitely worth', 'flash briefing', 'forever', 'good', 'grocery',
       'grocery list', 'list', 'listen', 'longer', 'pair', 'purchase',
       'thing', 'thing awesome', 'want', 'want listen'], dtype='<U34')]


In [31]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [32]:
print(X_train_vec[0,:])

  (0, 4666)	0.28276722176670854
  (0, 4655)	0.16073552986635484
  (0, 4082)	0.2567022328097468
  (0, 4076)	0.2627971597082392
  (0, 2819)	0.14766660216121175
  (0, 2385)	0.2567022328097468
  (0, 1565)	0.24145519168821314
  (0, 1468)	0.15701854787392402
  (0, 1458)	0.1914745890690606
  (0, 938)	0.24588428497431872
  (0, 937)	0.24588428497431872
  (0, 856)	0.1305137430224195
  (0, 808)	0.2719492739312805
  (0, 794)	0.2567022328097468
  (0, 555)	0.24588428497431872
  (0, 549)	0.20457225489582329
  (0, 444)	0.21753957802602236
  (0, 299)	0.2374932327361439
  (0, 204)	0.17478106845065883
  (0, 131)	0.1937543070603952


In [33]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['app', 'awesome', 'briefing', 'complaint', 'definitely',
        'definitely worth', 'flash briefing', 'forever', 'good', 'grocery',
        'grocery list', 'list', 'listen', 'longer', 'pair', 'purchase',
        'thing', 'thing awesome', 'want', 'want listen'], dtype='<U34')]

In [34]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[1,:]))[0],X_train_vec[1,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

... 0.2637 1
love 0.341 1
love song 0.3161 1
news 0.4662 1
notification 0.3444 1
screen 0.5254 1
song 0.1476 1
time 0.284 1


In [35]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [36]:
scores_df.sort_values(by = "tf_idf_score", ascending = True).tail(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
9254,love,1.0,1
24032,need,1.0,1
5513,love,1.0,1
198,love,1.0,1
5495,new,1.0,1
9480,love,1.0,1
5302,love,1.0,1
5217,love,1.0,1
5214,like,1.0,1
5187,love,1.0,1


In [37]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [38]:
len(predictions), sum(predictions)

(509, 477)

In [39]:
predictions

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [40]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.66      0.48      0.55        44
           1       0.95      0.98      0.96       465

    accuracy                           0.93       509
   macro avg       0.80      0.73      0.76       509
weighted avg       0.93      0.93      0.93       509



In [41]:
feature_names = cv.get_feature_names_out()
feature_names

array(['. .', '...', '... amazing', ..., 'zero', 'zigbee', 'zigbee hub'],
      dtype=object)

In [42]:
print(select.scores_)

[ 0.1169687  33.205122    0.1169687  ... 10.10471727 10.10471727
  3.33313224]


In [43]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

9082

In [44]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(107.63000117674746, True, 'device'),
 (35.530315335174144, True, 'connection'),
 (35.005524339843284, True, 'ask'),
 (34.197183098591545, True, 'auxiliary jack'),
 (34.197183098591545, True, 'auxiliary'),
 (34.197183098591545, True, 'attempt'),
 (33.20512200053699, True, '...'),
 (25.64788732394366, True, 'awful'),
 (25.64788732394366, True, 'apple music'),
 (25.64788732394366, True, 'advertising')]

In [45]:
type(svm)

sklearn.svm._classes.LinearSVC

In [46]:
svm.coef_

array([[ 0.08108568,  0.2134921 , -0.25679056, ..., -0.13421372,
        -0.15026879, -0.11282501]])

In [47]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

4524

In [48]:
feats_w_classifier_weight[-100:] #features positive

[(0.35401947700140124, 'schedule'),
 (0.3542998021340719, 'convenient'),
 (0.3554540855863375, 'product want'),
 (0.3555948735894896, 'like far'),
 (0.35722661538254363, 'work need'),
 (0.3585490768342656, 'daughter'),
 (0.35876874150533244, 'privacy'),
 (0.3624972375708101, 'screen nice'),
 (0.36604536484237954, 'service'),
 (0.36797743852988113, 'easy setup'),
 (0.3716071873772213, 'tell'),
 (0.37246854840885396, 'ring'),
 (0.3729911740145367, 'dad'),
 (0.3742495028060518, 'home device'),
 (0.3749931241031757, 'right'),
 (0.3770037670519177, 'thanks'),
 (0.3778177828304608, 'television'),
 (0.37781943217059827, 'pleased'),
 (0.3787172827270539, 'add'),
 (0.38047609802726, 'entertainment'),
 (0.38170258559420484, 'young'),
 (0.3817518458697581, 'described'),
 (0.3821900026740203, 'new'),
 (0.3842638655088755, 'alarm clock'),
 (0.3900316352547312, 'work expect'),
 (0.39165176676775143, 'information'),
 (0.39284710431704706, 'want bedroom'),
 (0.39380491183193905, 'know refurbish'),
 (0

In [49]:
feats_w_classifier_weight[:100] #features negative

[(-1.6731301337553217, 'return'),
 (-1.3987575624757391, 'send'),
 (-1.3666985578939603, 'useless'),
 (-1.3246545766690625, 'adapter'),
 (-1.3220432466673588, 'poor'),
 (-1.317031121173544, 'dont'),
 (-1.2976038832532963, 'terrible'),
 (-1.2672497033063554, 'disconnect'),
 (-1.2368071112836916, 'dumb'),
 (-1.2308263268100141, 'disappointed'),
 (-1.1783466525781168, 'month'),
 (-1.1331618499878005, 'stop'),
 (-1.1232163648126834, 'really work'),
 (-1.1054947378108715, 'slow'),
 (-1.0863427343836969, 'meh'),
 (-1.0679847423695452, 'half'),
 (-1.0676457997920215, 'work time'),
 (-1.0546311508606845, 'español'),
 (-1.0310323487610682, 'feature'),
 (-1.0266930470613473, 'figure use'),
 (-1.0134134000140929, 'fan'),
 (-1.0095588150975565, 'suck'),
 (-1.0063735037112054, 'realize'),
 (-0.9941150968589214, 'youtube'),
 (-0.9813249571796747, 'volume'),
 (-0.9780237990432026, 'try'),
 (-0.9561821813156264, 'honestly'),
 (-0.9529623476260968, 'mode'),
 (-0.9443939734021856, 'turn'),
 (-0.94312930

In [50]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores").head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
0,-1.67313,return
1,-1.398758,send
2,-1.366699,useless
3,-1.324655,adapter
4,-1.322043,poor
5,-1.317031,dont
6,-1.297604,terrible
7,-1.26725,disconnect
8,-1.236807,dumb
9,-1.230826,disappointed


In [52]:
stringa = "the product is disappointing, audio sounds bad"
clean = tokenize_list_of_text([stringa])[0]
clean

[' product disappointing audio sound bad']

In [53]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [54]:
MNB = MultinomialNB()  # linear svm with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.07      1.00      0.13         3
           1       1.00      0.92      0.96       506

    accuracy                           0.92       509
   macro avg       0.53      0.96      0.54       509
weighted avg       0.99      0.92      0.95       509

