In [46]:
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from mi_helper import *

#fare custom vectorizer

In [47]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
dataset.head(2)
data_test = pd.read_csv("test_set.csv", sep = "\t")
data_test.drop("Unnamed: 0", axis = 1, inplace = True)
dataset = pd.concat([data_test, dataset])
dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset["feedback"].value_counts())
dataset.info()

1    2095
0     452
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2547 entries, 2 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   variation         2547 non-null   object
 1   verified_reviews  2547 non-null   object
 2   rating            2547 non-null   int64 
 3   date              2547 non-null   object
 4   feedback          2547 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.4+ KB


In [48]:
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = nltk.WordNetLemmatizer()
tokenized_reviews = []
for review in dataset["verified_reviews"]:
    clean_text = ""
    tokens = tokenizer.tokenize(review)
    clean_tokens = [w.lower() for w in tokens if w not in string.punctuation and len(w)>2]
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]
    for t in lemmatized_tokens:
        clean_text += " " + t
    tokenized_reviews.append(clean_text)

dataset["verified_reviews"] = tokenized_reviews

In [49]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# 
# reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
# reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
# reviews_1.sort(key=len, reverse = True)
# sample_1 = reviews_1[:len(reviews_0)]
# verified_reviews_sample = []
# feedback_sample = []
# verified_reviews_sample.extend(sample_1)
# verified_reviews_sample.extend(reviews_0)
# feedback_sample.extend([1 for i in range(len(sample_1))])
# feedback_sample.extend([0 for i in range(len(reviews_0))])


# sample_dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
# sample_dataset

In [50]:
##################################################################
# CALCOLO VETTORI COL MIO METODO
X_train, X_test, Y_train, Y_test = train_test_split(dataset["verified_reviews"], dataset["feedback"], test_size = 0.20, random_state=10)

data_dict = {0:{"text":""}, 1:{"text":""}}
for rev, feedback in zip(X_train, Y_train):
    data_dict[feedback]["text"]+= " " + rev

w_vect, bi_vect = vectors_creator(data_dict, normalize=False)
# print({k: v for k, v in sorted(w_vect[0].items(), key=lambda item: item[1], reverse=True)})
predicted = predict(X_test, w_vect, bi_vect, True, False)
print(metrics.classification_report(Y_test, predicted))
#provare ad aggiungere un peso per la lunghezza della recensione
#droppare le recensioni con 3 stelle

              precision    recall  f1-score   support

           0       0.60      0.78      0.68        95
           1       0.95      0.88      0.91       415

    accuracy                           0.86       510
   macro avg       0.77      0.83      0.80       510
weighted avg       0.88      0.86      0.87       510



In [51]:
#####################################################################
# COUNT VECTOR - MULTI NOMIAL NAIVE BAYES
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = cv.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
predicted = MNB.predict(X_test)
print(metrics.classification_report(Y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.74      0.83        95
           1       0.94      0.99      0.97       415

    accuracy                           0.94       510
   macro avg       0.94      0.86      0.90       510
weighted avg       0.94      0.94      0.94       510



In [52]:
#####################################################################
# TFIDF - MULTI NOMIAL NAIVE BAYES
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = tfidf.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
predicted = MNB.predict(X_test)
print(metrics.classification_report(Y_test, predicted))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        95
           1       0.81      1.00      0.90       415

    accuracy                           0.81       510
   macro avg       0.41      0.50      0.45       510
weighted avg       0.66      0.81      0.73       510



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
#####################################################################
# TFIDF - SVM (support vector machine)
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = tfidf.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
svc_tfid = LinearSVC(class_weight = "balanced")
svc_tfid.fit(X_train, Y_train)
pred = svc_tfid.predict(X_test)
print(metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.90      0.78      0.84        95
           1       0.95      0.98      0.97       415

    accuracy                           0.94       510
   macro avg       0.93      0.88      0.90       510
weighted avg       0.94      0.94      0.94       510



In [55]:
#####################################################################
# COUNT VECTOR - SVM (support vector machine)
cv = CountVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = cv.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
cv_tfid = LinearSVC(class_weight = "balanced")
cv_tfid.fit(X_train, Y_train)
pred = cv_tfid.predict(X_test)
print(metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.63      0.75        95
           1       0.92      0.99      0.95       415

    accuracy                           0.92       510
   macro avg       0.92      0.81      0.85       510
weighted avg       0.92      0.92      0.91       510





In [56]:
vector = tfidf.transform(["bad product"])
predicted = svc_tfid.predict(vector)
print(predicted)

[0]


In [59]:
predicted = predict(["my overall experience is a mess"], w_vect,bi_vect,True, False)
print(predicted)

[0]
