In [3]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [4]:
# Load the small English pipeline and append the spacytextblob component,
# whose polarity/subjectivity values are read from `doc._` in the next cells.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
# First example: two negative sentences followed by a positive one.
text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'
doc = nlp(text)

In [5]:
# Polarity of the whole first example, read from the Doc extension attribute.
doc._.polarity

-0.125

In [6]:
# Subjectivity of the whole first example, read from the Doc extension attribute.
doc._.subjectivity

0.9

In [7]:
# Second example: a clearly positive text, processed with the same pipeline.
text = "I'm very happy. I'm working on a beautiful project."
doc = nlp(text)

In [8]:
# Polarity of the second example.
doc._.polarity

0.925

In [9]:
# Subjectivity of the second example.
doc._.subjectivity

1.0

In [10]:
# Third example: a single mildly positive sentence.
text = "This book is very simple and enjoyable to study."
doc = nlp(text)

In [11]:
# Polarity of the third example.
doc._.polarity

0.25

In [12]:
# Subjectivity of the third example.
doc._.subjectivity

0.5321428571428571

# CONFRONTIAMO IL NOSTRO MODELLO NAIVE BAYES CON SPACY

### - Caricare il dataset SMALL_IMDB.csv
### - Splittare il dataset in train e test
### - Pulire i dataset di train e test (salvare il dataset di test non pulito, servirà con spacy)
### - Fare il training del modello Naive Bayes
### - Testare il modello sul dataset di test
### - Utilizzare la funzione polarity di spacy per calcolare il sentiment (scrivere un'opportuna funzione che associa al valore numerico le label positive e negative)
### - Scrivere una funzione per calcolare l'accuracy ottenuta dai risultati spacy

## Puliamo il dataset, addestriamo il modello e valutiamo l'accuracy

In [13]:
import pandas as pd

In [14]:
# Load the reviews; later cells read the 'review' and 'sentiment' columns.
# The path is relative to the notebook's working directory.
dataset = pd.read_csv('datasets/Lezione_5-sentiment_Analysis/SMALL_IMDB.csv')

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the reviews for testing; random_state pins the shuffle so the
# split is the same on every run. data_test keeps the raw text, which the spaCy
# evaluation at the end of the notebook scores directly.
data_train, data_test, y_train, y_test = train_test_split(dataset['review'], dataset['sentiment'], test_size=0.20, random_state=40)

In [18]:
import string
import spacy
from nltk.corpus import stopwords
import re

# Resources shared by data_cleaner, built once at cell level.
english_stopwords = stopwords.words('english')
# NOTE: this rebinds `nlp` to a fresh pipeline without the spacytextblob
# component that was added to the earlier `nlp` object.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): `punctuation` is not referenced by any visible cell;
# data_cleaner iterates string.punctuation directly. Confirm before removing.
punctuation = set(string.punctuation)

def data_cleaner(sentence):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace every character in
    ``string.punctuation`` with a space, lemmatize with the module-level
    ``nlp`` pipeline, drop the words found in ``english_stopwords`` and
    finally strip every digit character.

    Args:
        sentence (str): Raw text to clean.

    Returns:
        str: The cleaned text. Digits are removed after the words are joined,
        so a purely numeric word leaves extra whitespace behind.
    """
    sentence = sentence.lower()
    # One translate pass instead of one str.replace call per punctuation character.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentence = sentence.translate(punct_to_space)
    document = nlp(sentence)
    sentence = ' '.join(token.lemma_ for token in document)
    # Set membership is O(1); the stopword list would be scanned for every word.
    stop_words = set(english_stopwords)
    sentence = ' '.join(word for word in sentence.split() if word not in stop_words)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    sentence = re.sub(r'\d', '', sentence)

    return sentence

In [19]:
# Clean every training review; order is preserved, so the entries stay aligned
# with y_train. (The enumerate index of the original loop was never used.)
data_train_cleaned = [data_cleaner(review) for review in data_train]

In [20]:
# Apply the same cleaning to the held-out reviews, keeping their order.
data_test_cleaned = [data_cleaner(review) for review in data_test]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts. The vocabulary is learned from the training reviews only
# and reused for the test reviews, so both matrices share the same columns.
# The matrices are kept sparse (no .toarray()): MultinomialNB accepts sparse
# input, and a dense documents x vocabulary array spends memory on zeros.
vec = CountVectorizer()
data_train_cleaned_vectorized = vec.fit_transform(data_train_cleaned)
data_test_cleaned_vectorized = vec.transform(data_test_cleaned)

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training word counts.
# `model` is also used by sentiment_checker to report its own prediction.
model = MultinomialNB()
model.fit(data_train_cleaned_vectorized, y_train)

In [23]:
# Score of the Naive Bayes model on the held-out test reviews.
model.score(data_test_cleaned_vectorized, y_test)

0.833

## Valutiamo Spacy 

In [24]:
def sentiment_checker(value, label, text):
    """Score one polarity value against its ground-truth label.

    The value is a hit when its sign agrees with the label: greater than 0
    for 'positive', less than 0 for 'negative'. A polarity of exactly 0, or
    any other label, is always a miss.

    On a miss a short report is printed: the polarity, the real label and the
    label predicted by the Naive Bayes ``model`` for the same text (cleaned
    with ``data_cleaner`` and vectorized with ``vec``).

    Args:
        value: Polarity score to check.
        label: Ground-truth label of the review.
        text: Raw review text; only used for the miss report.

    Returns:
        int: 1 on a hit, 0 on a miss.
    """
    positive_hit = value > 0 and label == 'positive'
    negative_hit = value < 0 and label == 'negative'

    if not (positive_hit or negative_hit):
        print("POLARITY SPACY: "+str(value))
        print("REAL LABEL: "+label)
        # Ask the Naive Bayes model for its opinion on the same review.
        features = vec.transform([data_cleaner(text)])
        nb_label = model.predict(features)[0]
        print("OUR SA MODEL: "+nb_label)
        print("----------------------------------------------------------------")
        return 0

    return 1

In [25]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
# Rebuild the pipeline with spacytextblob: `nlp` was rebound in the cleaning
# cell to a plain en_core_web_sm pipeline without that component.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

# Number of test reviews whose polarity sign matches the real label.
count = 0
# The raw (uncleaned) test reviews are scored; sentiment_checker prints a
# report for every miss and returns 1 for a hit, 0 for a miss.
for r,s in zip(data_test,y_test):
    doc = nlp(r)
    count+=sentiment_checker(doc._.polarity,s,r)
    

POLARITY SPACY: 0.06350649350649351
REAL LABEL: negative
OUR SA MODEL: negative
----------------------------------------------------------------
POLARITY SPACY: 0.06478696741854637
REAL LABEL: negative
OUR SA MODEL: negative
----------------------------------------------------------------
POLARITY SPACY: 0.014194139194139189
REAL LABEL: negative
OUR SA MODEL: negative
----------------------------------------------------------------
POLARITY SPACY: 0.17006653069153066
REAL LABEL: negative
OUR SA MODEL: positive
----------------------------------------------------------------
POLARITY SPACY: 0.2435897435897436
REAL LABEL: negative
OUR SA MODEL: positive
----------------------------------------------------------------
POLARITY SPACY: 0.18091313448456306
REAL LABEL: negative
OUR SA MODEL: positive
----------------------------------------------------------------
POLARITY SPACY: 0.08333333333333336
REAL LABEL: negative
OUR SA MODEL: positive
--------------------------------------------------

In [26]:
# Accuracy of the polarity-sign rule: hits divided by the number of test reviews.
count/len(y_test)

0.68