In [45]:
import pandas as pd
import nltk as nt
df = pd.read_csv('reviews/recensioni_tradotte.csv')

df2 = df.drop(columns=['Commento']).rename(columns={'Commento Tradotto': 'Commento'})

In [46]:
df2["review"] = df2["Commento"] 

df2["is_bad_review"] = df2["Valutazione"].apply(lambda x: 1 if x <= 5 else 0)  # Sostituisci 'soglia' con il tuo valore

df2 = df2[["review", "is_bad_review"]]
df2

Unnamed: 0,review,is_bad_review
0,We chose this hotel for proximity to Vatican C...,1
1,This property is very run down and dirty. The ...,1
2,It's very dirty! Toilet bowl doesn't work!,1
3,There was no AC or hot water in the room we ha...,1
4,This was a very disappointing hotel - run down...,1
...,...,...
133,"Gorgeous hotel, excellent location in the cent...",0
134,"Overall positive experience, comfortable hotel...",0
135,It wasn't five stars ...,1
136,Had nothing special quite disturbing,1


In [47]:
#questo script è progettato per pulire e preprocessare dati testuali, 
#rendendoli adatti per compiti di elaborazione del linguaggio naturale (NLP)
#come l'analisi del sentiment 

from nltk.corpus import wordnet  

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
df2["review_clean"] = df2["review"].apply(lambda x: clean_text(x))

In [48]:
df2

Unnamed: 0,review,is_bad_review,review_clean
0,We chose this hotel for proximity to Vatican C...,1,chose hotel proximity vatican city many amenit...
1,This property is very run down and dirty. The ...,1,property run dirty mirror room give look liked...
2,It's very dirty! Toilet bowl doesn't work!,1,dirty toilet bowl work
3,There was no AC or hot water in the room we ha...,1,ac hot water room discover hot water day check...
4,This was a very disappointing hotel - run down...,1,disappointing hotel run smelly room roof deck ...
...,...,...,...
133,"Gorgeous hotel, excellent location in the cent...",0,gorgeous hotel excellent location center see b...
134,"Overall positive experience, comfortable hotel...",0,overall positive experience comfortable hotel ...
135,It wasn't five stars ...,1,five star
136,Had nothing special quite disturbing,1,nothing special quite disturb


In [49]:
#sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df2["sentiments"] = df2["review_clean"].apply(lambda x: sid.polarity_scores(x))
df2 = pd.concat([df2.drop(['sentiments'], axis=1), df2['sentiments'].apply(pd.Series)], axis=1)

In [50]:
df2

Unnamed: 0,review,is_bad_review,review_clean,neg,neu,pos,compound
0,We chose this hotel for proximity to Vatican C...,1,chose hotel proximity vatican city many amenit...,0.064,0.888,0.048,0.0772
1,This property is very run down and dirty. The ...,1,property run dirty mirror room give look liked...,0.131,0.713,0.155,0.2363
2,It's very dirty! Toilet bowl doesn't work!,1,dirty toilet bowl work,0.492,0.508,0.000,-0.4404
3,There was no AC or hot water in the room we ha...,1,ac hot water room discover hot water day check...,0.029,0.849,0.122,0.5478
4,This was a very disappointing hotel - run down...,1,disappointing hotel run smelly room roof deck ...,0.167,0.833,0.000,-0.4939
...,...,...,...,...,...,...,...
133,"Gorgeous hotel, excellent location in the cent...",0,gorgeous hotel excellent location center see b...,0.000,0.376,0.624,0.9118
134,"Overall positive experience, comfortable hotel...",0,overall positive experience comfortable hotel ...,0.000,0.702,0.298,0.9325
135,It wasn't five stars ...,1,five star,0.000,1.000,0.000,0.0000
136,Had nothing special quite disturbing,1,nothing special quite disturb,0.335,0.297,0.368,0.0559


In [51]:
#count num chars and words for each doc
df2["num_chars"] = df2["review"].apply(lambda x: len(x))

df2["num_words"] = df2["review"].apply(lambda x: len(x.split(" ")))

In [52]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=10)
tfidf_result = tfidf.fit_transform(df2["review_clean"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names_out())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = df2.index
reviews_df = pd.concat([df2, tfidf_df], axis=1)

In [53]:
reviews_df.head()

Unnamed: 0,review,is_bad_review,review_clean,neg,neu,pos,compound,num_chars,num_words,word_altro,...,word_restaurant,word_room,word_show,word_staff,word_star,word_stay,word_structure,word_the,word_well,word_work
0,We chose this hotel for proximity to Vatican C...,1,chose hotel proximity vatican city many amenit...,0.064,0.888,0.048,0.0772,417,76,0.304594,...,0.333874,0.155205,0.0,0.0,0.0,0.0,0.0,0.0,0.304594,0.0
1,This property is very run down and dirty. The ...,1,property run dirty mirror room give look liked...,0.131,0.713,0.155,0.2363,417,82,0.423274,...,0.0,0.431356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,It's very dirty! Toilet bowl doesn't work!,1,dirty toilet bowl work,0.492,0.508,0.0,-0.4404,42,7,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,There was no AC or hot water in the room we ha...,1,ac hot water room discover hot water day check...,0.029,0.849,0.122,0.5478,417,80,0.424927,...,0.0,0.64956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,This was a very disappointing hotel - run down...,1,disappointing hotel run smelly room roof deck ...,0.167,0.833,0.0,-0.4939,171,37,0.0,...,0.0,0.71697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
reviews_df.shape

(138, 35)

In [55]:
# distrubuzione delle recensioni 
reviews_df["is_bad_review"].value_counts(normalize = True)

is_bad_review
0    0.681159
1    0.318841
Name: proportion, dtype: float64

In [56]:
#matrice di co occurence
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

reviews = reviews_df['review_clean'].tolist()
vectorizer = CountVectorizer()
co_occurrence_matrix = vectorizer.fit_transform(reviews).toarray()

vocab = vectorizer.get_feature_names_out()

co_occurrence_df = pd.DataFrame(co_occurrence_matrix, columns=vocab)

file_path = "vocab.txt"
with open(file_path, "w", encoding="utf-8") as file:
    for word in vocab:
        file.write(word + "\n")
co_occurrence_df

Unnamed: 0,able,absence,absolutely,abundant,ac,acceptable,access,accessible,accessory,accommodation,...,wouldn,wow,write,wrong,yard,year,years,yet,you,young
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
import gensim.downloader

glove_vectors = gensim.downloader.load('glove-twitter-200')

with open("vocab.txt", "r", encoding="utf-8") as file:
    word_list = [line.strip() for line in file]

word_list = [word for word in word_list if word in glove_vectors]

word_vectors = {word: glove_vectors[word] for word in word_list}



In [70]:
#similarità tra due parole
from sklearn.metrics.pairwise import cosine_similarity
word1 = "dirty"
word2 = "clean"
if word1 in word_vectors and word2 in word_vectors:
    similarity_score = cosine_similarity([word_vectors[word1]], [word_vectors[word2]])[0][0]
    print(f"Similarità tra '{word1}' e '{word2}': {similarity_score}")
else:
    print(f"Una delle parole non è nel vocabolario.")


Similarità tra 'dirty' e 'clean': 0.6474672555923462


In [59]:
#parole simili alla target word
target_word = "young"
if target_word in word_vectors:
    similarities = cosine_similarity([word_vectors[target_word]], list(word_vectors.values()))
    most_similar_indices = similarities.argsort()[0][::-1][1:10]  
    # escludi la stessa parola e prendi le prime 10 simili
    most_similar_words = [list(word_vectors.keys())[i] for i in most_similar_indices]

    print(f"Parole più simili a '{target_word}': {most_similar_words}")
else:
    print(f"'{target_word}' non è nel vocabolario.")

Parole più simili a 'young': ['old', 'child', 'little', 'too', 'still', 'big', 'good', 'know', 'think']


In [60]:
if 'beautiful' in word_vectors  and 'modern' in word_vectors:
    vector_arithmetic_result = word_vectors['beautiful'] + word_vectors['modern']

    similarities = cosine_similarity([vector_arithmetic_result], list(word_vectors.values()))
    most_similar_indices = similarities.argsort()[0][::-1][:5]  
    most_similar_words = [list(word_vectors.keys())[i] for i in most_similar_indices]

    print(f"Risultato di 'beautiful  + modern': {most_similar_words}")
else:
    print("Almeno una delle parole non è presente nei word vectors.")


Risultato di 'beautiful  + modern': ['beautiful', 'modern', 'amazing', 'great', 'gorgeous']


In [75]:
if 'young' in word_vectors  and 'modern' in word_vectors:
    vector_arithmetic_result = word_vectors['modern'] + word_vectors['young']

    similarities = cosine_similarity([vector_arithmetic_result], list(word_vectors.values()))
    most_similar_indices = similarities.argsort()[0][::-1][:5]  # prendi le prime 5 simili
    most_similar_words = [list(word_vectors.keys())[i] for i in most_similar_indices]

    print(f"Risultato di 'modern  + young': {most_similar_words}")
else:
    print("Almeno una delle parole non è presente nei word vectors.")

Risultato di 'modern  + young': ['young', 'modern', 'old', 'living', 'the']
