Dataset Loading

In [2]:
import pandas as pd

df = pd.read_json('aspen.json')

df

Unnamed: 0,stars,hotel,title,content
0,5,[Limelight Hotel],[Awesome visit ],[Went on a girls trip this past weekend. We ha...
1,5,[Limelight Hotel],[Super hotel and Super Staff],[We were very lucky to win 4 nights accomodati...
2,5,[Chateau Roaring Fork],[Wait until the last minute],[Wait until the last minute and get a lodging/...
3,5,[Limelight Hotel],[Great Hotel. Nice place to stay],[Great hotel. Beautiful. Great well decorated ...
4,3,[Aspen Mountain Lodge],[Quaint and cozy lodge],"[Great value, decent location. I'd highly reco..."
...,...,...,...,...
2001,5,[Chateau Blanc],[Great stay],[Spent a couple of nights in Aspen on a girls ...
2002,4,[Chateau Blanc],[Excellent cost/benefit],[We stayed in a two bedrooms/bathrooms apartme...
2003,5,[Chateau Blanc],[Great WInter Vaca],[A wonderful place to stay for our family vaca...
2004,5,[Chateau Blanc],[Chateau Blanc for a week],[The lodge is few blocks away from the main do...


Clean data and Pre-processing

In [3]:
import pandas as pd

# Assuming you have the DataFrame df with columns 'stars', 'hotel', 'title', and 'content'

# Define a function to categorize stars
def categorize_stars(stars):
    if stars in [4, 5]:
        return '1'
    elif stars in [1, 2, 3]:
        return '0'
    else:
        return 'Unknown'

# Read your DataFrame from the provided data
# df = pd.read_csv('your_data.csv')  # Uncomment and replace 'your_data.csv' with your file path if you're reading from a CSV file

# Apply the categorize_stars function to the 'stars' column and create a new column 'rating'
df['rating'] = df['stars'].apply(categorize_stars)
df

Unnamed: 0,stars,hotel,title,content,rating
0,5,[Limelight Hotel],[Awesome visit ],[Went on a girls trip this past weekend. We ha...,1
1,5,[Limelight Hotel],[Super hotel and Super Staff],[We were very lucky to win 4 nights accomodati...,1
2,5,[Chateau Roaring Fork],[Wait until the last minute],[Wait until the last minute and get a lodging/...,1
3,5,[Limelight Hotel],[Great Hotel. Nice place to stay],[Great hotel. Beautiful. Great well decorated ...,1
4,3,[Aspen Mountain Lodge],[Quaint and cozy lodge],"[Great value, decent location. I'd highly reco...",0
...,...,...,...,...,...
2001,5,[Chateau Blanc],[Great stay],[Spent a couple of nights in Aspen on a girls ...,1
2002,4,[Chateau Blanc],[Excellent cost/benefit],[We stayed in a two bedrooms/bathrooms apartme...,1
2003,5,[Chateau Blanc],[Great WInter Vaca],[A wonderful place to stay for our family vaca...,1
2004,5,[Chateau Blanc],[Chateau Blanc for a week],[The lodge is few blocks away from the main do...,1


In [4]:
import os
import sys
import numpy as np


#pre-processing of text
import string
import re


from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [5]:
def clean_text(str_list, lemmatize=True):
    clean_list = []
    
    for text in str_list:
        # Remove pound sign from hashtags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        lemmatizer = WordNetLemmatizer()  # Move lemmatizer initialization outside the loop
        
        for word in words:
            # Drop words with fewer than 2 characters and drop any punctuation "words"
            if len(word) > 1 and re.match(r'^\w+$', word):
                if lemmatize:
                    word = lemmatizer.lemmatize(word)  # Apply lemmatization
                clean_words.append(word)
        
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [6]:
df['content'] = df['content'].astype(str)
df['clean_text']= clean_text(df['content'])
df['clean_text']= df['clean_text'].str.lower()
df

Unnamed: 0,stars,hotel,title,content,rating,clean_text
0,5,[Limelight Hotel],[Awesome visit ],['Went on a girls trip this past weekend. We h...,1,on girl trip this past weekend we had wonderfu...
1,5,[Limelight Hotel],[Super hotel and Super Staff],['We were very lucky to win 4 nights accomodat...,1,were very lucky to win night accomodation at t...
2,5,[Chateau Roaring Fork],[Wait until the last minute],"[""Wait until the last minute and get a lodging...",1,wait until the last minute and get ticket pack...
3,5,[Limelight Hotel],[Great Hotel. Nice place to stay],['Great hotel. Beautiful. Great well decorated...,1,hotel beautiful great well decorated bar and n...
4,3,[Aspen Mountain Lodge],[Quaint and cozy lodge],"[""Great value, decent location. I'd highly rec...",0,great value decent location highly recommend r...
...,...,...,...,...,...,...
2001,5,[Chateau Blanc],[Great stay],['Spent a couple of nights in Aspen on a girls...,1,couple of night in aspen on girl getaway our t...
2002,4,[Chateau Blanc],[Excellent cost/benefit],['We stayed in a two bedrooms/bathrooms apartm...,1,stayed in two apartment the apartment had pret...
2003,5,[Chateau Blanc],[Great WInter Vaca],['A wonderful place to stay for our family vac...,1,wonderful place to stay for our family vacatio...
2004,5,[Chateau Blanc],[Chateau Blanc for a week],"[""The lodge is few blocks away from the main d...",1,the lodge is few block away from the main down...


TF-IDF

In [7]:
#import feature extraction methods from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate

In [8]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
X = df['clean_text']
y = df['rating']  

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("Training set size:", X_train_tfidf.shape[0])
print("Testing set size:", X_test_tfidf.shape[0])

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1))

Training set size: 1604
Testing set size: 402


LSTM-model

In [9]:
import os
import sys
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Pre-processing of text
import string
import re

import pandas as pd  # To work with CSV files

2024-05-03 16:02:19.267803: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
BASE_DIR = os.getcwd()
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')  # Download from: https://nlp.stanford.edu/projects/glove/

MAX_SEQUENCE_LENGTH = 500
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [11]:
# Vectorize these text samples into a 2D integer tensor using Keras Tokenizer
# Tokenizer is fit on training data only, and that is used to tokenize both train and test data.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train_tfidf)
train_sequences = tokenizer.texts_to_sequences(X_train_tfidf)  # Converting text to a vector of word indices
test_sequences = tokenizer.texts_to_sequences(X_test_tfidf)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7294 unique tokens.


In [12]:
# Initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
trainvalid_labels = to_categorical(y_train, num_classes=2, dtype="int32")
test_labels = to_categorical(y_test, num_classes=2, dtype="int32")

In [13]:
# Split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])
x_train = trainvalid_data[:-num_validation_samples]
y_train = trainvalid_labels[:-num_validation_samples]
x_val = trainvalid_data[-num_validation_samples:]
y_val = trainvalid_labels[-num_validation_samples:]
# This is the data we will use for RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [18]:
print('Preparing embedding matrix.')
# First, build an index mapping words in the embeddings set to their embedding vector
embeddings_index = {}

with open(os.path.join(GLOVE_DIR, '/Users/pikaqiu/Desktop/AU /class /2024 spring/ Natural Lang Processing/project/classfication/glove.6B/glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))
# print(embeddings_index["google"])

# Prepare an embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Load these pre-trained word embeddings into an Embedding layer
# Note that we set trainable = True to fine-tune the embeddings during training
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
print("Preparing the embedding matrix is done")

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.
Preparing the embedding matrix is done


In [19]:
print('Define an RNN model.')
# Define the mapping
labels_index = {'bad': 0, 'good': 1}

rnnmodel = Sequential()
rnnmodel.add(embedding_layer)
rnnmodel.add(LSTM(128, dropout=0.25, recurrent_dropout=0.25))  # Use LSTM layer
rnnmodel.add(Dense(len(labels_index), activation='softmax'))

rnnmodel.compile(loss='categorical_crossentropy',
                optimizer='Adam',
                metrics=['acc'])
rnnmodel.summary()

Define an RNN model.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          729500    
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 847006 (3.23 MB)
Trainable params: 847006 (3.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Train the model. Tune to the validation set.
print(x_train.shape)
print(y_train.shape)
rnn_train = rnnmodel.fit(x_train, y_train,
                        batch_size=16,
                        epochs=10, verbose=1, validation_data=(x_val, y_val))

score_train, acc_train = rnnmodel.evaluate(x_train, y_train)
print('Training accuracy with RNN:', acc_train)

(1284, 500)
(1284, 2)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training accuracy with RNN: 0.9984423518180847


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

score_test, acc_test = rnnmodel.evaluate(test_data, test_labels)
print('Testing accuracy with RNN:', acc_test)
# predict test y in  RNN model
predicted_labels = rnnmodel.predict(test_data) 

# Convert one-hot encoded labels to class labels
true_classes = np.argmax(test_labels, axis=1)
predicted_classes = np.argmax(predicted_labels, axis=1)

# Calculate metrics
accuracy = accuracy_score(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes, average='weighted')
recall = recall_score(true_classes, predicted_classes, average='weighted')
f1 = f1_score(true_classes, predicted_classes, average='weighted')

# Print the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Testing accuracy with RNN: 0.8905472755432129
Accuracy: 0.8905472636815921
Precision: 0.8820934850105934
Recall: 0.8905472636815921
F1-score: 0.8853673741696987


In [25]:
print(classification_report(true_classes, predicted_classes))

              precision    recall  f1-score   support

           0       0.59      0.47      0.52        51
           1       0.93      0.95      0.94       351

    accuracy                           0.89       402
   macro avg       0.76      0.71      0.73       402
weighted avg       0.88      0.89      0.89       402

