In [7]:
import pandas as pd

df = pd.read_json('aspen.json')

In [8]:
import os
import sys
import numpy as np


#pre-processing of text
import string
import re


from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [9]:
def clean_text(str_list, lemmatize=True):
    clean_list = []
    
    for text in str_list:
        # Remove pound sign from hashtags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        lemmatizer = WordNetLemmatizer()  # Move lemmatizer initialization outside the loop
        
        for word in words:
            # Drop words with fewer than 2 characters and drop any punctuation "words"
            if len(word) > 1 and re.match(r'^\w+$', word):
                if lemmatize:
                    word = lemmatizer.lemmatize(word)  # Apply lemmatization
                clean_words.append(word)
        
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [10]:
df['content'] = df['content'].astype(str)
df['clean_text']= clean_text(df['content'])
df['clean_text']= df['clean_text'].str.lower()
df

Unnamed: 0,stars,hotel,title,content,clean_text
0,5,[Limelight Hotel],[Awesome visit ],['Went on a girls trip this past weekend. We h...,on girl trip this past weekend we had wonderfu...
1,5,[Limelight Hotel],[Super hotel and Super Staff],['We were very lucky to win 4 nights accomodat...,were very lucky to win night accomodation at t...
2,5,[Chateau Roaring Fork],[Wait until the last minute],"[""Wait until the last minute and get a lodging...",wait until the last minute and get ticket pack...
3,5,[Limelight Hotel],[Great Hotel. Nice place to stay],['Great hotel. Beautiful. Great well decorated...,hotel beautiful great well decorated bar and n...
4,3,[Aspen Mountain Lodge],[Quaint and cozy lodge],"[""Great value, decent location. I'd highly rec...",great value decent location highly recommend r...
...,...,...,...,...,...
2001,5,[Chateau Blanc],[Great stay],['Spent a couple of nights in Aspen on a girls...,couple of night in aspen on girl getaway our t...
2002,4,[Chateau Blanc],[Excellent cost/benefit],['We stayed in a two bedrooms/bathrooms apartm...,stayed in two apartment the apartment had pret...
2003,5,[Chateau Blanc],[Great WInter Vaca],['A wonderful place to stay for our family vac...,wonderful place to stay for our family vacatio...
2004,5,[Chateau Blanc],[Chateau Blanc for a week],"[""The lodge is few blocks away from the main d...",the lodge is few block away from the main down...


In [11]:
import pandas as pd

# Assuming you have the DataFrame df with columns 'stars', 'hotel', 'title', and 'content'

# Define a function to categorize stars
def categorize_stars(stars):
    if stars in [3,4, 5]:
        return 1
    elif stars in [1, 2]:
        return 0
    else:
        return 'Unknown'

# Read your DataFrame from the provided data
# df = pd.read_csv('your_data.csv')  # Uncomment and replace 'your_data.csv' with your file path if you're reading from a CSV file

# Apply the categorize_stars function to the 'stars' column and create a new column 'rating'
df['rating'] = df['stars'].apply(categorize_stars)
df

Unnamed: 0,stars,hotel,title,content,clean_text,rating
0,5,[Limelight Hotel],[Awesome visit ],['Went on a girls trip this past weekend. We h...,on girl trip this past weekend we had wonderfu...,1
1,5,[Limelight Hotel],[Super hotel and Super Staff],['We were very lucky to win 4 nights accomodat...,were very lucky to win night accomodation at t...,1
2,5,[Chateau Roaring Fork],[Wait until the last minute],"[""Wait until the last minute and get a lodging...",wait until the last minute and get ticket pack...,1
3,5,[Limelight Hotel],[Great Hotel. Nice place to stay],['Great hotel. Beautiful. Great well decorated...,hotel beautiful great well decorated bar and n...,1
4,3,[Aspen Mountain Lodge],[Quaint and cozy lodge],"[""Great value, decent location. I'd highly rec...",great value decent location highly recommend r...,1
...,...,...,...,...,...,...
2001,5,[Chateau Blanc],[Great stay],['Spent a couple of nights in Aspen on a girls...,couple of night in aspen on girl getaway our t...,1
2002,4,[Chateau Blanc],[Excellent cost/benefit],['We stayed in a two bedrooms/bathrooms apartm...,stayed in two apartment the apartment had pret...,1
2003,5,[Chateau Blanc],[Great WInter Vaca],['A wonderful place to stay for our family vac...,wonderful place to stay for our family vacatio...,1
2004,5,[Chateau Blanc],[Chateau Blanc for a week],"[""The lodge is few blocks away from the main d...",the lodge is few block away from the main down...,1


###        hotel "Positive review" or "Negative review"

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense

# Assuming your dataset is loaded into a pandas DataFrame named df

# Splitting the dataset into features (X) and labels (y)
X = df['clean_text'].values
y = df['rating'].values

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# Pad sequences
max_length = 100
X_pad = pad_sequences(X_seq, maxlen=max_length, padding='post')

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Define LSTM model
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=32))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)

# Evaluation
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy: 0.9104477763175964


In [46]:
# Summarization function
def generate_summary(review_text):
    review_seq = tokenizer.texts_to_sequences([review_text])
    review_pad = pad_sequences(review_seq, maxlen=max_length, padding='post')
    prediction = model.predict(review_pad)
    if prediction > 0.5:
        return "Positive review"
    else:
        return "Negative review"

In [47]:
# Function to print hotel summary
def print_hotel_summary(hotel_name):
    hotel_reviews = df[df['hotel'].apply(lambda x: hotel_name in x)]
    if len(hotel_reviews) == 0:
        print(f"No reviews found for hotel '{hotel_name}'.")
        return
    hotel_reviews_text = hotel_reviews['clean_text'].values.tolist()
    for review_text in hotel_reviews_text:
        summary = generate_summary(review_text)
        print(f"Summary for hotel '{hotel_name}': {summary}")

# Example usage:
print_hotel_summary("Limelight Hotel")

Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive review
Summary for hotel 'Limelight Hotel': Positive 

### hotel summary

In [72]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense

# Function to generate summaries
def generate_summary(review_text):
    review_seq = tokenizer.texts_to_sequences([review_text])
    review_pad = pad_sequences(review_seq, maxlen=max_length, padding='post')
    summary_seq = model.predict_classes(review_pad)
    summary = tokenizer.sequences_to_texts([summary_seq])[0]
    return summary

# Function to print hotel summary
def print_hotel_summary(hotel_name):
    hotel_reviews = df[df['hotel'].apply(lambda x: hotel_name in x)]
    if len(hotel_reviews) == 0:
        print(f"No reviews found for hotel '{hotel_name}'.")
        return
    hotel_reviews_text = hotel_reviews['clean_text'].values.tolist()
    for review_text in hotel_reviews_text:
        summary = generate_summary(review_text)
        print(f"Summary for hotel '{hotel_name}': {summary}")

# Load the dataset
# Assuming df is already loaded and processed

# Preprocess the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Prepare the data for training
X_train, X_val, y_train, y_val = train_test_split(X, df['hotel'], test_size=0.2, random_state=42)

# Define LSTM model
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=128, return_sequences=True))
model.add(LSTM(units=64))
model.add(Dense(units=len(tokenizer.word_index)+1, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=3, batch_size=32)

# Example usage of print_hotel_summary
print_hotel_summary("Limelight Hotel")


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [80]:
import pandas as pd

# Assuming df is your DataFrame containing the dataset
# Convert the 'hotel' column from a list to a string
df['hotel'] = df['hotel'].apply(lambda x: x[0])

# Group by 'hotel' and concatenate 'clean_text' for each hotel
hotel_clean_text = df.groupby('hotel')['clean_text'].apply(lambda x: ' '.join(x)).reset_index()

# Rename the columns for clarity
hotel_clean_text.columns = ['hotel', 'clean_text']

# Display the resulting DataFrame
hotel_clean_text

Unnamed: 0,hotel,combined_clean_text
0,1,where do even begin let just say my first back...
1,2,am far from complainer but wasn happy this con...
2,A,great value decent location highly recommend r...
3,B,made reservation about year in advance and dec...
4,C,wait until the last minute and get ticket pack...
5,D,family wa able to walk to all the restaurant a...
6,G,stayed in this hotel before and the room are n...
7,H,family reunion there three sibling and signifi...
8,L,on girl trip this past weekend we had wonderfu...
9,M,had very friendly reception we were expected w...


In [None]:
# Generate summaries
def generate_summary(hotel_name):
    hotel_text = hotel_clean_text.loc[hotel_clean_text['hotel'] == hotel_name, 'combined_clean_text'].iloc[0]
    hotel_seq = tokenizer.texts_to_sequences([hotel_text])
    hotel_pad = pad_sequences(hotel_seq, maxlen=max_length, padding='post')
    summary_seq = model.predict_classes(hotel_pad)
    summary = tokenizer.sequences_to_texts([summary_seq])[0]
    return summary

# Example usage:
hotel_name = "Limelight Hotel"
summary = generate_summary(hotel_name)
print(f"Summary for hotel '{hotel_name}': {summary}")


Unnamed: 0,stars,hotel,title,content
0,5,hotel,[Awesome visit ],[Went on a girls trip this past weekend. We ha...
1,5,hotel,[Super hotel and Super Staff],[We were very lucky to win 4 nights accomodati...
2,5,roaring fork,[Wait until the last minute],[Wait until the last minute and get a lodging/...
3,5,hotel,[Great Hotel. Nice place to stay],[Great hotel. Beautiful. Great well decorated ...
4,3,mountain lodge,[Quaint and cozy lodge],"[Great value, decent location. I'd highly reco..."
...,...,...,...,...
2001,5,blanc,[Great stay],[Spent a couple of nights in Aspen on a girls ...
2002,4,blanc,[Excellent cost/benefit],[We stayed in a two bedrooms/bathrooms apartme...
2003,5,blanc,[Great WInter Vaca],[A wonderful place to stay for our family vaca...
2004,5,blanc,[Chateau Blanc for a week],[The lodge is few blocks away from the main do...


In [13]:
# Convert the lists in 'hotel' column to strings
df['hotel'] = df['hotel'].apply(lambda x: x[0])  # Convert list to string
df['hotel'] = df['hotel'].str.lower()
# Combine 'hotel' and 'clear_text' into a single column
df['combined'] = df['hotel'] + " " + df['clean_text']

df

Unnamed: 0,stars,hotel,title,content,clean_text,rating,combined
0,5,l,[Awesome visit ],['Went on a girls trip this past weekend. We h...,on girl trip this past weekend we had wonderfu...,1,l on girl trip this past weekend we had wonder...
1,5,l,[Super hotel and Super Staff],['We were very lucky to win 4 nights accomodat...,were very lucky to win night accomodation at t...,1,l were very lucky to win night accomodation at...
2,5,c,[Wait until the last minute],"[""Wait until the last minute and get a lodging...",wait until the last minute and get ticket pack...,1,c wait until the last minute and get ticket pa...
3,5,l,[Great Hotel. Nice place to stay],['Great hotel. Beautiful. Great well decorated...,hotel beautiful great well decorated bar and n...,1,l hotel beautiful great well decorated bar and...
4,3,a,[Quaint and cozy lodge],"[""Great value, decent location. I'd highly rec...",great value decent location highly recommend r...,1,a great value decent location highly recommend...
...,...,...,...,...,...,...,...
2001,5,c,[Great stay],['Spent a couple of nights in Aspen on a girls...,couple of night in aspen on girl getaway our t...,1,c couple of night in aspen on girl getaway our...
2002,4,c,[Excellent cost/benefit],['We stayed in a two bedrooms/bathrooms apartm...,stayed in two apartment the apartment had pret...,1,c stayed in two apartment the apartment had pr...
2003,5,c,[Great WInter Vaca],['A wonderful place to stay for our family vac...,wonderful place to stay for our family vacatio...,1,c wonderful place to stay for our family vacat...
2004,5,c,[Chateau Blanc for a week],"[""The lodge is few blocks away from the main d...",the lodge is few block away from the main down...,1,c the lodge is few block away from the main do...
