In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import glob

def load_aspect_data(path):
    all_files = glob.glob(path)
    all_data = []
    for filename in all_files:
        df = pd.read_csv(filename)
        all_data.append(df)
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"Loaded {len(all_files)} CSV files.")
    return combined_df

aspect_path = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\*.csv"
aspect_df = load_aspect_data(aspect_path)

# Print information about the loaded data
print(f"Total number of rows: {len(aspect_df)}")
print(f"Columns: {aspect_df.columns.tolist()}")
print("\nFirst few rows:")
print(aspect_df.head())

print("\nData types and non-null counts:")
print(aspect_df.info())

# Check unique values in the Classification column
print("\nUnique values in Classification column:")
print(aspect_df['Classification'].unique())

# Define the specific KPIs we're interested in
specific_kpis = ['food', 'staff', 'comfort & facilities', 'value for money']

# Function to check if a KPI is in the classification
def check_kpi(classification, kpi):
    if isinstance(classification, str):  # Check if the value is a string
        return 1 if kpi.lower() in classification.lower() else 0
    return 0  # If it's not a string, return 0

# Create binary columns for each specific KPI
for kpi in specific_kpis:
    aspect_df[kpi] = aspect_df['Classification'].apply(lambda x: check_kpi(x, kpi))

# Prepare the features (X) and labels (y)
X = aspect_df['Opinion'].values
y = aspect_df[specific_kpis].values

# Tokenize and pad the text data
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=200)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build the model
input_layer = Input(shape=(200,))
embedding_layer = Embedding(10000, 100, input_length=200)(input_layer)
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(lstm_layer)
global_max_pool = GlobalMaxPooling1D()(lstm_layer)
dense_layer = Dense(64, activation='relu')(global_max_pool)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(len(specific_kpis), activation='sigmoid')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.1, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy:.4f}")

# Function to predict KPI likelihoods for new data
def predict_kpi_likelihoods(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(padded_sequence)[0]
    return {kpi: float(likelihood) for kpi, likelihood in zip(specific_kpis, prediction)}

# Test the model on a sample review
sample_review = "The room was clean and comfortable, but the staff was not very friendly. The food was excellent."
predicted_likelihoods = predict_kpi_likelihoods(sample_review)
print("\nPredicted KPI likelihoods for the sample review:")
for kpi, likelihood in predicted_likelihoods.items():
    print(f"{kpi}: {likelihood:.2%}")

# # Function to predict KPIs for your actual dataset
# def predict_kpis_for_dataset(df, text_column):
#     X_new = df[text_column].values
#     X_new_sequences = tokenizer.texts_to_sequences(X_new)
#     X_new_padded = pad_sequences(X_new_sequences, maxlen=200)
#     predictions = model.predict(X_new_padded)
    
#     for i, kpi in enumerate(specific_kpis):
#         df[f'{kpi}_likelihood'] = predictions[:, i]
    
#     return df

# Use this function on your actual dataset
# actual_df = pd.read_csv("your_actual_dataset.csv")
# actual_df = predict_kpis_for_dataset(actual_df, 'Review Content')
# print(actual_df[['Review Content'] + [f'{kpi}_likelihood' for kpi in specific_kpis]].head())

Loaded 29 CSV files.
Total number of rows: 8626
Columns: ['Opinion', 'Classification']

First few rows:
                                             Opinion  \
0  We stayed for a week and could not fault it at...   
1  This resort is beautiful. The rooms are fabulo...   
2  i never fail to visit Shangrila Boracay eveyti...   
3  This is really a 4.5 star review. Had a chance...   
4  Transfers - On arrival at the airport we were ...   

                       Classification  
0                      Staff:Location  
1           Food:Comfort & Facilities  
2                       Location:Food  
3  Comfort & Facilities:Food:Location  
4     Comfort & Facilities:Food:Staff  

Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8626 entries, 0 to 8625
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Opinion         8626 non-null   object
 1   Classification  8508 non-null   object
dtyp



Epoch 1/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 107ms/step - accuracy: 0.4172 - loss: 0.6287 - val_accuracy: 0.6623 - val_loss: 0.5059
Epoch 2/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 110ms/step - accuracy: 0.6343 - loss: 0.4968 - val_accuracy: 0.6768 - val_loss: 0.4175
Epoch 3/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 152ms/step - accuracy: 0.6531 - loss: 0.3956 - val_accuracy: 0.6826 - val_loss: 0.3754
Epoch 4/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 130ms/step - accuracy: 0.6499 - loss: 0.3032 - val_accuracy: 0.6072 - val_loss: 0.3770
Epoch 5/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 127ms/step - accuracy: 0.6254 - loss: 0.2433 - val_accuracy: 0.6377 - val_loss: 0.3559
Epoch 6/100
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 109ms/step - accuracy: 0.6335 - loss: 0.1966 - val_accuracy: 0.5913 - val_loss: 0.3979
Epoc