In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import optimizers, losses, Model
import tensorflow.keras.layers as L
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.callbacks import ModelCheckpoint

# Load datasets
# Exactly one benchmark is active at a time; the other paths are kept commented
# out so the notebook can be pointed at a different SemEval year.

# training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval14_res/Train/Restaurants_Train.csv")
# validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval14_res/Test/Restaurants_Test.csv")

training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval15/Train/Restaurants_Train.csv")
validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval15/Test/Restaurants_Test.csv")

# training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval16/Train/Restaurants_Train.csv")
# validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval16/Test/Restaurants_Test.csv")

# Drop 'conflict' polarity entries.
# Filtering into a fresh copy (instead of an in-place drop) keeps the cell
# idempotent and avoids chained-assignment warnings when columns are added later.
training_df = training_df.loc[training_df['polarity'] != 'conflict'].copy()
validation_df = validation_df.loc[validation_df['polarity'] != 'conflict'].copy()

# Prepare labels.
# Both splits are one-hot encoded against the SAME ordered class list (taken
# from the training split). Encoding each split independently would silently
# shift the column <-> class mapping whenever one split lacks a class.
polarity_classes = sorted(training_df['polarity'].dropna().unique())

unseen_classes = set(validation_df['polarity'].dropna()) - set(polarity_classes)
if unseen_classes:
    raise ValueError(
        f"Validation split contains polarity classes absent from training: {sorted(unseen_classes)}"
    )

y_train = pd.get_dummies(pd.Categorical(training_df['polarity'], categories=polarity_classes)).values
y_valid = pd.get_dummies(pd.Categorical(validation_df['polarity'], categories=polarity_classes)).values

# Class distribution of the validation split
class_counts = validation_df['polarity'].value_counts()

# Print class counts and their names
for class_name, count in class_counts.items():
    print(f"Class '{class_name}': {count} instances")


# # Initialization WordNetLemmatizer
# import re
# from nltk.stem import WordNetLemmatizer
# from bs4 import BeautifulSoup
# import emoji
# import string
# # Initialize the WordNetLemmatizer
# wnl = WordNetLemmatizer()

# def preprocess_text(text):
#     """
#     Performs a series of preprocessing steps on the input text.
#     Args:
#         text (str): The input text to be preprocessed.
#     Returns:
#         str: The preprocessed text.
#     """
#     # Remove HTML tags using BeautifulSoup.
#     text = BeautifulSoup(text, "html.parser").get_text()
#     # Remove content within square brackets.
#     text = re.sub('\[[^]]*\]', '', text)
#     # Expand contractions.
#     #text = decontraction(text)
#     # Remove emojis.
#     #text = remove_emojis(text)
#     # Remove URLs, usernames, and similar patterns.
#     text = re.sub(r'https?://\S+|www\.\S+|@[^\s]+', '', text)
#     # Remove punctuation using a translation table.
#     text = text.translate(str.maketrans('', '', string.punctuation))
    
#     # Replace sequences of the same character where length > 2 with two characters.
#     #text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    
#     # Remove non-alphabetical characters.
#     text = re.sub(r'[^a-zA-Z\s]', ' ', text)
#     # Convert to lowercase to normalize the text.
#     text = text.lower()
#     return text

# # Note: This function now includes lemmatization in the preprocessing pipeline.
# # preprocess_text--- data cleaning with lemmatization
# training_df['Sentence'] = training_df['Sentence'].apply(preprocess_text)
# validation_df['Sentence'] = validation_df['Sentence'].apply(preprocess_text)











# Tokenizer / encoder configuration shared by the cells below
transformer_model = 'distilroberta-base'  # Hugging Face checkpoint name
seq_len, batch_size = 512, 8              # max tokens per example, examples per batch
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

# Function to emphasize aspect terms in sentences
def emphasize_aspect_terms(sentence, aspect):
    """Mark every occurrence of ``aspect`` inside ``sentence``.

    Each occurrence is replaced by ``[ASP] <aspect> <aspect> [/ASP]``, i.e. the
    term is repeated twice and wrapped in textual markers.

    Args:
        sentence (str): The review sentence.
        aspect: The aspect term to highlight. Missing values (e.g. NaN produced
            by ``pd.read_csv`` for empty cells), non-string values and
            empty/whitespace-only strings are treated as "no aspect".

    Returns:
        str: The sentence with the aspect emphasized, or the sentence unchanged
        when there is no usable aspect term or it does not occur in the text.
    """
    # Guard: str.replace("") would insert the marker between every character,
    # and a non-string aspect (NaN) would raise a TypeError.
    if not isinstance(aspect, str) or not aspect.strip():
        return sentence

    # NOTE(review): the markers are plain text here; nothing in the visible
    # code registers them as special tokens with the tokenizer.
    emphasized_sentence = sentence.replace(aspect, f"[ASP] {aspect} {aspect} [/ASP]")
    return emphasized_sentence

# --- Aspect emphasis -------------------------------------------------------
def _emphasize_row(row):
    """Apply emphasize_aspect_terms to one DataFrame row."""
    return emphasize_aspect_terms(row['Sentence'], row['Aspect Term'])


training_df['Emphasized_Sentence'] = training_df.apply(_emphasize_row, axis=1)
validation_df['Emphasized_Sentence'] = validation_df.apply(_emphasize_row, axis=1)

# --- Tokenization ----------------------------------------------------------
# Every example is padded/truncated to exactly seq_len tokens.
_tokenizer_kwargs = dict(max_length=seq_len, truncation=True, padding='max_length', return_tensors='tf')
tokenized_inputs_train = tokenizer(training_df['Emphasized_Sentence'].tolist(), **_tokenizer_kwargs)
tokenized_inputs_valid = tokenizer(validation_df['Emphasized_Sentence'].tolist(), **_tokenizer_kwargs)

# --- Model building --------------------------------------------------------
encoder = TFAutoModel.from_pretrained(transformer_model)

input_ids = L.Input(shape=(seq_len,), dtype=tf.int32, name="input_ids")
attention_mask = L.Input(shape=(seq_len,), dtype=tf.int32, name="attention_mask")

# First encoder output is taken as the per-token hidden states.
embeddings = encoder(input_ids, attention_mask=attention_mask)[0]
# NOTE(review): the average runs over all seq_len positions, padded ones
# included, because no mask is passed to the pooling layer.
pooled_output = L.GlobalAveragePooling1D()(embeddings)
num_classes = y_train.shape[1]
outputs = L.Dense(num_classes, activation='softmax')(pooled_output)

model = Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-5),
    loss=losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)


# --- TensorFlow datasets ---------------------------------------------------
def _to_tf_dataset(encodings, labels):
    """Pair the tokenizer outputs (keyed by model input name) with one-hot labels."""
    features = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    return tf.data.Dataset.from_tensor_slices((features, labels))


train_dataset = _to_tf_dataset(tokenized_inputs_train, y_train)
valid_dataset = _to_tf_dataset(tokenized_inputs_valid, y_valid)

model.summary()

Class 'positive': 326 instances
Class 'negative': 182 instances
Class 'neutral': 34 instances


Some layers from the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 768)          0           tf_roberta_model_1[0][0]   

In [23]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from tensorflow.keras import optimizers, losses, Model
# import tensorflow.keras.layers as L
# from transformers import AutoTokenizer, TFAutoModel
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Load datasets

# training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval14_res/Train/Restaurants_Train.csv")
# validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval14_res/Test/Restaurants_Test.csv")

# # training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval15/Train/Restaurants_Train.csv")
# # validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval15/Test/Restaurants_Test.csv")

# # training_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval16/Train/Restaurants_Train.csv")
# # validation_df = pd.read_csv("/kaggle/input/sentiment-classification/SemEval16/Test/Restaurants_Test.csv")

# # training_df = pd.read_csv("/kaggle/input/sentiment-classification/Sentihood/Train/Sentihood_train.csv")
# # validation_df = pd.read_csv("/kaggle/input/sentiment-classification/Sentihood/Test/Sentihood_test.csv")


# # Drop 'conflict' polarity entries
# training_df.drop(training_df[training_df['polarity'] == 'conflict'].index, inplace=True)
# validation_df.drop(validation_df[validation_df['polarity'] == 'conflict'].index, inplace=True)

# # Prepare labels
# y_train = pd.get_dummies(training_df['polarity']).values
# y_valid = pd.get_dummies(validation_df['polarity']).values

# import pandas as pd

# # Assuming your DataFrame is named df
# class_counts = training_df['polarity'].value_counts()

# # Print class counts and their names
# for class_name, count in class_counts.items():
#     print(f"Class '{class_name}': {count} instances")


In [24]:
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Define a checkpoint callback
# checkpoint_path = "/kaggle/working/best_model_tr.h5"
# checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# # Training
# model.fit(train_dataset.shuffle(10000).batch(batch_size), 
#           validation_data=valid_dataset.batch(batch_size), 
#           epochs=10,
#           verbose=1,
#           callbacks=[checkpoint])  # Add checkpoint callback


In [25]:
# from transformers import TFRobertaModel  # Import the necessary layer
# from tensorflow.keras.models import load_model

# # Load the model with custom objects argument
# custom_objects = {'TFRobertaModel': TFRobertaModel}
# model = load_model("/kaggle/working/best_model_tr.h5", custom_objects=custom_objects)


In [26]:
# Class probabilities for every validation example.
# NOTE(review): the training and checkpoint-loading cells above are commented
# out, so unless weights were restored elsewhere these predictions come from a
# model whose classification head has not been trained.
valid_batches = valid_dataset.batch(batch_size)
y_pred = model.predict(valid_batches)

In [27]:
# Sanity check: expect (number of validation rows, number of classes)
print(np.shape(y_pred))

(542, 3)


In [28]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score, precision_score, recall_score


# Converting probabilities (and one-hot targets) to integer class labels
y_pred_labels = y_pred.argmax(axis=1)
y_true_labels = y_valid.argmax(axis=1)

# Weighted-average F1 / precision / recall, each reported as soon as it is computed
_scores = {}
for _title, _scorer in (("F1 Score:", f1_score),
                        ("Precision:", precision_score),
                        ("Recall:", recall_score)):
    _scores[_title] = _scorer(y_true_labels, y_pred_labels, average='weighted')
    print(_title, _scores[_title])

f1 = _scores["F1 Score:"]
precision = _scores["Precision:"]
recall = _scores["Recall:"]

F1 Score: 0.16882428492793214
Precision: 0.11275717923230894
Recall: 0.33579335793357934


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# F1 Score: 0.7940415317576729
# Precision: 0.799109346633389
# Recall: 0.7978056426332288

In [30]:
# Build a feature extractor that stops one layer before the classifier output:
# it shares the inputs of `model` and emits the second-to-last layer's output.
layers = model.layers
penultimate_layer = layers[-2]
output_tensor = penultimate_layer.output
new_model = Model(inputs=model.input, outputs=output_tensor)
new_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 768)          0           tf_roberta_model_1[0][0]   

In [31]:
# # Making predictions on the validation dataset
# y_pred = new_model.predict(valid_dataset.batch(batch_size))
# print(y_pred.shape)

In [32]:
# Run the truncated model over both splits (train first, then validation)
# to obtain one feature vector per example.
new_traning_data, new_valid_dataset = (
    new_model.predict(split.batch(batch_size))
    for split in (train_dataset, valid_dataset)
)

In [33]:
# from sklearn.svm import SVC
# from sklearn.metrics import f1_score

# # Assuming new_traning_data and new_valid_dataset are features extracted by your neural network
# # and train_labels, valid_labels are the corresponding labels
# svm = SVC(kernel='rbf')  # You can choose different kernels like 'rbf', 'poly', etc.

# # Reshape the data if necessary
# new_traning_data = new_traning_data.reshape(new_traning_data.shape[0], -1)
# new_valid_dataset = new_valid_dataset.reshape(new_valid_dataset.shape[0], -1)
# y_train_single = np.argmax(y_train, axis=1)

# # Train the SVM
# svm.fit(new_traning_data, y_train_single)

# # Evaluate the SVM on the validation dataset
# svm_predictions = svm.predict(new_valid_dataset)
# y_valid_single = np.argmax(y_valid, axis=1)
# # Calculate F1 score
# f1 = f1_score(y_valid_single, svm_predictions, average='micro')

# print("SVM F1 Score on validation dataset:", f1)


In [34]:
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Compute confusion matrix
# conf_matrix = confusion_matrix(y_valid_single, svm_predictions)

# plt.figure(figsize=(10, 8))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 18}) # Set annotation font size
# plt.xlabel('Predicted labels', fontsize=18) # Set x-axis label font size
# plt.ylabel('True labels', fontsize=18) # Set y-axis label font size
# plt.title('Confusion Matrix', fontsize=20) # Set title font size
# plt.xticks(fontsize=18) # Set x-axis tick font size
# plt.yticks(fontsize=18) # Set y-axis tick font size
# plt.show()


In [35]:
!pip install stellargraph

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [36]:
# Stack train rows on top of validation rows, for features and labels alike,
# so that row i of X corresponds to row i of y.
X = np.concatenate([new_traning_data, new_valid_dataset], axis=0)
y = np.concatenate([y_train, y_valid], axis=0)

In [37]:
# Shapes of the extracted feature matrices: training first, then validation
for _features in (new_traning_data, new_valid_dataset):
    print(_features.shape)

(1204, 768)
(542, 768)


In [46]:
import stellargraph as sg
from stellargraph.layer import GraphSAGE
from stellargraph.mapper import GraphSAGENodeGenerator
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint

# One row per example; the DataFrame index doubles as the graph node id.
df = pd.DataFrame(X)

# Construct a graph from the DataFrame.
# NOTE(review): only node features are supplied and no edges are given, so
# neighbour sampling has no real neighbours to draw from - confirm this is the
# intended graph.
graph = sg.StellarGraph(nodes=df)

# Split node ids and labels into a training part and a held-out part.
# shuffle=False keeps the original row order, so the held-out part is the tail
# of X (the last ~21.6% of rows).
# NOTE(review): this rebinds y_train, which an earlier cell uses to build y;
# re-running that cell after this one will pick up the split labels instead.
X_train, X_test, y_train, y_test = train_test_split(df.index, y, test_size=0.2160, shuffle=False)



# Define the GraphSAGE model: two sampling hops (10 then 5 neighbours)
generator = GraphSAGENodeGenerator(graph, batch_size=8, num_samples=[10, 5])
train_gen = generator.flow(X_train, y_train)

graphsage_model = GraphSAGE(
    layer_sizes=[64, 32],
    generator=generator,
    bias=True,
    dropout=0.2,
    normalize="l2"
)

# Build the model: GraphSAGE embedding followed by a 3-way softmax classifier
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=3, activation="softmax")(x_out)

# NOTE(review): this rebinds `model`, replacing the transformer classifier
# defined earlier in the notebook.
model = models.Model(inputs=x_inp, outputs=prediction)
learning_rate = 0.001  # Set your desired learning rate here

optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
model.summary()


# Define a checkpoint callback.
# The metric is registered as "acc" in compile(), so Keras logs the validation
# value under "val_acc". Monitoring "val_accuracy" would never find the metric
# and the best model would never be written.
checkpoint_path = "/kaggle/working/best_model_gnn.h5"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)



Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 10, 768)]    0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 50, 768)]    0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 1, 768)]     0                                            
__________________________________________________________________________________________________
reshape_12 (Reshape)            (None, 1, 10, 768)   0           input_11[0][0]                   
____________________________________________________________________________________________

In [66]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score, precision_score, recall_score


def _held_out_flow():
    """Return a fresh generator sequence over the held-out node ids and labels."""
    return generator.flow(X_test, y_test)


# Train the model using the generator.
# NOTE(review): the same held-out nodes serve as validation data here and as
# the test set below.
history = model.fit(
    train_gen,
    epochs=5,
    validation_data=_held_out_flow(),
    verbose=1,
    callbacks=[checkpoint],  # checkpoint callback
)

# Evaluate the model
test_loss, test_acc = model.evaluate(_held_out_flow())
print('Test accuracy:', test_acc)

# Class probabilities for the held-out nodes
y_pred = model.predict(_held_out_flow())

# Converting probabilities (and one-hot targets) to integer class labels
y_pred_labels = y_pred.argmax(axis=1)
y_true_labels = y_test.argmax(axis=1)

# Weighted-average F1 / precision / recall, each reported as soon as it is computed
_scores = {}
for _title, _scorer in (("F1 Score:", f1_score),
                        ("Precision:", precision_score),
                        ("Recall:", recall_score)):
    _scores[_title] = _scorer(y_true_labels, y_pred_labels, average='weighted')
    print(_title, _scores[_title])

f1 = _scores["F1 Score:"]
precision = _scores["Precision:"]
recall = _scores["Recall:"]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.7936508059501648
F1 Score: 0.7686060020716105
Precision: 0.807033149498903
Recall: 0.7936507936507936


In [68]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score, precision_score, recall_score


# Converting probabilities (and one-hot targets) to integer class labels
y_pred_labels = y_pred.argmax(axis=1)
y_true_labels = y_test.argmax(axis=1)

# Weighted-average scores, computed and printed one metric at a time
_weighted = {'average': 'weighted'}

f1 = f1_score(y_true_labels, y_pred_labels, **_weighted)
print("F1 Score:", f1)

precision = precision_score(y_true_labels, y_pred_labels, **_weighted)
print("Precision:", precision)

recall = recall_score(y_true_labels, y_pred_labels, **_weighted)
print("Recall:", recall)

F1 Score: 0.7686060020716105
Precision: 0.807033149498903
Recall: 0.7936507936507936
