### DATA PREPROCESSING

In [None]:
import os
import csv
import xml.etree.ElementTree as ET

# Mapping of each destination folder to the CSV base names it should contain
# and the raw SemEval XML file each CSV is generated from.
# This dictionary drives the entire script.
file_mappings = {
    'data/processed/laptop_14': {
        'data_training': 'data/raw/SemEval 2014 laptops - training.xml',
        'data_validation': 'data/raw/SemEval 2014 laptops - validation.xml'
    },
    'data/processed/restaurants_14': {
        'data_training': 'data/raw/SemEval 2014 restaurants - training.xml',
        'data_validation': 'data/raw/SemEval 2014 restaurants - validation.xml'
    },
    'data/processed/restaurants_15': {
        'data_training': 'data/raw/SemEval 2015 restaurants - training.xml',
        'data_validation': 'data/raw/SemEval 2015 restaurants - validation.xml'
    },
    'data/processed/restaurants_16': {
        'data_training': 'data/raw/SemEval16_Restaurants_Train.xml',
        'data_validation': 'data/raw/SemEval16_Restaurants_Test.xml'
    }
}

# Column names written as the first row of every generated CSV.
CSV_HEADER = ['sentence', 'aspect_term', 'sentiment', 'from', 'to']


def find_sentences(root):
    """Return the list of <sentence> elements reachable from *root*.

    Two layouts are recognised: a <sentences> root holding <sentence>
    children directly, and a <Reviews> root holding
    <Review>/<sentences>/<sentence>. Any other root tag yields an empty list.
    """
    if root.tag == 'sentences':
        return root.findall('sentence')

    found = []
    if root.tag == 'Reviews':
        for review in root.findall('Review'):
            sentences_tag = review.find('sentences')
            if sentences_tag is not None:
                found.extend(sentences_tag.findall('sentence'))
    return found


def extract_rows(sentence):
    """Return the CSV rows ([text, term, polarity, from, to]) for one <sentence>.

    A sentence without a <text> element (or with empty text) produces no rows.
    Both annotation layouts are handled: <aspectTerms>/<aspectTerm> (read from
    the 'term' attribute) and <Opinions>/<Opinion> (read from the 'target'
    attribute; targets equal to 'NULL', case-insensitively, are skipped).
    """
    text_element = sentence.find('text')
    if text_element is None or text_element.text is None:
        return []
    sentence_text = text_element.text.strip()

    rows = []

    # Structure used in the 2014 datasets: <aspectTerms><aspectTerm/></aspectTerms>
    aspect_terms_element = sentence.find('aspectTerms')
    if aspect_terms_element is not None:
        for aspect_term in aspect_terms_element.findall('aspectTerm'):
            term = aspect_term.get('term')
            polarity = aspect_term.get('polarity')
            if term and polarity:
                rows.append([sentence_text, term, polarity,
                             aspect_term.get('from'), aspect_term.get('to')])

    # Structure used in the 2015/2016 datasets: <Opinions><Opinion/></Opinions>
    opinions_element = sentence.find('Opinions')
    if opinions_element is not None:
        for opinion in opinions_element.findall('Opinion'):
            target = opinion.get('target')
            polarity = opinion.get('polarity')
            # Offsets must come from the <Opinion> element itself; reading them
            # from a leftover `aspect_term` variable raised NameError (or reused
            # stale offsets) on files that only contain <Opinions>.
            if target and target.lower() != 'null' and polarity:
                rows.append([sentence_text, target, polarity,
                             opinion.get('from'), opinion.get('to')])

    return rows


def convert_file(input_xml, output_csv_path):
    """Convert one XML file to CSV; return True on success, False if skipped.

    The XML is parsed before the CSV is opened, so a missing or malformed
    input never leaves an empty CSV behind.
    """
    try:
        root = ET.parse(input_xml).getroot()
    except (ET.ParseError, FileNotFoundError) as e:
        print(f"    ERROR: Could not process file {input_xml}. Reason: {e}")
        return False

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(CSV_HEADER)
        for sentence in find_sentences(root):
            csv_writer.writerows(extract_rows(sentence))
    return True


def convert_all(mappings):
    """Create every output folder in *mappings* and convert its XML files.

    Returns the number of input files that had to be skipped because they
    were missing or could not be parsed.
    """
    print("Starting the XML to CSV conversion process...")

    skipped_files = 0
    for output_folder, files in mappings.items():
        # Create the destination folder (e.g., 'laptop_14') if it doesn't already exist.
        os.makedirs(output_folder, exist_ok=True)
        print(f"\nCreated/Verified folder: '{output_folder}'")

        for output_name, input_xml in files.items():
            output_csv_path = os.path.join(output_folder, f"{output_name}.csv")
            print(f"  - Parsing '{input_xml}' -> '{output_csv_path}'")
            if not convert_file(input_xml, output_csv_path):
                skipped_files += 1

    if skipped_files:
        # Do not claim full success when some inputs were skipped.
        print(f"\nProcessing complete. {skipped_files} file(s) were skipped due to errors.")
    else:
        print("\nProcessing complete. All files have been converted.")
    return skipped_files


# --- Main script execution starts here ---
# The guard is true both for a notebook cell and for `python script.py`,
# while letting the helpers above be imported without side effects.
if __name__ == "__main__":
    convert_all(file_mappings)


### ZERO SHOT

In [None]:
from huggingface_hub import InferenceClient
import csv
import time
import os

# Sentiment classes the prompt allows the model to answer with.
VALID_SENTIMENTS = ('positive', 'negative', 'neutral')
# Label recorded when a model reply cannot be mapped to exactly one valid class.
UNKNOWN_SENTIMENT = 'unknown'

MODELS = [
    'meta-llama/Meta-Llama-3-8B-Instruct',
    'meta-llama/Meta-Llama-3-70B-Instruct',
    'meta-llama/Meta-Llama-3.1-8B-Instruct',
    'meta-llama/Meta-Llama-3.3-70B-Instruct',
]
DATASETS = ['laptop_14', 'restaurants_14', 'restaurants_15', 'restaurants_16']


def normalize_prediction(raw_text):
    """Map a raw model reply to one of VALID_SENTIMENTS or UNKNOWN_SENTIMENT.

    Case, surrounding whitespace, quotes and punctuation are ignored. A reply
    is accepted only when it mentions exactly one valid class as a whole word;
    empty, missing or ambiguous replies become UNKNOWN_SENTIMENT so that
    scoring never fails on unexpected model output.
    """
    cleaned = (raw_text or '').strip().lower()
    if cleaned in VALID_SENTIMENTS:
        return cleaned
    # Replace every non-letter with a space so "'Positive.'" -> {"positive"}.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in cleaned).split())
    mentioned = [label for label in VALID_SENTIMENTS if label in words]
    return mentioned[0] if len(mentioned) == 1 else UNKNOWN_SENTIMENT


def accuracy_percent(gold, predicted):
    """Return the percentage of positions where *gold* equals *predicted*.

    Returns 0.0 for an empty *gold* list instead of dividing by zero.
    """
    if not gold:
        return 0.0
    correct_predictions = sum(1 for g, p in zip(gold, predicted) if g == p)
    return correct_predictions / len(gold) * 100


def build_zero_shot_prompt(sentence_text, term, from_, to_):
    """Return the zero-shot prompt for one (sentence, aspect term) pair."""
    return f"""
        Instruction:
        Analyze the sentiment of the aspect term within the given sentence. The aspect term is highlighted by quotes. Your answer must be one of the following three options: 'positive', 'negative', 'neutral'. Do not provide any explanation or other text.
        Sentence:
        "{sentence_text}"
        Aspect Term:
        "{term}"

        Location of the aspect term in the sentence is from {from_} character to {to_} character.
        
        Sentiment:
        """


def evaluate_dataset(client, dataset):
    """Query *client* for every usable row of a dataset's validation CSV.

    Returns (gold_labels, predicted_labels, skipped_rows). Rows missing the
    sentence, aspect term or sentiment are ignored. Rows whose gold sentiment
    is not in VALID_SENTIMENTS (e.g. 'conflict') are counted in skipped_rows
    and not sent to the model: the prompt only allows the three classes, and
    such labels previously raised KeyError during scoring.

    Raises FileNotFoundError if the validation CSV does not exist.
    """
    data_validation_path = os.path.join('data', 'processed', dataset, 'data_validation.csv')
    gold_labels = []
    predicted_labels = []
    skipped_rows = 0

    with open(data_validation_path, 'r', newline='', encoding='utf-8') as infile:
        # Iterate through every row of data_validation.csv.
        for row in csv.DictReader(infile):
            sentence_text = row.get('sentence')
            term = row.get('aspect_term')
            original_polarity = row.get('sentiment')

            if not sentence_text or not term or not original_polarity:
                continue

            gold = original_polarity.strip().lower()
            if gold not in VALID_SENTIMENTS:
                skipped_rows += 1
                continue

            prompt = build_zero_shot_prompt(sentence_text, term, row.get('from'), row.get('to'))

            # Send the prompt
            response = client.chat_completion(messages=[{"role": "user", "content": prompt}], temperature=0.0)

            gold_labels.append(gold)
            predicted_labels.append(normalize_prediction(response.choices[0].message["content"]))
            # Pause between requests, as in the original loop.
            time.sleep(0.5)

    return gold_labels, predicted_labels, skipped_rows


# The guard is true both for a notebook cell and for `python script.py`,
# while letting the helpers above be imported without calling the API.
if __name__ == "__main__":
    from sklearn.metrics import f1_score

    start = time.time()
    for model_ in MODELS:
        # Initialize the client with the model you want to use
        client = InferenceClient(model=model_, token="YOUR HUGGINGFACE TOKEN")

        for dataset in DATASETS:
            try:
                sentiments_original, sentiments_predicted, skipped_rows = evaluate_dataset(client, dataset)
            except FileNotFoundError as e:
                # One missing dataset should not abort the whole multi-model run.
                print(f"\nERROR: Could not evaluate dataset {dataset}. Reason: {e}")
                continue

            if not sentiments_original:
                print(f"\nNo usable rows found for dataset {dataset}; skipping metrics.")
                continue

            accuracy = accuracy_percent(sentiments_original, sentiments_predicted)

            # Weighted F1 over the three valid classes; unparseable predictions
            # count as misses for the true class.
            f1 = f1_score(
                sentiments_original,
                sentiments_predicted,
                labels=list(VALID_SENTIMENTS),
                average='weighted',
                zero_division=0,
            )

            print(f"\n\n________________________")
            print(f"Model: {model_}")
            print(f"Dataset: {dataset}")
            print(f"Accuracy: {accuracy:.2f}%")
            print(f"F1 Score: {f1:.4f}")
            if skipped_rows:
                print(f"Skipped rows (gold label outside {VALID_SENTIMENTS}): {skipped_rows}")

    end = time.time()
    print(f"\n\nTotal execution time: {end - start:.2f} seconds")

### FEW SHOT

In [None]:
from huggingface_hub import InferenceClient
import csv
import time
import os

# Sentiment classes the prompt allows the model to answer with.
VALID_SENTIMENTS = ('positive', 'negative', 'neutral')
# Label recorded when a model reply cannot be mapped to exactly one valid class.
UNKNOWN_SENTIMENT = 'unknown'

MODELS = [
    'meta-llama/Meta-Llama-3-8B-Instruct',
    'meta-llama/Meta-Llama-3-70B-Instruct',
    'meta-llama/Meta-Llama-3.1-8B-Instruct',
    'meta-llama/Meta-Llama-3.3-70B-Instruct',
]
DATASETS = ['laptop_14', 'restaurants_14', 'restaurants_15', 'restaurants_16']


def normalize_prediction(raw_text):
    """Map a raw model reply to one of VALID_SENTIMENTS or UNKNOWN_SENTIMENT.

    Case, surrounding whitespace, quotes and punctuation are ignored. A reply
    is accepted only when it mentions exactly one valid class as a whole word;
    empty, missing or ambiguous replies become UNKNOWN_SENTIMENT so that
    scoring never fails on unexpected model output.
    """
    cleaned = (raw_text or '').strip().lower()
    if cleaned in VALID_SENTIMENTS:
        return cleaned
    # Replace every non-letter with a space so "'Positive.'" -> {"positive"}.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in cleaned).split())
    mentioned = [label for label in VALID_SENTIMENTS if label in words]
    return mentioned[0] if len(mentioned) == 1 else UNKNOWN_SENTIMENT


def accuracy_percent(gold, predicted):
    """Return the percentage of positions where *gold* equals *predicted*.

    Returns 0.0 for an empty *gold* list instead of dividing by zero.
    """
    if not gold:
        return 0.0
    correct_predictions = sum(1 for g, p in zip(gold, predicted) if g == p)
    return correct_predictions / len(gold) * 100


def build_few_shot_prompt(sentence_text, term, from_, to_):
    """Return the six-example few-shot prompt for one (sentence, term) pair.

    The example offsets use the same convention as the dataset's from/to
    values substituted into the [Task] section: 0-based start, exclusive end.
    The examples previously used 1-based starts, which contradicted the task.
    """
    return f"""
Instruction:
Analyze the sentiment of the aspect term within the given sentence. The aspect term is highlighted by quotes. Your answer must be one of the following three options: 'positive', 'negative', 'neutral'. Do not provide any explanation or other text.

[Example 1]
Sentence: "The ambiance is fantastic and the food is even better."
Aspect Term: "food"
Location of the aspect term in the sentence is from 34 character to 38 character.
Sentiment:
positive

[Example 2]
Sentence: "While the staff was friendly, the prices were outrageous."
Aspect Term: "prices"
Location of the aspect term in the sentence is from 34 character to 40 character.
Sentiment:
negative

[Example 3]
Sentence: "The laptop's screen is 14 inches."
Aspect Term: "screen"
Location of the aspect term in the sentence is from 13 character to 19 character.
Sentiment:
neutral

[Example 4]
Sentence: "The service was exceptionally quick and attentive."
Aspect Term: "service"
Location of the aspect term in the sentence is from 4 character to 11 character.
Sentiment:
positive

[Example 5]
Sentence: "I found the keyboard to be a bit cramped and uncomfortable for long typing sessions."
Aspect Term: "keyboard"
Location of the aspect term in the sentence is from 12 character to 20 character.
Sentiment:
negative

[Example 6]
Sentence: "The restaurant is located on the main street."
Aspect Term: "restaurant"
Location of the aspect term in the sentence is from 4 character to 14 character.
Sentiment:
neutral

[Task]
Sentence: "{sentence_text}"
Aspect Term: "{term}"
Location of the aspect term in the sentence is from {from_} character to {to_} character.
Sentiment:
"""


def evaluate_dataset(client, dataset):
    """Query *client* for every usable row of a dataset's validation CSV.

    Returns (gold_labels, predicted_labels, skipped_rows). Rows missing the
    sentence, aspect term or sentiment are ignored. Rows whose gold sentiment
    is not in VALID_SENTIMENTS (e.g. 'conflict') are counted in skipped_rows
    and not sent to the model: the prompt only allows the three classes, and
    such labels previously raised KeyError during scoring.

    Raises FileNotFoundError if the validation CSV does not exist.
    """
    data_validation_path = os.path.join('data', 'processed', dataset, 'data_validation.csv')
    gold_labels = []
    predicted_labels = []
    skipped_rows = 0

    with open(data_validation_path, 'r', newline='', encoding='utf-8') as infile:
        # Iterate through every row of data_validation.csv.
        for row in csv.DictReader(infile):
            sentence_text = row.get('sentence')
            term = row.get('aspect_term')
            original_polarity = row.get('sentiment')

            if not sentence_text or not term or not original_polarity:
                continue

            gold = original_polarity.strip().lower()
            if gold not in VALID_SENTIMENTS:
                skipped_rows += 1
                continue

            few_shot_prompt = build_few_shot_prompt(sentence_text, term, row.get('from'), row.get('to'))

            # Send the prompt
            response = client.chat_completion(messages=[{"role": "user", "content": few_shot_prompt}], temperature=0.0)

            gold_labels.append(gold)
            predicted_labels.append(normalize_prediction(response.choices[0].message["content"]))
            # Pause between requests, as in the original loop.
            time.sleep(0.5)

    return gold_labels, predicted_labels, skipped_rows


# The guard is true both for a notebook cell and for `python script.py`,
# while letting the helpers above be imported without calling the API.
if __name__ == "__main__":
    from sklearn.metrics import f1_score

    start = time.time()
    for model_ in MODELS:
        # Initialize the client with the model you want to use
        client = InferenceClient(model=model_, token="YOUR HUGGINGFACE TOKEN")

        for dataset in DATASETS:
            try:
                sentiments_original, sentiments_predicted, skipped_rows = evaluate_dataset(client, dataset)
            except FileNotFoundError as e:
                # One missing dataset should not abort the whole multi-model run.
                print(f"\nERROR: Could not evaluate dataset {dataset}. Reason: {e}")
                continue

            if not sentiments_original:
                print(f"\nNo usable rows found for dataset {dataset}; skipping metrics.")
                continue

            accuracy = accuracy_percent(sentiments_original, sentiments_predicted)

            # Weighted F1 over the three valid classes; unparseable predictions
            # count as misses for the true class.
            f1 = f1_score(
                sentiments_original,
                sentiments_predicted,
                labels=list(VALID_SENTIMENTS),
                average='weighted',
                zero_division=0,
            )

            print(f"\n\n________________________")
            print(f"Model: {model_}")
            print(f"Dataset: {dataset}")
            print(f"Accuracy: {accuracy:.2f}%")
            print(f"F1 Score: {f1:.4f}")
            if skipped_rows:
                print(f"Skipped rows (gold label outside {VALID_SENTIMENTS}): {skipped_rows}")

    end = time.time()
    print(f"\n\nTotal execution time: {end - start:.2f} seconds")