# Part 1

In [1]:
import pandas as pd

In [2]:
def file_to_df(file_path):
    """
    Read data from a file, process it, and create a Pandas DataFrame.

    Parameters:
    file_path (str): The path to the file containing the data.

    Returns:
    DataFrame: A DataFrame containing processed data with columns 'x' and 'y'.
    """

    # Opens the file using file_path function. 
    with open(file_path, 'r', encoding='utf-8') as file:
        # Reads lines of file using readlines()
        lines = file.readlines()
        # So, the entire expression takes each line, removes leading/trailing whitespace,
        # splits it into parts based on spaces (up to 2 splits), and then extracts the first two parts.
        # The result is a list of pairs of values from each line in the lines list. 
        data = [line.strip().split(' ', maxsplit=2)[:2] for line in lines]

    # Creating dataframe from the data list with 2 columns, x and y.
    df = pd.DataFrame(data, columns=['x', 'y'])

    # Drop rows where value for y = None 
    df = df.dropna(subset=['y'])

    # Convert the data type of columns 'x' and 'y' to strings
    df['x'] = df['x'].astype(str)
    df['y'] = df['y'].astype(str)

    # Display the dataframe
    return df

# file_to_df function called with the file path
file_path_train_es = 'Data/ES/train'

# Stores resulting dataframe in df_train_es
df_train_es = file_to_df(file_path_train_es)

In [3]:
def count_y(df, y_value):
    """
    Count the occurrences of a specific value in the 'y' column of a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    y_value: The value for which the count is to be determined.

    Returns:
    int: The count of occurrences of the specified y_value.
    """

    # Calculate the count of each unique value in the 'y' column and convert it to a dictionary.
    unique_counts = df['y'].value_counts().to_dict()

    # Return the count of occurrences of the specified y_value
    print("unique counts:", unique_counts)
    return unique_counts[y_value]

In [4]:
def y_star_with_smallest_count(df):
    """
    Find the unique value(s) in the 'y_star' column of a DataFrame that have the smallest count.

    Parameters:
    df (DataFrame): The DataFrame containing the data.

    Returns:
    The unique value(s) in the 'y_star' column with the smallest count.
    """

    # Calculate the count of each unique value in the 'y_star' column and convert it to a dictionary.
    unique_counts = df['y_star'].value_counts().to_dict()
    
    # Find the smallest count value among the unique counts.
    min_count = min(unique_counts.values())

    # Construct a list of unique value(s) with the smallest count.
    y_with_min_count = [key for key, value in unique_counts.items() if value == min_count]

    # Return the first value from the list (if there are multiple values with the smallest count).
    return y_with_min_count[0]

In [5]:
def create_df_filtered_for_y_value(df, y_value):
    """
    Create a filtered DataFrame containing rows where the 'y' column matches a specific value.

    Parameters:
    df (DataFrame): The original DataFrame containing the data.
    y_value: The value to filter rows based on in the 'y' column.

    Returns:
    DataFrame: A new DataFrame with rows filtered for the specified y_value in the 'y' column.
    """

    return df[df['y'] == y_value]

In [6]:
def create_df_x_count_y_to_x(df):
    """
    Create a DataFrame that shows the count of occurrences of each 'x' value in the original DataFrame.

    Parameters:
    df (DataFrame): The original DataFrame containing the data.

    Returns:
    DataFrame: A new DataFrame with 'x' values and their corresponding counts.
    """

    # Calculate the count of occurrences of each 'x' value and reset the index.
    df_x_count_y_to_x = df['x'].value_counts().reset_index()

    # Rename columns to 'x' and 'count_y_to_x' for clarity.
    df_x_count_y_to_x.columns = ['x', 'count_y_to_x']

    return df_x_count_y_to_x

In [7]:
def create_ls_of_all_y_values(df):
    """
    Create a list of all unique values in the 'y' column of the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame containing the data.

    Returns:
    list: A list containing all unique values in the 'y' column.
    """

    # Retrieve the unique values from the 'y' column.
    unique_values = df['y'].unique()

    # Return the list of unique 'y' values.
    return unique_values
    
print(create_ls_of_all_y_values(df_train_es))

['O' 'B-positive' 'B-negative' 'B-neutral' 'I-neutral' 'I-positive'
 'I-negative']


In [8]:
def create_df_e_x_y_train(train_df, y_value):
    """
    Create a DataFrame containing conditional probabilities and 'y' values for specific conditions.

    Parameters:
    train_df (DataFrame): The training DataFrame containing the data.
    y_value: The specific 'y' value for which conditional probabilities are calculated.

    Returns:
    DataFrame: A new DataFrame with conditional probabilities and 'y' values.
    """

    # Filter the training DataFrame for the specified y_value.
    df_train_filtered_for_y = create_df_filtered_for_y_value(train_df, y_value)

    # Create a DataFrame showing the count of occurrences of each 'x' value for the specified y_value.
    df_e_x_y_train = create_df_x_count_y_to_x(df_train_filtered_for_y)

    # Conditional probability formula - P(x|y) = P(x ∩ y) / P(y)
    # Calculate conditional probabilities 'e(x|y)' by dividing
    # the count of occurrences of each 'x' value for the specific 'y' value.
    # by the count of occurrences of the specific y_value
    df_e_x_y_train['e(x|y)'] = df_e_x_y_train['count_y_to_x'] / (count_y(train_df, y_value))

    # Add 'y' column to the DataFrame with the specified y_value.
    df_e_x_y_train['y'] = y_value
    
    return df_e_x_y_train

# Create a DataFrame containing conditional probabilities for a specific 'y' value.
df_e_x_y_train_for_I_neutral = create_df_e_x_y_train(df_train_es, "B-positive")

# Print the DataFrame showing conditional probabilities.
print(df_e_x_y_train_for_I_neutral)

# Calculate and print the sum of conditional probabilities from the DataFrame. 
# Should add to 1 according to the characteristics of a probability distribution.
print(df_e_x_y_train_for_I_neutral['e(x|y)'].sum())

unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
               x  count_y_to_x    e(x|y)           y
0         comida           169  0.145690  B-positive
1       servicio           122  0.105172  B-positive
2    restaurante            46  0.039655  B-positive
3          trato            44  0.037931  B-positive
4       ambiente            33  0.028448  B-positive
..           ...           ...       ...         ...
280       dorada             1  0.000862  B-positive
281      detalle             1  0.000862  B-positive
282   cantidades             1  0.000862  B-positive
283       Atteca             1  0.000862  B-positive
284        menus             1  0.000862  B-positive

[285 rows x 4 columns]
1.0000000000000002


In [9]:
def create_df_e_x_y_test(test_df, y_value, train_df):
    """
    Create a DataFrame containing conditional probabilities 'e(x|y)' for a specific 'y' value using test and training data.

    Parameters:
    test_df (DataFrame): The test DataFrame containing the data to be evaluated.
    y_value: The specific 'y' value for which conditional probabilities are calculated.
    train_df (DataFrame): The training DataFrame containing the data used for reference.

    Returns:
    DataFrame: A new DataFrame with 'x' values, calculated conditional probabilities 'e(x|y)', and a sum of probabilities.
    """

    # Initializing k = 1
    k = 1

    # Filtering the test DataFrame for the specified 'y' value
    df_test_filtered_for_y = create_df_filtered_for_y_value(test_df, y_value)

    # Creating a DataFrame with count of occurrences of each 'x' value for the specified 'y' value
    # for the filtered test DataFrame
    df_e_x_y_test = create_df_x_count_y_to_x(df_test_filtered_for_y)

    # Calculating the total count of 'y' values plus a constant k (1)
    count_y_plus_k = count_y(test_df, y_value) + k

    # Extracting the 'x' values from the training DataFrame
    train_df_x_values = train_df['x'].tolist()

    # Handling unknown 'x' values by replacing them with "#UNK#" if not present in the training set
    df_e_x_y_test['x'] = df_e_x_y_test['x'].apply(lambda x: x if x in train_df_x_values else "#UNK#")

    # Calculating conditional probabilities 'e(x|y)' based on the 'x' values
    df_e_x_y_test['e(x|y)'] = df_e_x_y_test.apply(lambda row: (row['count_y_to_x'] / count_y_plus_k) if row['x'] != '#UNK#' else (k / count_y_plus_k), axis=1)

    # Printing the sum of conditional probabilities
    # Should add to 1 according to the characteristics of the probability distribution
    print(df_e_x_y_test['e(x|y)'].sum())

    # Returning the DataFrame containing calculated conditional probabilities
    return df_e_x_y_test

# Define the file path for the test data
file_path_test_es = 'Data/ES/dev.out'

# Load the test data from the specified file path into a DataFrame
df_test_es = file_to_df(file_path_test_es)

# Create a DataFrame containing conditional probabilities for a specific 'y' value using test and training data
df_e_x_y_test_for_I_neutral = create_df_e_x_y_test(df_test_es, "B-positive", df_train_es)

# Print the DataFrame showing calculated conditional probabilities
print(df_e_x_y_test_for_I_neutral)

unique counts: {'O': 3998, 'B-positive': 160, 'B-negative': 61, 'I-positive': 49, 'I-negative': 36, 'B-neutral': 8}
0.9937888198757765
               x  count_y_to_x    e(x|y)
0       servicio            24  0.149068
1         comida            23  0.142857
2    restaurante             8  0.049689
3       ambiente             7  0.043478
4         platos             6  0.037267
..           ...           ...       ...
67      cocinero             1  0.006211
68        tartar             1  0.006211
69       postres             1  0.006211
70  localización             1  0.006211
71         comer             1  0.006211

[72 rows x 3 columns]


In [10]:
def create_e_x_y_df_train_all_y_values(file_path):
    """
    Create a combined DataFrame of conditional probabilities 'e(x|y)' for all unique 'y' values in the provided dataset.

    Parameters:
    file_path (str): The file path to the dataset containing 'x' and 'y' values.

    Returns:
    DataFrame: A DataFrame containing calculated conditional probabilities for each 'x' value and unique 'y' value.
    """

    # Load the data from the specified file path into a DataFrame
    df_train = file_to_df(file_path)

    # Initialize an empty list to store DataFrames containing conditional probabilities
    ls_df_train = []

    # Create a list of all unique 'y' values in the DataFrame
    ls_y_values = create_ls_of_all_y_values(df_train)

    # Iterate through each unique 'y' value
    for y_value in ls_y_values:
        # Skip the loop iteration if y_value is None
        if y_value is not None:
            # Calculate conditional probabilities for the current 'y' value and add to the list
            df_e_x_y = create_df_e_x_y_train(df_train, y_value)
            # Append the conditional probability for that y value to the list
            ls_df_train.append(df_e_x_y)

    # Concatenate the list of DataFrames into a single DataFrame, combining the rows (stacking the DataFrames)
    combined_df_train = pd.concat(ls_df_train, axis=0)
    return combined_df_train

file_path_train_es = 'Data/ES/train'
print(create_e_x_y_df_train_all_y_values(file_path_train_es))

unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
          x  count_y_to_x  

In [11]:
def create_df_x_to_y_star(file_path):

    # Calculate conditional probabilities for all 'y' values using the training data
    e_x_y_df_train = create_e_x_y_df_train_all_y_values(file_path)

    # Group by 'x' and find the maximum 'e(x|y)' value for each group
    df_x_to_y_star = e_x_y_df_train.groupby('x')['e(x|y)'].max().reset_index()

    # Find the corresponding 'y' values for the maximum 'e(x|y)' values
    df_x_to_y_star = pd.merge(df_x_to_y_star, e_x_y_df_train, on=['x', 'e(x|y)'])

    # Rename the columns
    df_x_to_y_star.columns = ['x', 'max_e(x|y)', 'count_y_to_x', 'y_star']

    # Return the resulting DataFrame
    return df_x_to_y_star

file_path_train_es = 'Data/ES/train'
create_df_x_to_y_star(file_path_train_es)

unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}


Unnamed: 0,x,max_e(x|y),count_y_to_x,y_star
0,!,0.005442,158,O
1,"""",0.011696,2,I-negative
2,%,0.000413,12,O
3,(,0.004512,131,O
4,),0.004512,131,O
...,...,...,...,...
5035,“,0.003185,1,I-positive
5036,”,0.003185,1,I-positive
5037,…,0.002625,1,B-negative
5038,″,0.000034,1,O


In [12]:
def generate_y_values(file_path_dev_in, file_path_train, file_path_dev_p1_out):
    """
    Generate 'y' values using conditional probabilities and write the results to an output file.

    Parameters:
    file_path_dev_in (str): The file path to the input dataset containing 'x' values.
    file_path_train (str): The file path to the training dataset containing 'x' and 'y' values.
    file_path_dev_p1_out (str): The file path to write the generated 'y' values.

    Returns:
    None
    """
    
    # Generate a DataFrame using the create_df_x_to_y_star function
    df_train = create_df_x_to_y_star(file_path_train)

    # Convert the x values in df_train to a list
    x_values = df_train['x'].tolist()
    
    # The y_label that has the smallest count_y will generate the biggest k / count_y_plus_k,
    # which will give the maximum e(x|y), meaning that y_label is y_star
    y_label = y_star_with_smallest_count(df_train)
    
    # Open the input file for reading and use UTF-8 encoding
    with open(file_path_dev_in, 'r', encoding='utf-8') as file:

        # Read all lines from the file and store them in a list
        lines = file.readlines()

        # Iterate through each line in the list of lines
        for l in range(len(lines)):
            
            # Remove leading/trailing whitespace and store the current line
            line = lines[l].strip()

            # Check if the current line exists in the list of 'x_values'
            if line in x_values:

                # Get corresponding 'y_star' value(s) for the 'x' value in the current line
                possible_y_values = df_train[df_train['x'] == line]['y_star'].tolist()

                # Append the first 'y_star' value to the current line
                lines[l] = line + " " + possible_y_values[0]

                # Check if there are multiple 'y_star' values, indicating a potential issue
                if len(possible_y_values) != 1:
                    print("Something wrong: x_values in df_train not unique for some reason, for line:", l)
            
            # If the current line is not in 'x_values'
            else:
                 
                # Check if the line is not empty
                if line != "\n":

                    # Append the default 'y_label' to the current line
                    line = line + " " + y_label

                    # Update the current line in the list
                    lines[l] = line
    
    # Open the output file for writing using UTF-8 encoding
    with open(file_path_dev_p1_out, 'w', encoding='utf-8') as file:
        
        # Write each modified line back to the output file
        for line in lines:
            print(line)
            if line != "\n":
                file.write(line + '\n')
            else:
                file.write(line)


file_path_dev_in_es = 'Data/ES/dev.in'
file_path_train_es = 'Data/ES/train'
file_path_dev_p1_out_es = 'Data/ES/dev.p1.out'
generate_y_values(file_path_dev_in_es, file_path_train_es, file_path_dev_p1_out_es)

unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
unique counts: {'O': 29035, 'B-positive': 1160, 'B-negative': 381, 'I-positive': 314, 'I-negative': 171, 'B-neutral': 72, 'I-neutral': 43}
Plato B-negative
degustació

In [13]:
file_path_dev_p1_out_es = 'Data/ES/dev.p1.out'
file_dev_out_es = 'Data/ES/dev.out'
df_dev_p1_out_es = file_to_df(file_path_dev_p1_out_es)
df_dev_out_es = file_to_df(file_dev_out_es)
print(df_dev_p1_out_es.shape[0])
print(df_dev_out_es.shape[0])

df1 = df_dev_p1_out_es
df2 = df_dev_out_es

# Compare rows
comparison = df1.equals(df2)

# Get the count of rows that are the same
count_same = df1[df1.eq(df2).all(axis=1)].shape[0]

# Get the count of rows that are not the same
count_not_same = df1[df1.ne(df2).any(axis=1)].shape[0]

# Get the rows that are not the same
rows_not_same = df1[~df1.eq(df2).all(axis=1)]

total_number_of_correctly_predicted_entries = count_same
total_number_of_predicted_entities = df1.shape[0]
precision = total_number_of_correctly_predicted_entries / total_number_of_predicted_entities
print(precision)

# Display the results
# print("Comparison result:", comparison)
# print("Count of rows that are the same:", count_same)
# print("Count of rows that are not the same:", count_not_same)
# print("Rows that are not the same:\n", rows_not_same)

4312
4312
0.6307977736549165


# Part 2

## ES Dataset

In [14]:
import pandas as pd

# 1. Extracting data
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().strip().split('\n\n')
    return [sentence.split('\n') for sentence in data]

# Adjusting the paths for ES and RU datasets
train_data = load_data('Data/ES/train')
dev_in_data = load_data('Data/ES/dev.in')
with open('Data/ES/dev.out', 'r', encoding='utf-8') as f:
    dev_tags_actual = [sentence.split() for sentence in f.read().strip().split('\n\n')]

def compute_probabilities(data, state_list):
    # Initialize Count Dictionaries
    start_transition_count = {state: 0 for state in state_list}
    transition_count = {state: {state2: 0 for state2 in state_list} for state in state_list}
    emission_count = {state: {} for state in state_list}
    state_count = {state: 0 for state in state_list}

    for sentence in data:
        prev_state = None
        for line in sentence:
            word, state = line.strip().split()
            
            # Update start_transition_count for the first word-state pair in a sentence
            if prev_state is None:
                start_transition_count[state] += 1
            else:
                # Update transition_count for subsequent word-state pairs
                transition_count[prev_state][state] += 1

            # Update state_count and emission_count for every word-state pair
            state_count[state] += 1
            emission_count[state][word] = emission_count[state].get(word, 0) + 1
            prev_state = state

    total_sentences = len(data)
    
    # Compute Probabilities
    start_transition_prob = {state: count / total_sentences for state, count in start_transition_count.items()}
    transition_prob = {state: {state2: count2 / state_count[state] for state2, count2 in count.items()} for state, count in transition_count.items()}
    emission_prob = {state: {word: count / state_count[state] for word, count in state_emission_count.items()} for state, state_emission_count in emission_count.items()}

    return start_transition_prob, transition_prob, emission_prob


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    
    # Initialization step
    for st in states:
        V[0][st] = {"prob": start_p.get(st, 0) * emit_p[st].get(obs[0], 0.00001), "prev": None}
    
    # Recursive step
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            # Get max transition probability and corresponding previous state in one go
            max_trans_prob, prev_st_selected = max(
                (V[t-1][prev_st]["prob"] * trans_p[prev_st].get(st, 0.00001), prev_st) for prev_st in states
            )
            
            max_prob = max_trans_prob * emit_p[st].get(obs[t], 0.00001)  # Added a small value to avoid multiplying by 0
            V[t][st] = {"prob": max_prob, "prev": prev_st_selected}
    
    # Termination step
    opt = []
    max_prob, previous = max((value["prob"], st) for st, value in V[-1].items())
    opt.append(previous)

    # Backtrack step
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]
    
    return opt



def extract_entities_from_tags(tags):
    entities = []
    entity = []
    for tag in tags:
        if tag.startswith("B-"):
            if entity:
                entities.append(tuple(entity))
                entity = []
            entity.append(tag)
        elif tag.startswith("I-"):
            entity.append(tag)
        else:
            if entity:
                entities.append(tuple(entity))
                entity = []
    if entity:
        entities.append(tuple(entity))
    return set(entities)

TP = 0
FP = 0
FN = 0

states = {}
observations = {}

for sentence in train_data:
    for line in sentence:
        word, tag = line.strip().split()
        states[tag] = states.get(tag, 0) + 1
        if tag not in observations:
            observations[tag] = {}
        observations[tag][word] = observations[tag].get(word, 0) + 1

state_list = list(states.keys())

state_list = list(states.keys())
start_transition_prob, transition_prob, emission_prob = compute_probabilities(train_data, state_list)
predicted_tags_viterbi = [viterbi([word.split()[0] for word in sentence], state_list, start_transition_prob, transition_prob, emission_prob) for sentence in dev_in_data]

for pred, actual in zip(predicted_tags_viterbi, dev_tags_actual):
    predicted_entities = extract_entities_from_tags(pred)
    actual_entities = extract_entities_from_tags(actual)
    TP += len(predicted_entities.intersection(actual_entities))
    FP += len(predicted_entities - actual_entities)
    FN += len(actual_entities - predicted_entities)

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f_score = 2 * precision * recall / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F-score:", f_score)

Precision: 0.5517241379310345
Recall: 0.42780748663101603
F-score: 0.4819277108433735


In [15]:
predicted_tags_viterbi

[['B-negative',
  'I-negative',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-positive',
  'I-positive',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-positive',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'B-positive', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-positive',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'B-positive',
  'O',
  'O',
  'O',
  'O',
  'B-positive',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-negative',
  'I-negative',
  'I-negative',
  'I-negative',
  'I-negative',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'B-positive', 'I-positiv

In [16]:
dev_in_data

[['Plato',
  'degustación',
  ':',
  'un',
  'poco',
  'abundante',
  'de',
  'más',
  ',',
  'pero',
  'bien',
  'cocinado',
  '.'],
 ['restaurante', 'excelente', 'con', 'carne', 'de', 'alta', 'calidad', '.'],
 ['Las',
  'posibilidades',
  'en',
  'el',
  'restaurante',
  'son',
  'fundamentalmente',
  'tres',
  ';',
  'carta',
  'normal',
  ',',
  'menú',
  'degustacion',
  'y',
  'una',
  'opción',
  'intermedia',
  'que',
  'es',
  'una',
  'selección',
  'de',
  'primeros',
  'y',
  'postres',
  'y',
  'carta',
  'para',
  'el',
  'segundo',
  '.'],
 ['No', 'perderse', 'el', 'sorbete', 'de', 'mojito', '.'],
 ['para', 'mi', 'perfecto', '!'],
 ['Devolucion',
  'a',
  'cocina',
  ',',
  'amabilidad',
  'de',
  'camarera',
  ',',
  'requerimiento',
  'de',
  'cuenta',
  'y',
  'adios',
  '.'],
 ['Así',
  'como',
  'el',
  'romesco',
  ',',
  'que',
  'era',
  'un',
  'poco',
  '"',
  'de',
  'bote',
  '"',
  '.'],
 ['Destacar',
  'los',
  'arroces',
  ',',
  'la',
  'caldereta',
  'de

In [17]:
## predicted = []
file_path_dev_p2_out = 'Data/ES/dev.p2.out'

# Open the output file for writing using UTF-8 encoding
with open(file_path_dev_p2_out, 'w', encoding='utf-8') as file:
    for s in range(len(dev_in_data)):
        for wi in range(len(dev_in_data[s])):
            line = dev_in_data[s][wi]+" "+predicted_tags_viterbi[s][wi]
            if line != "\n":
                file.write(line + '\n')
            else:
                file.write(line)
        file.write('\n')


# Part 4

We can modify the approach by using an n-th order HMM, whereby the HMM can depend on not only the current state but also the previous n-states. In this case, we have adopted n=2.

In [75]:
#set to RU/ES
lang = 'RU'
#set to test/dev
flag = 'test'


if(flag=='test'):
    dev_in_data = load_data(f'Test/{lang}/{flag}.in')
    dev_out_data = f'Test/{lang}/{flag}.p4.out'
    

elif(flag=='dev'):
    dev_in_data = load_data(f'Data/{lang}/{flag}.in')
    dev_out_data = f'Data/{lang}/{flag}.p4.out'
    with open(f'Data/{lang}/dev.out', 'r', encoding='utf-8') as f:
        dev_tags_actual = [sentence.split() for sentence in f.read().strip().split('\n\n')]

# Adjusting the paths for ES and RU datasets
train_data = load_data(f'Data/{lang}/train')





In [76]:
states = {}
observations = {}

for sentence in train_data:
    for line in sentence:
        word, tag = line.strip().split(maxsplit=1)
        states[tag] = states.get(tag, 0) + 1
        if tag not in observations:
            observations[tag] = {}
        observations[tag][word] = observations[tag].get(word, 0) + 1

state_list = list(states.keys())

In [77]:
from collections import defaultdict

def compute_probabilities(data, state_list):
    # Initialize counts
    start_count = defaultdict(int)
    transition_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    emission_count = defaultdict(lambda: defaultdict(int))

    # Populate counts
    for sentence in data:
        if len(sentence) < 3:  # Skip sentences that are too short
            continue

        # Splitting and extracting the word and state for first two words/states
        word1, state1 = sentence[0].strip().split()
        word2, state2 = sentence[1].strip().split()
        start_count[state1] += 1
        emission_count[state1][word1] += 1
        emission_count[state2][word2] += 1

        # Rest of the sentence
        for i in range(2, len(sentence)):
            word3, state3 = sentence[i].strip().split(maxsplit=1)
            transition_count[state1][state2][state3] += 1
            emission_count[state3][word3] += 1
            state1, state2 = state2, state3

    # Convert counts to probabilities
    # Start transition probabilities
    start_transition_prob = {state: count / sum(start_count.values()) for state, count in start_count.items()}

    # Transition probabilities
    transition_prob = {}
    for s1, s1_dict in transition_count.items():
        transition_prob[s1] = {}
        for s2, s2_dict in s1_dict.items():
            transition_prob[s1][s2] = {}
            for s3, count in s2_dict.items():
                total = sum(s2_dict.values())
                transition_prob[s1][s2][s3] = count / total if total != 0 else 0.0

    # Emission probabilities
    emission_prob = {}
    for state, word_count in emission_count.items():
        emission_prob[state] = {}
        total = sum(word_count.values())
        for word, count in word_count.items():
            emission_prob[state][word] = count / total

    return start_transition_prob, transition_prob, emission_prob


In [78]:
state_list = list(states.keys())
start_transition_prob, transition_prob, emission_prob = compute_probabilities(train_data, state_list)

In [79]:
transition_prob

{'B-positive': {'O': {'O': 0.9153963414634146,
   'B-positive': 0.08307926829268293,
   'B-neutral': 0.001524390243902439},
  'I-positive': {'O': 0.6131805157593123,
   'I-positive': 0.3839541547277937,
   'B-positive': 0.0028653295128939827},
  'B-positive': {'O': 1.0}},
 'O': {'O': {'O': 0.948982402448355,
   'B-positive': 0.03727620504973221,
   'B-negative': 0.010130068859984697,
   'B-neutral': 0.003550114766641163,
   '.. O': 3.0604437643458305e-05,
   '... O': 3.0604437643458305e-05},
  'B-positive': {'I-positive': 0.20557029177718833,
   'O': 0.7924403183023873,
   'B-positive': 0.001989389920424403},
  'B-negative': {'I-negative': 0.1931216931216931, 'O': 0.8068783068783069},
  'B-neutral': {'O': 0.8591549295774648, 'I-neutral': 0.14084507042253522},
  '.. O': {'O': 1.0},
  '... O': {'O': 1.0}},
 'I-positive': {'O': {'O': 0.8397212543554007,
   'B-positive': 0.1602787456445993},
  'I-positive': {'O': 0.5425101214574899, 'I-positive': 0.4574898785425101},
  'B-positive': {'I-po

In [84]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    # Initialize the Viterbi matrix. 
    V = [{}]

    # Initialize the first column of the matrix with the start probabilities
    for st in states:
        V[0][st] = {"prob": start_p.get(st, 0) * emit_p[st].get(obs[0], 0), "prev": None}

    # Main loop through the observations updating the Viterbi matrix
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            # For each state, find the maximum transition probability 
            # considering all possible previous state combinations.
            max_trans_prob, prev_st1_max, prev_st2_max = max(
                (V[t-1][prev_st1]["prob"] * trans_p[prev_st1].get(prev_st2, {}).get(st, 0), prev_st1, prev_st2)
                for prev_st1 in states for prev_st2 in states
            )

            # Multiply the max transition probability with emission probability
            max_prob = max_trans_prob * emit_p[st].get(obs[t], 0)

            # Store the maximum probability and previous state information
            V[t][st] = {"prob": max_prob, "prev": (prev_st1_max, prev_st2_max)}

    # Now, backtrack to find the most probable sequence of states
    opt = []

    # Find the state with the maximum probability for the last observation
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None

    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break

    # Backtrack through the Viterbi matrix to find the sequence of states
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"][1])
        previous = V[t + 1][previous]["prev"][1]

    # Return the most probable sequence of states
    return opt


In [85]:
predicted_tags_viterbi = [viterbi([word.split()[0] for word in sentence], state_list, start_transition_prob, transition_prob, emission_prob) for sentence in dev_in_data]


In [86]:
def extract_entities_from_tags(tags):
    entities = []
    entity = []
    for tag in tags:
        if tag.startswith("B-"):
            if entity:
                entities.append(tuple(entity))
                entity = []
            entity.append(tag)
        elif tag.startswith("I-"):
            entity.append(tag)
        else:
            if entity:
                entities.append(tuple(entity))
                entity = []
    if entity:
        entities.append(tuple(entity))
    return set(entities)

TP = 0
FP = 0
FN = 0


if(flag=='dev'):
    for pred, actual in zip(predicted_tags_viterbi, dev_tags_actual):
        predicted_entities = extract_entities_from_tags(pred)
        actual_entities = extract_entities_from_tags(actual)
        TP += len(predicted_entities.intersection(actual_entities))
        FP += len(predicted_entities - actual_entities)
        FN += len(actual_entities - predicted_entities)

    if TP + FP == 0:
        precision = 1.0  # or 0.0, depending on how you want to define it in this case
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 1.0  # or 0.0
    else:
        recall = TP / (TP + FN)

    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * precision * recall / (precision + recall)


    print("Precision:", precision)
    print("Recall:", recall)
    print("F-score:", f_score)

In [87]:
## predicted = []

# Open the output file for writing using UTF-8 encoding
with open(dev_out_data, 'w', encoding='utf-8') as file:
    for s in range(len(dev_in_data)):
        for wi in range(len(dev_in_data[s])):
            line = dev_in_data[s][wi]+" "+predicted_tags_viterbi[s][wi]
            if line != "\n":
                file.write(line + '\n')
            else:
                file.write(line)
        file.write('\n')