# Part 1

In [18]:
import pandas as pd

In [32]:
def file_to_df(file_path):
    """
    Read a labelled token file and return its rows as a Pandas DataFrame.

    Every usable line has the form "<x> <y>": the label 'y' is the last
    whitespace-separated field and 'x' is everything in front of it, so a
    token that itself contains a space is kept whole. Lines with fewer than
    two fields (blank sentence separators, bare tokens) are skipped.

    Parameters:
    file_path (str): The path to the file containing the data.

    Returns:
    DataFrame: A DataFrame with string columns 'x' and 'y'. The index holds the
    zero-based line number each row was read from, so gaps mark skipped lines.
    """

    rows = []
    line_numbers = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file):
            # Split once from the right: the label is always the last field.
            parts = line.strip().rsplit(maxsplit=1)
            if len(parts) == 2:
                rows.append(parts)
                line_numbers.append(line_number)

    # Building the frame only from complete rows also works when no line
    # has a label at all (the result is then an empty frame).
    df = pd.DataFrame(rows, columns=['x', 'y'], index=line_numbers)

    # Keep both columns as plain strings.
    df['x'] = df['x'].astype(str)
    df['y'] = df['y'].astype(str)

    return df

# Path to the ES training file, relative to the working directory.
file_path_train_es = "Data/ES/train"
# Parse the training file into an ('x', 'y') DataFrame; later cells reuse df_train_es.
df_train_es = file_to_df(file_path_train_es)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/RU/train'

In [12]:
def count_y(df, y_value):
    """
    Count the occurrences of a specific value in the 'y' column of a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    y_value: The value for which the count is to be determined.

    Returns:
    int: The count of occurrences of the specified y_value (0 if it never occurs).
    """

    # Compare the column against the value and count the matches; a value that
    # is absent simply yields no matches instead of raising a KeyError.
    return int((df['y'] == y_value).sum())

In [13]:
def y_star_with_smallest_count(df):
    """
    Return the value of the 'y_star' column that occurs least often.

    Parameters:
    df (DataFrame): The DataFrame containing a 'y_star' column.

    Returns:
    The least frequent 'y_star' value. When several values share the smallest
    count, the first one in value_counts() order is returned.
    """

    # Map every distinct 'y_star' value to the number of rows carrying it.
    occurrences = df['y_star'].value_counts().to_dict()

    # The smallest number of rows any value has.
    fewest = min(occurrences.values())

    # Walk the values in value_counts() order and stop at the first one
    # that reaches the smallest count.
    for label, occurrence_count in occurrences.items():
        if occurrence_count == fewest:
            return label

In [19]:
def create_df_filtered_for_y_value(df, y_value):
    """
    Select the rows of a DataFrame whose 'y' column equals a given value.

    Parameters:
    df (DataFrame): The original DataFrame containing the data.
    y_value: The value the 'y' column must equal for a row to be kept.

    Returns:
    DataFrame: The matching rows, with their original index labels.
    """

    # Boolean mask: True for every row labelled with y_value.
    is_match = df['y'] == y_value

    return df[is_match]

In [20]:
def create_df_x_count_y_to_x(df):
    """
    Tabulate how many times each 'x' value occurs in a DataFrame.

    Parameters:
    df (DataFrame): The original DataFrame containing the data.

    Returns:
    DataFrame: One row per distinct 'x' value with columns 'x' and
    'count_y_to_x', in value_counts() order (most frequent first).
    """

    occurrences = df['x'].value_counts()

    # Name the index 'x' and the counts 'count_y_to_x' while turning the
    # Series into a two-column DataFrame.
    return occurrences.rename_axis('x').reset_index(name='count_y_to_x')

In [21]:
def create_ls_of_all_y_values(df):
    """
    Create a list of all unique values in the 'y' column of the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame containing the data.

    Returns:
    list: The unique values of the 'y' column, in order of first appearance.
    """

    # unique() returns an array; convert it so the function really returns
    # the list its name and documentation promise.
    return df['y'].unique().tolist()
    
# Show the distinct 'y' labels found in the ES training data.
print(create_ls_of_all_y_values(df_train_es))

NameError: name 'df_train_es' is not defined

In [None]:
def create_df_e_x_y_train(train_df, y_value):
    """
    Build the table of emission estimates e(x|y) for one label from training data.

    Parameters:
    train_df (DataFrame): The training DataFrame with columns 'x' and 'y'.
    y_value: The label for which the estimates are computed.

    Returns:
    DataFrame: One row per 'x' seen with y_value, with columns 'x',
    'count_y_to_x', 'e(x|y)' and 'y'.
    """

    # Keep only the training rows labelled y_value, then count each 'x' among them.
    rows_with_label = create_df_filtered_for_y_value(train_df, y_value)
    emission_table = create_df_x_count_y_to_x(rows_with_label)

    # e(x|y) = count(y -> x) / count(y)
    label_total = count_y(train_df, y_value)
    emission_table['e(x|y)'] = emission_table['count_y_to_x'] / label_total

    # Record which label these estimates belong to.
    emission_table['y'] = y_value

    return emission_table

# Build the e(x|y) table for the label "B-positive" from the ES training data.
# NOTE(review): the variable name says "I_neutral" but the label passed is "B-positive".
df_e_x_y_train_for_I_neutral = create_df_e_x_y_train(df_train_es, "B-positive")

# Print the DataFrame showing conditional probabilities.
print(df_e_x_y_train_for_I_neutral)

# Calculate and print the sum of conditional probabilities from the DataFrame. 
# Should add to 1 according to the characteristics of a probability distribution.
print(df_e_x_y_train_for_I_neutral['e(x|y)'].sum())

: 

In [None]:
def create_df_e_x_y_test(test_df, y_value, train_df):
    """
    Create a DataFrame of smoothed estimates 'e(x|y)' for one 'y' value of the test data.

    Counts are taken from test_df. Every 'x' that never appears in train_df is
    relabelled "#UNK#" and receives k / (count(y) + k); every other 'x'
    receives count(y -> x) / (count(y) + k), with k = 1.

    Parameters:
    test_df (DataFrame): The test DataFrame containing the data to be evaluated.
    y_value: The specific 'y' value for which the estimates are calculated.
    train_df (DataFrame): The training DataFrame whose 'x' values define the known vocabulary.

    Returns:
    DataFrame: One row per distinct test 'x' with columns 'x', 'count_y_to_x' and 'e(x|y)'.
    """

    # Smoothing constant.
    k = 1

    # Count each 'x' among the test rows labelled y_value.
    df_test_filtered_for_y = create_df_filtered_for_y_value(test_df, y_value)
    df_e_x_y_test = create_df_x_count_y_to_x(df_test_filtered_for_y)

    # Denominator shared by every estimate: count(y) + k.
    count_y_plus_k = count_y(test_df, y_value) + k

    # Replace every 'x' that is absent from the training vocabulary by "#UNK#".
    # isin() does the membership test in one hashed pass instead of scanning a
    # Python list once per row.
    is_known = df_e_x_y_test['x'].isin(train_df['x'].unique())
    df_e_x_y_test['x'] = df_e_x_y_test['x'].where(is_known, "#UNK#")

    # Known words get count / (count(y) + k); "#UNK#" rows get k / (count(y) + k).
    is_unknown = df_e_x_y_test['x'] == '#UNK#'
    relative_counts = df_e_x_y_test['count_y_to_x'] / count_y_plus_k
    df_e_x_y_test['e(x|y)'] = relative_counts.mask(is_unknown, k / count_y_plus_k)

    # Print the sum of the estimates as a sanity check. Each unknown word keeps
    # its own "#UNK#" row, so the sum is not guaranteed to be exactly 1.
    print(df_e_x_y_test['e(x|y)'].sum())

    return df_e_x_y_test

# Path to the labelled ES development file, used here as the test data.
file_path_test_es = 'Data/ES/dev.out'

# Load the test data from the specified file path into a DataFrame
df_test_es = file_to_df(file_path_test_es)

# Build the smoothed e(x|y) table for the label "B-positive" from the test and training data.
# NOTE(review): the variable name says "I_neutral" but the label passed is "B-positive".
df_e_x_y_test_for_I_neutral = create_df_e_x_y_test(df_test_es, "B-positive", df_train_es)

# Print the DataFrame showing calculated conditional probabilities
print(df_e_x_y_test_for_I_neutral)

: 

In [None]:
def create_e_x_y_df_train_all_y_values(file_path):
    """
    Build one stacked table of emission estimates 'e(x|y)' covering every label in a dataset.

    Parameters:
    file_path (str): The file path to the dataset containing 'x' and 'y' values.

    Returns:
    DataFrame: The per-label tables produced by create_df_e_x_y_train, stacked row-wise.
    """

    # Parse the labelled file once.
    df_train = file_to_df(file_path)

    # One emission table per distinct label; a None label is ignored.
    per_label_tables = [
        create_df_e_x_y_train(df_train, label)
        for label in create_ls_of_all_y_values(df_train)
        if label is not None
    ]

    # Stack the per-label tables on top of each other.
    return pd.concat(per_label_tables, axis=0)

# Print the stacked e(x|y) table for every label of the ES training data.
file_path_train_es = 'Data/ES/train'
print(create_e_x_y_df_train_all_y_values(file_path_train_es))

: 

In [None]:
def create_df_x_to_y_star(file_path):
    """
    For every 'x' in the training data, find the label with the largest 'e(x|y)'.

    Parameters:
    file_path (str): The file path to the training dataset.

    Returns:
    DataFrame: Columns 'x', 'max_e(x|y)', 'count_y_to_x' and 'y_star'. An 'x'
    appears on several rows when more than one label reaches its maximum.
    """

    # Emission estimates for every (x, y) pair seen in training.
    emission_table = create_e_x_y_df_train_all_y_values(file_path)

    # Largest e(x|y) per 'x'.
    best_per_x = emission_table.groupby('x', as_index=False)['e(x|y)'].max()

    # Join back on both keys to recover the rows (and therefore the labels)
    # that attain that maximum.
    x_to_y_star = best_per_x.merge(emission_table, on=['x', 'e(x|y)'])

    # Positional rename of the four resulting columns.
    x_to_y_star.columns = ['x', 'max_e(x|y)', 'count_y_to_x', 'y_star']

    return x_to_y_star

# Build the x -> y_star table for the ES training data; as the last expression
# of the cell its value is what the notebook displays.
file_path_train_es = 'Data/ES/train'
create_df_x_to_y_star(file_path_train_es)

: 

In [None]:
def generate_y_values(file_path_dev_in, file_path_train, file_path_dev_p1_out):
    """
    Label every token of an input file with its most likely 'y' and write the result.

    A token seen in training receives its 'y_star' from create_df_x_to_y_star.
    A token never seen in training receives the rarest training label. Blank
    lines (sentence separators) are copied through unchanged.

    Parameters:
    file_path_dev_in (str): The file path to the input dataset containing 'x' values.
    file_path_train (str): The file path to the training dataset containing 'x' and 'y' values.
    file_path_dev_p1_out (str): The file path to write the generated 'y' values.

    Returns:
    None
    """

    # Table mapping every training 'x' to its best label.
    df_train = create_df_x_to_y_star(file_path_train)

    # Turn the table into a dict for constant-time lookups. When an 'x' has
    # several rows (tied maxima) the first one wins and the 'x' is remembered
    # so the situation can be reported.
    y_star_by_x = {}
    ambiguous_x = set()
    for x_value, y_star in zip(df_train['x'], df_train['y_star']):
        if x_value in y_star_by_x:
            ambiguous_x.add(x_value)
        else:
            y_star_by_x[x_value] = y_star

    # For an unknown token the smoothed estimate is k / (count(y) + k), which is
    # largest for the label with the smallest count(y) in the training data.
    y_label = file_to_df(file_path_train)['y'].value_counts().idxmin()

    # Read the tokens, one per line, without surrounding whitespace.
    with open(file_path_dev_in, 'r', encoding='utf-8') as file:
        tokens = [line.strip() for line in file]

    output_lines = []
    for line_number, token in enumerate(tokens):

        # A blank line separates two sentences and must stay blank.
        if not token:
            output_lines.append('\n')
            continue

        if token in ambiguous_x:
            print("Something wrong: x_values in df_train not unique for some reason, for line:", line_number)

        output_lines.append(f"{token} {y_star_by_x.get(token, y_label)}\n")

    # Write the labelled lines using UTF-8 encoding.
    with open(file_path_dev_p1_out, 'w', encoding='utf-8') as file:
        file.writelines(output_lines)


# Label the ES development input with the training-derived table and write
# the predictions to Data/ES/dev.p1.out.
file_path_dev_in_es = 'Data/ES/dev.in'
file_path_train_es = 'Data/ES/train'
file_path_dev_p1_out_es = 'Data/ES/dev.p1.out'
generate_y_values(file_path_dev_in_es, file_path_train_es, file_path_dev_p1_out_es)

: 

In [None]:
# Load the predicted labels (dev.p1.out) and the gold labels (dev.out) for ES.
file_path_dev_p1_out_es = 'Data/ES/dev.p1.out'
file_dev_out_es = 'Data/ES/dev.out'
df_dev_p1_out_es = file_to_df(file_path_dev_p1_out_es)
df_dev_out_es = file_to_df(file_dev_out_es)
# Row counts of both frames, printed so a size mismatch is visible.
print(df_dev_p1_out_es.shape[0])
print(df_dev_out_es.shape[0])

df1 = df_dev_p1_out_es
df2 = df_dev_out_es

# True only if the two frames are identical as a whole.
comparison = df1.equals(df2)

# Number of rows whose 'x' and 'y' both equal the row with the same index label in df2.
count_same = df1[df1.eq(df2).all(axis=1)].shape[0]

# Number of rows that differ from df2 in at least one column.
count_not_same = df1[df1.ne(df2).any(axis=1)].shape[0]

# The differing rows themselves.
rows_not_same = df1[~df1.eq(df2).all(axis=1)]

# NOTE(review): this is the share of rows (tokens) whose label matches the gold
# file, i.e. a per-token score rather than a per-entity precision.
total_number_of_correctly_predicted_entries = count_same
total_number_of_predicted_entities = df1.shape[0]
precision = total_number_of_correctly_predicted_entries / total_number_of_predicted_entities
print(precision)

# Optional diagnostics, disabled by default.
# print("Comparison result:", comparison)
# print("Count of rows that are the same:", count_same)
# print("Count of rows that are not the same:", count_not_same)
# print("Rows that are not the same:\n", rows_not_same)

: 

# Part 2

## ES Dataset

In [None]:
# Working for both ES and RU
import pandas as pd
import re

def load_dev_in_data(file_path):
    """Load a dev.in style file (one word per line) as a list of sentences.

    Sentences are separated by an empty line. Whitespace-only lines inside a
    sentence are dropped; the remaining lines are returned unmodified.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        blocks = file.read().strip().split('\n\n')

    sentences = []
    for block in blocks:
        words = []
        for word in block.split('\n'):
            if word.strip():
                words.append(word)
        sentences.append(words)
    return sentences

def load_data_modified_v7(file_path):
    """Load a labelled file as a list of sentences of "<word> <tag>" strings.

    Sentences are separated by an empty line. Within a sentence, the tag is the
    last run of non-whitespace characters of a line and the word is everything
    before the whitespace character preceding it. Blank lines and lines that do
    not have that shape are skipped, and sentences left without any line are
    omitted from the result.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_sentences = file.read().strip().split('\n\n')

    def rebuild(line):
        # Return the line rewritten as "<word> <tag>", or None if it has no tag.
        found = re.search(r'^(.*)\s(\S+)$', line)
        if found is None:
            return None
        return f"{found.group(1)} {found.group(2)}"

    sentences = []
    for raw_sentence in raw_sentences:
        rebuilt = [rebuild(line) for line in raw_sentence.split('\n') if line.strip()]
        kept = [entry for entry in rebuilt if entry is not None]
        if kept:
            sentences.append(kept)
    return sentences

def viterbi(obs, states, start_p, trans_p, emit_p):
    """Return the most likely state sequence for a sequence of observations.

    Parameters:
    obs (list): The observed words of one sentence.
    states (list): The candidate states, in the order used to break ties.
    start_p (dict): state -> probability of starting a sentence in that state.
    trans_p (dict): previous state -> {next state -> transition probability}.
    emit_p (dict): state -> {word -> emission probability}.

    Probabilities missing from start_p or from the inner dictionaries count as
    0. When several states are equally good, the one listed first in `states`
    is chosen.

    Returns:
    list: One state per observation; an empty list for an empty `obs`.
    """
    # Nothing to decode: without this guard obs[0] below would raise.
    if len(obs) == 0:
        return []

    # V[t][state] holds the probability of the best path ending in `state` at
    # step t, and the state that path came from.
    V = [{
        st: {"prob": start_p.get(st, 0) * emit_p[st].get(obs[0], 0), "prev": None}
        for st in states
    }]

    for t in range(1, len(obs)):
        previous_column = V[-1]
        column = {}
        for st in states:
            # Best predecessor of `st`; max() keeps the first state on ties.
            best_prev = max(
                states,
                key=lambda prev_st: previous_column[prev_st]["prob"] * trans_p[prev_st].get(st, 0),
            )
            best_trans_prob = previous_column[best_prev]["prob"] * trans_p[best_prev].get(st, 0)
            column[st] = {
                "prob": best_trans_prob * emit_p[st].get(obs[t], 0),
                "prev": best_prev,
            }
        V.append(column)

    # Start from the most probable final state and follow the back-pointers.
    last_column = V[-1]
    current = max(last_column, key=lambda st: last_column[st]["prob"])
    opt = [current]
    for t in range(len(V) - 1, 0, -1):
        current = V[t][current]["prev"]
        opt.append(current)

    # The path was collected from the end, so put it back in reading order.
    opt.reverse()
    return opt

def compute_probabilities_v2(data, state_list):
    """Estimate start, transition and emission probabilities from labelled sentences.

    Parameters:
    data (list): Sentences, each a list of "<word> <tag>" strings. Lines
        without a tag are ignored.
    state_list (list): Every tag that can occur in `data`.

    Returns:
    tuple: (start_transition_prob, transition_prob, emission_prob) where
        start_transition_prob[state] = sentences starting in state / number of sentences,
        transition_prob[prev][state] = count(prev -> state) / count(prev),
        emission_prob[state][word]   = count(state emits word) / count(state).
        A ratio whose denominator is 0 is reported as 0.0.
    """
    start_transition_count = {state: 0 for state in state_list}
    transition_count = {state: {state2: 0 for state2 in state_list} for state in state_list}
    emission_count = {state: {} for state in state_list}
    state_count = {state: 0 for state in state_list}

    for sentence in data:
        prev_state = None
        for line in sentence:
            match = re.search(r'^(.*)\s(\S+)$', line.strip())
            if not match:
                continue
            word, state = match.groups()

            if prev_state is None:
                start_transition_count[state] += 1
            else:
                transition_count[prev_state][state] += 1

            # A word is emitted by the state of its own token (including the
            # first token of the sentence), never by the preceding state.
            emission_count[state][word] = emission_count[state].get(word, 0) + 1
            state_count[state] += 1
            prev_state = state

    def ratio(count, total):
        # Guard against tags that never occur and against empty data.
        return count / total if total else 0.0

    total_sentences = len(data)
    start_transition_prob = {
        state: ratio(count, total_sentences)
        for state, count in start_transition_count.items()
    }
    transition_prob = {
        state: {state2: ratio(count2, state_count[state]) for state2, count2 in counts.items()}
        for state, counts in transition_count.items()
    }
    emission_prob = {
        state: {word: ratio(count, state_count[state]) for word, count in word_counts.items()}
        for state, word_counts in emission_count.items()
    }

    return start_transition_prob, transition_prob, emission_prob

def extract_entities_from_tags(tags, with_positions=False):
    """Collect the entities described by a sequence of B-/I-/other tags.

    An entity starts at a "B-" tag (or at an "I-" tag that does not follow an
    open entity) and continues over the "I-" tags that follow it. Any other
    tag closes the open entity.

    Parameters:
    tags (list): The tags of one sentence, in order.
    with_positions (bool): When False (default) each entity is the tuple of its
        tags. When True the index of the entity's first tag is prepended, so
        entities with the same tags at different places stay distinct.

    Returns:
    set: The entities as tuples.
    """
    entities = set()
    entity = []
    for position, tag in enumerate(tags):
        starts_entity = tag.startswith("B-")
        continues_entity = tag.startswith("I-")

        # Anything but an "I-" tag ends the entity that is currently open.
        if entity and not continues_entity:
            entities.add(tuple(entity))
            entity = []

        if starts_entity or continues_entity:
            if with_positions and not entity:
                entity.append(position)
            entity.append(tag)

    if entity:
        entities.add(tuple(entity))
    return entities


def _read_tags(file_path):
    """Return, for each sentence of a labelled file, the last field of every non-blank line."""
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.read().strip().split('\n\n')
    return [
        [line.split()[-1] for line in sentence.split('\n') if line.strip()]
        for sentence in sentences
    ]


def _entity_scores(predicted_tags, actual_tags):
    """Return (precision, recall, f_score) over entities matched by position and tags."""
    TP = 0
    FP = 0
    FN = 0
    for pred, actual in zip(predicted_tags, actual_tags):
        predicted_entities = extract_entities_from_tags(pred, with_positions=True)
        actual_entities = extract_entities_from_tags(actual, with_positions=True)
        TP += len(predicted_entities & actual_entities)
        FP += len(predicted_entities - actual_entities)
        FN += len(actual_entities - predicted_entities)

    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
    return precision, recall, f_score


def process_dataset_final_v5(dataset_type):
    """Train the HMM on one dataset, decode its dev.in with Viterbi and score the result.

    Parameters:
    dataset_type (str): Name of the dataset folder below "Data/" (e.g. "ES").

    Returns:
    tuple: (precision, recall, f_score) of the predicted entities against dev.out.
    """
    # Adjusting the paths dynamically based on dataset type
    train_path = f"Data/{dataset_type}/train"
    dev_in_path = f"Data/{dataset_type}/dev.in"
    dev_out_path = f"Data/{dataset_type}/dev.out"

    train_data = load_data_modified_v7(train_path)
    dev_in_data = load_dev_in_data(dev_in_path)

    # Only the tags of dev.out are wanted here; keeping the words as well would
    # make every word look like a non-entity tag and cut entities apart.
    dev_tags_actual = _read_tags(dev_out_path)

    # States in order of first appearance in the training data.
    states = {}
    for sentence in train_data:
        for line in sentence:
            match = re.search(r'^(.*)\s(\S+)$', line.strip())
            if match:
                states.setdefault(match.group(2), None)
    state_list = list(states)

    start_transition_prob, transition_prob, emission_prob = compute_probabilities_v2(train_data, state_list)

    # An empty sentence has nothing to decode and is passed through as no tags.
    predicted_tags_viterbi = [
        viterbi(sentence, state_list, start_transition_prob, transition_prob, emission_prob) if sentence else []
        for sentence in dev_in_data
    ]

    return _entity_scores(predicted_tags_viterbi, dev_tags_actual)

# Train, decode and score both datasets; each result is a
# (precision, recall, f_score) tuple.
es_results_final_v10 = process_dataset_final_v5("ES")
ru_results_final_v10 = process_dataset_final_v5("RU")

# Bare expression: has no effect on its own in the middle of a cell.
es_results_final_v10, ru_results_final_v10

# Print the precision, recall, and F-score for the "ES" dataset
print("ES Dataset Results:")
print("Precision:", es_results_final_v10[0])
print("Recall:", es_results_final_v10[1])
print("F-score:", es_results_final_v10[2])

# Print the precision, recall, and F-score for the "RU" dataset
print("\nRU Dataset Results:")
print("Precision:", ru_results_final_v10[0])
print("Recall:", ru_results_final_v10[1])
print("F-score:", ru_results_final_v10[2])

: 

In [None]:
import numpy as np
import math
import copy

def _estimate_start_and_transition(dataset_type):
    """Return (start probabilities as a Series, transition matrix as a DataFrame) for one dataset.

    The transition DataFrame is indexed by the previous state and has one
    column per next state.
    """
    train_data = load_data_modified_v7(f"Data/{dataset_type}/train")

    # load_data_modified_v7 yields "<word> <tag>" strings, so the tag is the
    # text after the last space. dict.fromkeys keeps first-seen order.
    state_list = list(dict.fromkeys(
        line.rsplit(' ', 1)[-1] for sentence in train_data for line in sentence
    ))

    start_prob, trans_prob, _ = compute_probabilities_v2(train_data, state_list)
    return pd.Series(start_prob), pd.DataFrame(trans_prob).T


def _load_emission_table(csv_path):
    """Read an (x, y, e(x|y)) CSV and pivot it into a y-by-x table, 0 where no value exists."""
    df_emission = pd.read_csv(csv_path)
    # pivot() needs unique (x, y) pairs; keep the last row of each pair.
    df_emission = df_emission.drop_duplicates(subset=['x', 'y'], keep='last')
    return df_emission.pivot(index='y', columns='x', values='e(x|y)').fillna(0)


# Each dataset gets parameters estimated from its own training file. The
# probabilities computed inside process_dataset_final_v5 are local to that
# function and are not available at this level.
sr_start_probabilities_es, df_transition_es = _estimate_start_and_transition("ES")
df_emission_es = _load_emission_table('Data/ES/csv_dev_out_es_test_e_x_y.csv')
display(df_emission_es)

sr_start_probabilities_ru, df_transition_ru = _estimate_start_and_transition("RU")
df_emission_ru = _load_emission_table('Data/RU/csv_dev_out_ru_test_e_x_y.csv')
display(df_emission_ru)

In [None]:
def modified_viterbi(observation, transition, emission, start_probabilities, kth_best):
    """Return the k-th most probable state sequence for a sequence of observations.

    Parameters:
    observation (list): The observed words of one sentence.
    transition (DataFrame): Rows are previous states, columns are next states.
    emission (DataFrame): Rows are states, columns are words; must contain a
        "#UNK#" column, which is used for every word without its own column.
    start_probabilities (Series): Probability of starting in each state.
    kth_best (int): 1 for the best sequence, 2 for the second best, and so on.

    Returns:
    tuple: (probability, list of states). If fewer than kth_best sequences
    exist, the least probable existing one is returned. An empty observation
    yields (0, []).

    Raises:
    ValueError: If kth_best is smaller than 1.
    """
    if kth_best < 1:
        raise ValueError("kth_best must be a positive integer")
    if len(observation) == 0:
        return (0, [])

    states = transition.index.tolist()

    # Plain nested dict {previous: {next: probability}}: much cheaper to read
    # inside the loops than DataFrame.loc.
    transition_of = transition.to_dict(orient='index')

    def emission_column(word):
        # Emission probabilities of `word` for every state; unknown words share "#UNK#".
        return emission[word] if word in emission.columns else emission["#UNK#"]

    def by_probability(candidate):
        return candidate[0]

    # After each step, preceding[state] holds up to kth_best
    # (probability, path) pairs for paths ending in `state`, best first.
    # The k-th best full path can use any of the kth_best best partial paths
    # of a state, so all of them must be kept.
    first_column = emission_column(observation[0])
    preceding = {
        state: [(first_column[state] * start_probabilities[state], [state])]
        for state in states
    }

    for word in observation[1:]:
        column = emission_column(word)
        current = {}

        for current_state in states:
            emission_param = column[current_state]

            # Every kept partial path, extended by current_state.
            candidates = [
                (probability * transition_of[previous_state][current_state] * emission_param, path)
                for previous_state in states
                for probability, path in preceding[previous_state]
            ]
            candidates.sort(key=by_probability, reverse=True)

            # Only the survivors get a new path list.
            current[current_state] = [
                (probability, path + [current_state])
                for probability, path in candidates[:kth_best]
            ]

        preceding = current

    combined_list = sorted(
        (candidate for sequences in preceding.values() for candidate in sequences),
        key=by_probability,
        reverse=True,
    )
    if not combined_list:
        return (0, [])

    return combined_list[min(kth_best, len(combined_list)) - 1]

In [3]:
def load_data(file_path):
    """Load a file as a list of sentences, each a list of its raw lines.

    Sentences are separated by an empty line; leading and trailing whitespace
    of the whole file is removed before splitting.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    sentences = []
    for block in content.strip().split('\n\n'):
        sentences.append(block.split('\n'))
    return sentences

es_observations = load_data('Data/ES/dev.in')
ru_observations = load_data('Data/RU/dev.in')


def _kth_best_tags(observations, transition, emission, start_probabilities, kth_best):
    """Return the kth_best tag sequence of every sentence in `observations`."""
    return [
        modified_viterbi(
            [word.split()[0] for word in sentence],
            transition, emission, start_probabilities, kth_best,
        )[1]
        for sentence in observations
    ]


def _prediction_lines(observations, predicted_tags):
    """Return "<word> <tag>" lines for every sentence, each sentence followed by a '\\n' entry."""
    lines = []
    for sentence, tags in zip(observations, predicted_tags):
        lines.extend(word + " " + tag for word, tag in zip(sentence, tags))
        # The separator belongs to the list being built for this very
        # dataset and rank.
        lines.append('\n')
    return lines


def _write_prediction_lines(file_path, lines):
    """Write the lines to file_path, one per line; '\\n' entries become blank lines."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for line in lines:
            if line != "\n":
                file.write(line + '\n')
            else:
                file.write(line)


predicted_tags_viterbi_ES2 = _kth_best_tags(es_observations, df_transition_es, df_emission_es, sr_start_probabilities_es, 2)
predicted_tags_viterbi_ES8 = _kth_best_tags(es_observations, df_transition_es, df_emission_es, sr_start_probabilities_es, 8)
predicted_tags_viterbi_RU2 = _kth_best_tags(ru_observations, df_transition_ru, df_emission_ru, sr_start_probabilities_ru, 2)
predicted_tags_viterbi_RU8 = _kth_best_tags(ru_observations, df_transition_ru, df_emission_ru, sr_start_probabilities_ru, 8)

predicted_ES2 = _prediction_lines(es_observations, predicted_tags_viterbi_ES2)
predicted_ES8 = _prediction_lines(es_observations, predicted_tags_viterbi_ES8)
predicted_RU2 = _prediction_lines(ru_observations, predicted_tags_viterbi_RU2)
predicted_RU8 = _prediction_lines(ru_observations, predicted_tags_viterbi_RU8)

_write_prediction_lines('Data/ES/dev.p3.2nd.out', predicted_ES2)
_write_prediction_lines('Data/ES/dev.p3.8th.out', predicted_ES8)
_write_prediction_lines('Data/RU/dev.p3.2nd.out', predicted_RU2)
_write_prediction_lines('Data/RU/dev.p3.8th.out', predicted_RU8)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/ES/dev.in'