In [55]:
def read_file(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, encoding="utf-8") as source:
        return source.read()

def write_file(file_path, content):
    """Replace the contents of ``file_path`` with ``content``, encoded as UTF-8."""
    with open(file_path, mode="w", encoding="utf-8") as sink:
        sink.write(content)

In [71]:
def check(input_string, labels_set):
    """Report whether ``input_string`` is free of every label.

    Args:
        input_string: Text to search.
        labels_set: Iterable of label strings to look for as substrings.

    Returns:
        ``False`` as soon as any label occurs inside ``input_string``,
        ``True`` when none does (including when ``labels_set`` is empty).
    """
    # Always return a real bool; the previous bare ``return`` produced None.
    return not any(label in input_string for label in labels_set)

def estimate_emission_parameters(file_content):
    """Estimate unsmoothed emission probabilities from labelled text.

    Each non-empty line is expected to look like ``"<observation> <tag>"``,
    where the tag is whatever follows the last space on the line.

    Args:
        file_content: Full text of the labelled training data.

    Returns:
        A dict mapping each tag to a dict ``{observation: probability}``,
        where probability is count(tag, observation) / count(tag).
    """
    tags = {}  # tag -> {observation: count}

    for line in file_content.strip().split("\n"):
        # Split on the LAST space so observations that themselves contain
        # spaces stay intact and the separator never leaks into the key.
        observation, separator, tag = line.rstrip().rpartition(" ")
        if not (separator and observation and tag):
            # Blank or malformed line: skip it instead of reusing the values
            # parsed from the previous line.
            continue
        counts = tags.setdefault(tag, {})
        counts[observation] = counts.get(observation, 0) + 1

    # Normalise the counts of every tag into probabilities.
    emission_params = {}
    for tag, observations_count in tags.items():
        total_count = sum(observations_count.values())
        emission_params[tag] = {
            observation: count / total_count
            for observation, count in observations_count.items()
        }

    return emission_params

# Load the ES training file and build the unsmoothed emission table from it.
file_content = read_file("ES/train.txt")

emission_parameters = estimate_emission_parameters(file_content)
# Debug aid: uncomment to print the table together with the total number of
# (tag, observation) entries it holds.
# print(emission_parameters, sum(len(words) for words in emission_parameters.values()))

In [101]:
def estimate_emission_parameters_modified(training_file_content, test_file_content, k=1):
    """Estimate emission probabilities with a pseudo-count for unseen words.

    Training lines are expected to look like ``"<observation> <tag>"`` (the
    tag is whatever follows the last space). Test lines may be bare
    observations or carry a trailing tag that was seen in training.

    Args:
        training_file_content: Full text of the labelled training data.
        test_file_content: Full text of the test data.
        k: Pseudo-count added to each tag's denominator and used as the
            numerator for every word that occurs only in the test data.

    Returns:
        A tuple ``(emission_params, words)``: ``emission_params`` maps each
        tag to ``{observation: probability}`` and ``words`` is a list of all
        distinct observations from both inputs (order unspecified).
    """
    tags = {}  # tag -> {observation: count}
    train_words = set()  # sets give O(1) membership checks

    for line in training_file_content.strip().split("\n"):
        # Split on the LAST space so observations containing spaces survive.
        observation, separator, tag = line.rstrip().rpartition(" ")
        if not (separator and observation and tag):
            # Blank or malformed line: skip it instead of reusing stale values.
            continue
        train_words.add(observation)
        counts = tags.setdefault(tag, {})
        counts[observation] = counts.get(observation, 0) + 1

    # A test line is treated as labelled only when it ENDS with a known tag.
    tag_suffixes = tuple(" " + tag for tag in tags)
    test_words = set()
    for line in test_file_content.strip().split("\n"):
        if len(line) == 0:
            continue
        if line.endswith(tag_suffixes):
            observation = line.rpartition(" ")[0]
        else:
            # Unlabelled line: keep it verbatim so later lookups by the raw
            # line text find the same key.
            observation = line
        test_words.add(observation)

    # Words that occur in the test data but never in the training data.
    unseen_words = test_words - train_words

    emission_params = {}
    for tag, observations_count in tags.items():
        denominator = sum(observations_count.values()) + k
        probabilities = {
            observation: count / denominator
            for observation, count in observations_count.items()
        }
        # Sorted only to make the insertion order deterministic.
        for word in sorted(unseen_words):
            probabilities[word] = k / denominator
        emission_params[tag] = probabilities

    return emission_params, list(train_words | test_words)

def find_unique_words(list1, list2):
    """Return the distinct items of ``list1`` that never occur in ``list2``.

    The result is a list built from a set, so its order is unspecified.
    """
    return list(set(list1).difference(set(list2)))


# Rebuild the emission table for ES, this time with k=1 smoothing and with the
# dev input supplying the words that never occur in training.
training_file_content = read_file("ES/train.txt")
test_file_content = read_file("ES/dev.in")
# Index [0] keeps only the emission table and drops the returned word list.
emission_parameters = estimate_emission_parameters_modified(training_file_content, test_file_content, 1)[0]
print(emission_parameters, sum(len(words) for words in emission_parameters.values()))
# Recompute the same total entry count with an explicit loop.
count = 0
for emission_probabilities in emission_parameters.values():
    count += len(emission_probabilities)
# print(count)

{'O': {'Estuvimos': 0.00020664003306240529, 'hace': 0.0008954401432704229, 'poco': 0.0018942003030720485, 'mi': 0.0024796803967488635, 'pareja': 0.00044772007163521146, 'y': 0.0352665656426505, 'yo': 0.0012398401983744318, 'comiendo': 0.00034440005510400884, 'resultó': 0.00013776002204160352, 'todo': 0.0039606006336961016, 'muy': 0.013638242182118749, 'bien': 0.005682600909216145, ',': 0.05730816916930707, 'tanto': 0.0013431602149056344, 'la': 0.026002204160352666, 'el': 0.022110483537677365, '…': 0.0015498002479680396, 'nos': 0.005028240804518529, 'gustó': 0.0003788400606144097, 'mucho': 0.0018253202920512468, '.': 0.055896128943380634, 'Por': 0.0010332001653120264, 'poner': 0.00017220002755200442, 'algún': 0.00020664003306240529, 'pero': 0.006578041052486569, 'quizá': 0.00013776002204160352, 'no': 0.012708362033337925, 'era': 0.0017908802865408459, 'lo': 0.009298801487808239, '"': 0.0013431602149056344, 'ibérico': 6.888001102080176e-05, 'que': 0.029101804656288744, 'cabía': 3.4440005

In [105]:
def sentiment_analysis(path_to_train_file, path_to_test_file):
    """Map every known word to the tag with the highest emission probability.

    Reads both files, estimates the smoothed emission table (default ``k``),
    and for each word picks the tag whose table assigns it the largest value.
    On ties the tag encountered first in the table wins.

    Returns:
        A dict ``{word: tag}`` covering every word returned by the estimator.
    """
    emission_parameters, words = estimate_emission_parameters_modified(
        read_file(path_to_train_file),
        read_file(path_to_test_file),
    )
    return {
        word: max(
            (
                (label, freqs[word])
                for label, freqs in emission_parameters.items()
                if word in freqs
            ),
            key=lambda scored: scored[1],
        )[0]
        for word in words
    }

def evaluate_system(word_to_label, path_to_dev_set, path_to_output):
    """Tag every line of the dev set and write the result to ``path_to_output``.

    Each non-empty input line is written back as ``"<line> <tag>"``; empty
    lines are preserved as empty lines. The tag is looked up in
    ``word_to_label`` by the full line text; a line missing from the mapping
    yields ``None`` from ``.get`` and therefore a ``TypeError`` on
    concatenation.
    """
    tokens = read_file(path_to_dev_set).strip().split("\n")
    pieces = []
    for token in tokens:
        if token == "":
            # Keep sentence boundaries: a blank line stays a blank line.
            pieces.append("\n")
        else:
            pieces.append(token + " " + word_to_label.get(token) + "\n")

    write_file(path_to_output, "".join(pieces))

In [106]:
# ES: learn the word -> tag mapping from train.txt (with dev.in supplying the
# unseen words), then write the tagged dev set to ES/dev.p1.out.
mapping = sentiment_analysis("ES/train.txt", "ES/dev.in")
evaluate_system(mapping, "ES/dev.in", "ES/dev.p1.out")

In [107]:
# RU: same pipeline as the previous cell, applied to the RU data set; the
# tagged dev set is written to RU/dev.p1.out.
mapping = sentiment_analysis("RU/train.txt", "RU/dev.in")
evaluate_system(mapping, "RU/dev.in", "RU/dev.p1.out")