### Module Handling

In [53]:
import pandas as pd
import numpy as np
import os
import subprocess

### Data Processing

In [54]:
def text_to_dataframe(text):
    """Parse labelled-token text into a two-column DataFrame.

    Each non-blank line is split once at its *last* space: everything before
    it goes to column ``'x'`` (the token, which may itself contain spaces)
    and everything after it goes to column ``'y'`` (the label). Blank lines,
    which separate sentences in the data files, are dropped.

    Args:
        text: Full contents of a training file as a single string.

    Returns:
        A ``pandas.DataFrame`` with columns ``['x', 'y']``, one row per
        non-blank input line, in input order.
    """
    rows = []
    for raw_line in text.strip().split('\n'):
        # Whitespace-only lines carry no token/label pair.
        if raw_line.strip() == "":
            continue
        rows.append(raw_line.rsplit(" ", 1))
    return pd.DataFrame(rows, columns=['x', 'y'])

In [55]:
def vocabulary(df):
    """Return the distinct values found in column ``'x'`` of ``df``.

    Args:
        df: DataFrame with an ``'x'`` column (the tokens).

    Returns:
        Array of the unique ``'x'`` values, in order of first appearance.
    """
    tokens = df['x']
    return tokens.unique()

### Naive Bayes

#### Laplace Smoothing 
To prevent a feature (word) that appears in the test set but not in the training set from producing a zero probability, we use Laplace smoothing.

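Example: $P(x' \mid \text{positive}) = \dfrac{(\text{number of training tokens } x' \text{ with label positive}) + \alpha}{N + \alpha k}$, where $N$ is the number of training tokens labelled positive and $k$ is the vocabulary size.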

In this function we set $\alpha = 1$. This ensures that a word never seen with a given label gets a likelihood of $1/(N + k)$ rather than zero.

https://www.cs.rhodes.edu/~kirlinp/courses/ai/f18/projects/proj3/naive-bayes-log-probs.pdf

In [56]:
def _count_training_data(data):
    """Tally per-label and per-(word, label) occurrences in the training frame."""
    # value_counts() lists labels by descending frequency; that order is kept
    # because it decides which label wins when two scores tie.
    label_counts = data['y'].value_counts().to_dict()
    # {(word, label): occurrences}; pairs never seen together are simply absent.
    pair_counts = data.groupby(['x', 'y']).size().to_dict()
    return label_counts, pair_counts


def _predict_label(word, label_counts, log_priors, pair_counts, vocab_size):
    """Return the label maximising log P(label) + log P(word | label)."""
    scores = {
        # Laplace smoothing with alpha = 1. A missing (word, label) pair counts
        # as 0, so unseen words get 1 / (count + vocab_size) for every label.
        label: log_priors[label]
        + np.log((pair_counts.get((word, label), 0) + 1) / (count + vocab_size))
        for label, count in label_counts.items()
    }
    return max(scores, key=scores.get)


def main(train_path, file_in, file_out):
    """Train a per-token Naive Bayes tagger and label every token of a file.

    The model is estimated from ``train_path`` (one ``token label`` pair per
    line, blank lines between sentences). Each non-blank line of ``file_in``
    is treated as one token and assigned the label with the highest
    ``log P(label) + log P(token | label)``, where the likelihood uses
    Laplace smoothing with alpha = 1. Results are written to ``file_out`` as
    ``token label`` lines; blank input lines are written back as blank lines.

    Args:
        train_path: Training file, relative to the current working directory
            (absolute paths are also accepted).
        file_in: File with one token per line, resolved like ``train_path``.
        file_out: Destination file for predictions, resolved like
            ``train_path``. Overwritten if it exists.

    Raises:
        ValueError: If the training file contains no labelled tokens.
        OSError: If any of the files cannot be opened.
    """
    base_dir = os.getcwd()
    # os.path.join leaves absolute paths untouched, unlike string concatenation.
    train_file = os.path.join(base_dir, train_path)
    input_path = os.path.join(base_dir, file_in)
    output_path = os.path.join(base_dir, file_out)

    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform default.
    with open(train_file, "r", encoding="utf-8") as training_file:
        data = text_to_dataframe(training_file.read())
    if data.empty:
        raise ValueError(f"no labelled tokens found in {train_file}")

    vocab_size = len(vocabulary(data))
    label_counts, pair_counts = _count_training_data(data)

    # Prior class log-probabilities log P(label).
    total_samples = data.shape[0]
    log_priors = {label: np.log(count / total_samples)
                  for label, count in label_counts.items()}

    # One stripped token per input line; '' marks a sentence boundary.
    with open(input_path, "r", encoding="utf-8") as input_file:
        sequence = [line.strip() for line in input_file]

    predictions = [
        _predict_label(word, label_counts, log_priors, pair_counts, vocab_size)
        if word != '' else ''
        for word in sequence
    ]

    # Write "token label" per line; sentence boundaries stay truly blank lines.
    with open(output_path, "w", encoding="utf-8") as output_file:
        for word, label in zip(sequence, predictions):
            output_file.write(f"{word} {label}\n" if word != '' else "\n")

In [48]:
# Train on the RU corpus and label its dev set.
print('Processing RU')
main(train_path="Data/RU/train", file_in="Data/RU/dev.in", file_out="Data/RU/dev.p4b.out")
print("done!")

Processing RU
done!


In [49]:
# Train on the ES corpus and label its dev set.
print('Processing ES')
main(train_path="Data/ES/train", file_in="Data/ES/dev.in", file_out="Data/ES/dev.p4b.out")
print("done!")

Processing ES
done!


### Precision, recall and F scores 

#### New Model

In [57]:
# Languages whose dev-set predictions are scored
languages = ['ES', 'RU']
# Get the current working directory
current_dir = os.getcwd()

for language in languages:
    print(f"For Language: {language}")
    # Evaluation command: script, then dev.out, then our dev.p4b.out
    # (presumably gold file first, predictions second; confirm against evalResult.py).
    # All paths are built from current_dir; the previous `file_path` name only
    # ever existed as a local inside main(), so it raised NameError on a fresh kernel.
    command = [
        "python3",
        f"{current_dir}/EvalScript/evalResult.py",
        f"{current_dir}/Data/{language}/dev.out",
        f"{current_dir}/Data/{language}/dev.p4b.out",
    ]
    # Run the command
    result = subprocess.run(command, capture_output=True, text=True)
    print(result.stdout)
    # Surface evaluator failures instead of silently printing an empty report.
    if result.returncode != 0:
        print(result.stderr)

For Language: ES

#Entity in gold data: 229
#Entity in prediction: 95

#Correct Entity : 75
Entity  precision: 0.7895
Entity  recall: 0.3275
Entity  F: 0.4630

#Correct Sentiment : 61
Sentiment  precision: 0.6421
Sentiment  recall: 0.2664
Sentiment  F: 0.3765

For Language: RU

#Entity in gold data: 389
#Entity in prediction: 89

#Correct Entity : 77
Entity  precision: 0.8652
Entity  recall: 0.1979
Entity  F: 0.3222

#Correct Sentiment : 54
Sentiment  precision: 0.6067
Sentiment  recall: 0.1388
Sentiment  F: 0.2259



### Test Set

In [59]:
# Train on the RU corpus and label its held-out test set.
print('Processing RU')
main(train_path="Data/RU/train", file_in="Data/RU/test.in", file_out="Data/RU/test.p4b.out")
print("done!")

Processing RU
done!


In [60]:
# Train on the ES corpus and label its held-out test set.
print('Processing ES')
main(train_path="Data/ES/train", file_in="Data/ES/test.in", file_out="Data/ES/test.p4b.out")
print("done!")

Processing ES
done!
