In [2]:
import pandas as pd
import numpy as np
import ast  # For converting string representations of lists to actual lists

def parse_tagged_sentences(df):
    # Convert string representations of lists to actual lists
    df['tagged_sentence'] = df['tagged_sentence'].apply(ast.literal_eval)
    
    all_words = []
    all_tags = []
    for index, row in df.iterrows():
        sentence_words, sentence_tags = zip(*row['tagged_sentence'])  # Unzipping words and tags
        all_words.extend(sentence_words)
        all_tags.extend(sentence_tags)
    return all_words, all_tags

def train_hmm(words, tags):
    unique_words = list(set(words))
    unique_tags = list(set(tags))
    
    # Initialize counts
    tag_counts = {tag: 0 for tag in unique_tags}
    transition_counts = {tag: {t: 0 for t in unique_tags} for tag in unique_tags}
    emission_counts = {tag: {word: 0 for word in unique_words} for tag in unique_tags}
    
    # Count initial tag frequencies
    initial_tag_counts = {tag: 0 for tag in unique_tags}
    initial_tag_counts[tags[0]] += 1  # Assuming the first tag of the list is the start of a sentence
    
    # Update counts based on the dataset
    for i in range(len(tags) - 1):
        current_tag, next_tag = tags[i], tags[i + 1]
        word = words[i]
        tag_counts[current_tag] += 1
        transition_counts[current_tag][next_tag] += 1
        emission_counts[current_tag][word] += 1
        
    # Convert counts to probabilities
    transition_prob = {
        current_tag: {next_tag: transition_counts[current_tag][next_tag] / tag_counts[current_tag]
                      for next_tag in unique_tags} for current_tag in unique_tags
    }
    
    emission_prob = {
        tag: {word: (emission_counts[tag][word] / tag_counts[tag] if tag_counts[tag] > 0 else 0)
              for word in unique_words} for tag in unique_tags
    }
    
    initial_prob = {tag: initial_tag_counts[tag] / sum(initial_tag_counts.values()) for tag in unique_tags}
    
    return unique_tags, unique_words, transition_prob, emission_prob, initial_prob

def viterbi_algorithm(sentence, unique_tags, transition_prob, emission_prob, initial_prob):
    states = np.zeros((len(unique_tags), len(sentence)))
    backpointer = np.zeros((len(unique_tags), len(sentence)), dtype=int)
    
    # Initialize with initial probabilities
    for i, tag in enumerate(unique_tags):
        states[i, 0] = initial_prob[tag] * emission_prob[tag].get(sentence[0], 0)
    
    # Populate the rest of the states
    for t in range(1, len(sentence)):
        for i, tag in enumerate(unique_tags):
            max_prob = max(states[j, t-1] * transition_prob[unique_tags[j]][tag] * emission_prob[tag].get(sentence[t], 0) 
                           for j in range(len(unique_tags)))
            states[i, t] = max_prob
            backpointer[i, t] = np.argmax([states[j, t-1] * transition_prob[unique_tags[j]][tag] for j in range(len(unique_tags))])
    
    # Backtracking
    best_path = [np.argmax(states[:, len(sentence) - 1])]
    for t in range(len(sentence) - 1, 0, -1):
        best_path.insert(0, backpointer[best_path[0], t])
    
    return [unique_tags[index] for index in best_path]

def main():
    train_filepath = 'train.csv'
    df = pd.read_csv(train_filepath)
    words, tags = parse_tagged_sentences(df)
    unique_tags, unique_words, transition_prob, emission_prob, initial_prob = train_hmm(words, tags)
    
    # Example: Predict the POS tags for a new sentence
    # You will need to preprocess the sentence similar to your training data before prediction
    sentence = ['Mary', 'is', 'an', 'example', 'sentence']
    predicted_tags = viterbi_algorithm(sentence, unique_tags, transition_prob, emission_prob, initial_prob)
    
    print(predicted_tags)

if __name__ == "__main__":
    main()


['AT', 'AT', 'AT', 'AT', 'AT']
