In [None]:
import pandas as pd
import numpy as np

def parse(text, stop_words):
    """Turn *text* into unigram and bigram features.

    The text is split on whitespace and every token found in *stop_words*
    is discarded.  Each pair of neighbouring surviving tokens is then joined
    with an underscore to form a bigram.

    Returns a list holding the surviving tokens in their original order,
    followed by the bigrams in order of appearance.
    """
    kept = [token for token in text.split() if token not in stop_words]
    # Pair every kept token with its successor; a single token yields no bigram.
    bigrams = [left + '_' + right for left, right in zip(kept, kept[1:])]
    return kept + bigrams

def k_fold(dataset, k):
    """Split *dataset* into *k* (train, test) pairs for cross-validation.

    The rows are shuffled once with a fixed seed (``random_state=50``), so
    repeated calls on the same data give the same folds.  Every fold holds
    out ``len(dataset) // k`` consecutive shuffled rows as its test portion,
    except the last fold, which also takes any leftover rows.

    Returns a list of ``(train_portion, test_portion)`` DataFrame tuples,
    each with a fresh 0-based index.
    """
    shuffled = dataset.sample(frac=1, random_state=50).reset_index(drop=True)
    row_count = len(shuffled)
    splits = []

    for fold_number in range(k):
        fold_size = row_count // k
        start = fold_number * fold_size
        # The final fold runs to the end so rows left over by the integer
        # division are not dropped.
        stop = row_count if fold_number == k - 1 else start + fold_size

        held_out = shuffled.iloc[start:stop].reset_index(drop=True)
        remainder = pd.concat(
            [shuffled.iloc[:start], shuffled.iloc[stop:]]
        ).reset_index(drop=True)
        splits.append((remainder, held_out))

    return splits

def improved_naive_bayes(dataset, k_folds=10):
    """Fit a Naive Bayes text classifier with stop-word removal and bigrams.

    *dataset* must provide a ``'Class'`` column (the label) and a
    ``'Description'`` column (the text).  The data is split with
    ``k_fold`` and the counts are accumulated over the training portion of
    every fold, so each row contributes once for every fold in which it is
    not held out.  ``k_folds`` must be at least 2: with fewer folds there is
    no training portion to count and the prior computation divides by zero.

    Returns a tuple ``(labels, label_word_counts, words, prior_probability,
    word_probabilities)`` where:

    * ``labels`` is the array of distinct class labels,
    * ``label_word_counts`` maps label -> {token: occurrence count},
    * ``words`` is the vocabulary as a list,
    * ``prior_probability`` maps label -> log prior,
    * ``word_probabilities`` maps label -> {token: add-one smoothed log
      likelihood}.
    """
    labels = dataset['Class'].unique()
    label_counts_total = {label: 0 for label in labels}
    label_word_counts_total = {label: {} for label in labels}
    words = set()
    stop_words = {
        'a', 'an', 'and', 'the', 'is', 'in', 'at', 'of', 'on', 'for', 'to', 'with', 'by', 'this',
        'that', 'it', 'as', 'are', 'be', 'from', 'or', 'was', 'were', 'but', 'not', 'have',
        'has', 'had'
    }

    # Accumulate document and token frequencies over each fold's training portion.
    for train_fold, _ in k_fold(dataset, k_folds):
        for label, description in zip(train_fold['Class'], train_fold['Description']):
            # parse() drops stop words and appends underscore-joined bigrams.
            tokens = parse(description, stop_words)
            label_counts_total[label] += 1
            words.update(tokens)

            token_counts = label_word_counts_total[label]
            for token in tokens:
                # Count every occurrence, including the first one.
                token_counts[token] = token_counts.get(token, 0) + 1

    words = list(words)
    vocabulary_size = len(words)

    # Log prior of each class: its share of all counted training rows.
    counted_rows = sum(label_counts_total.values())
    prior_probability = {
        label: np.log(label_counts_total[label] / counted_rows) for label in labels
    }

    # Log likelihood of each vocabulary token given a class, with add-one
    # smoothing so tokens unseen for a class keep a non-zero probability.
    word_probabilities = {label: {} for label in labels}
    for label in labels:
        token_counts = label_word_counts_total[label]
        denominator = sum(token_counts.values()) + vocabulary_size
        for word in words:
            word_probabilities[label][word] = np.log((token_counts.get(word, 0) + 1) / denominator)

    return labels, label_word_counts_total, words, prior_probability, word_probabilities

def predict(test, labels, label_word_counts, words, prior_probability, word_probabilities, filename='result.csv'):
    """Classify every row of *test* and save the result as a CSV file.

    *test* must provide ``'Id'`` and ``'Description'`` columns.  For each
    row the score of a label is its log prior plus the log likelihood of
    every whitespace-separated token that belongs to the vocabulary
    *words*; tokens outside the vocabulary are ignored.  The label with the
    highest score wins, and the first label wins a tie.

    *label_word_counts* needs an entry for every label; it is only used to
    build the smoothed fallback for a vocabulary token that has no entry in
    ``word_probabilities[label]``.

    The predictions are written to *filename* with columns ``Id`` and
    ``Class`` and also returned as a list of ``(id, label)`` tuples.  If
    *labels* is empty the predicted label is ``0``.

    NOTE(review): descriptions are tokenized with a plain ``split()`` here,
    so no underscore-joined bigrams are produced; if the vocabulary holds
    bigram features they are never matched at prediction time - confirm
    whether that is intended.
    """
    vocabulary = set(words)
    vocabulary_size = len(vocabulary)

    # The fallback log-probability depends only on the label, so compute it
    # once instead of re-summing the counts for every row.
    fallback_log_probability = {}
    for label in labels:
        denominator = sum(label_word_counts[label].values()) + vocabulary_size
        # A zero denominator implies an empty vocabulary, in which case no
        # token is ever scored and the fallback value is never read.
        fallback_log_probability[label] = np.log(1 / denominator) if denominator else 0.0

    predictions = []
    for example_id, description in zip(test['Id'], test['Description']):
        known_tokens = [token for token in description.split() if token in vocabulary]

        predicted_label = 0
        best_score = None  # None marks "no label scored yet"; 0 is a valid score
        for label in labels:
            score = prior_probability[label]
            label_probabilities = word_probabilities[label]
            fallback = fallback_log_probability[label]
            for token in known_tokens:
                score += label_probabilities.get(token, fallback)

            # Strict comparison keeps the first label on ties.
            if best_score is None or score > best_score:
                best_score = score
                predicted_label = label

        predictions.append((example_id, predicted_label))

    # Save to .csv and return the prediction result
    pd.DataFrame(predictions, columns=['Id', 'Class']).to_csv(filename, index=False)
    return predictions

def confusion_matrix(dataset, labels, predictions):
    """Print one-vs-rest accuracy, precision, recall and F1 for each label.

    *dataset* must provide ``'Id'`` and ``'Class'`` columns holding the true
    labels.  *predictions* is an iterable of ``(id, predicted_label)``
    pairs; a dataset row whose Id has no prediction is treated as predicted
    to be no label at all, so it counts as a false negative for its true
    label and a true negative for every other label.

    A metric whose denominator is zero (for example precision of a label
    that was never predicted) is reported as 0 instead of raising
    ``ZeroDivisionError``.  Nothing is returned; one line per label is
    printed.
    """
    TP = {label: 0 for label in labels}
    FP = {label: 0 for label in labels}
    FN = {label: 0 for label in labels}
    TN = {label: 0 for label in labels}

    def ratio(numerator, denominator):
        # Report 0 rather than crashing when a metric is undefined.
        return numerator / denominator if denominator else 0.0

    # Tally predicted against actual labels, one-vs-rest for every label.
    predicted_by_id = dict(predictions)
    for example_id, actual_label in zip(dataset['Id'], dataset['Class']):
        predicted_label = predicted_by_id.get(example_id)

        for label in labels:
            predicted_positive = predicted_label == label
            actual_positive = actual_label == label
            if predicted_positive and actual_positive:
                TP[label] += 1
            elif predicted_positive:
                FP[label] += 1
            elif actual_positive:
                FN[label] += 1
            else:
                TN[label] += 1

    # Output evaluation result
    for label in labels:
        tp = TP[label]
        fp = FP[label]
        fn = FN[label]
        tn = TN[label]

        accuracy = ratio(tp + tn, tp + fp + fn + tn)
        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f'{label}:   Accuracy: {accuracy * 100:05.2f}%   Precision: {precision * 100:05.2f}%   Recall: {recall * 100:05.2f}%   F1: {f1 * 100:05.2f}%')

# Load the labelled training data and the rows to classify
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Train the model and write the test-set predictions to result.csv
print('Model Improved Naive Bayes')
labels, label_word_counts, words, prior_probability, word_probabilities = improved_naive_bayes(train)
predictions = predict(test, labels, label_word_counts, words, prior_probability, word_probabilities)

# Evaluate model performance against the true labels of the SAME rows that
# were predicted.  Scoring test-set predictions against the training labels
# would match unrelated rows that merely share an Id.
if 'Class' in test.columns:
    confusion_matrix(test, labels, predictions)
else:
    # The test file carries no labels, so fall back to scoring the training
    # rows.  This is an optimistic estimate because the model has seen them.
    train_predictions = predict(
        train, labels, label_word_counts, words, prior_probability, word_probabilities,
        filename='train_result.csv',
    )
    confusion_matrix(train, labels, train_predictions)

Model Improved Naive Bayes
W:   Accuracy: 48.45%   Precision: 53.10%   Recall: 13.03%   F1: 20.92%
A:   Accuracy: 58.41%   Precision: 38.66%   Recall: 09.64%   F1: 15.43%
S:   Accuracy: 96.16%   Precision: 05.00%   Recall: 01.50%   F1: 02.31%
G:   Accuracy: 93.30%   Precision: 00.00%   Recall: 00.00%   F1: 00.00%
