In [17]:
import numpy as np
import pandas as pd
import matplotlib as plt

IMPORTING THE DATA

In [18]:
# Read the training CSV and transpose it so that each CSV column becomes a row.
training_data = pd.read_csv('Corona_train.csv').to_numpy().T

# Number of training examples (one per column after the transpose).
m = training_data.shape[-1]

# Row 1 is used as the labels and row 2 as the document text; both are
# reshaped into (1, m) row vectors, the layout the functions below index into.
training_labels, training_documents = (
    training_data[row].reshape((1, m)) for row in (1, 2)
)

CREATING THE VOCABULARY

In [19]:
def create_vocabulary(training_labels, training_documents):
    """Build the vocabulary and the per-class word counts of a labelled corpus.

    Args:
        training_labels: 2-D array of shape (1, num_docs); entry ``[0][i]`` is
            the label of document ``i`` and must be one of ``"Positive"``,
            ``"Negative"`` or ``"Neutral"``.
        training_documents: 2-D array indexed the same way; entry ``[0][i]``
            is the text of document ``i`` and is tokenised with ``str.split()``.

    Returns:
        A tuple ``(vocabulary, frequency, label_freq, total_label_len,
        vocabulary_dict)`` where

        * ``vocabulary`` is the list of distinct words in first-seen order,
        * ``frequency[w]`` is ``[positive, negative, neutral]`` occurrence
          counts of word ``w``,
        * ``label_freq`` is the number of documents per class,
        * ``total_label_len`` is the total number of words per class,
        * ``vocabulary_dict`` maps each word to its index in ``vocabulary``.

    Raises:
        KeyError: if a label is not one of the three known classes.
    """
    labels_dict = {"Positive" : 0, "Negative": 1, "Neutral": 2}
    num_classes = len(labels_dict)

    vocabulary = []
    frequency = []
    label_freq = [0] * num_classes
    total_label_len = [0] * num_classes
    vocabulary_dict = {}

    # Take the document count from the arguments rather than from a
    # notebook-level global, so the function works on any dataset it is given.
    num_docs = training_labels.shape[1]

    for i in range(num_docs):
        label = training_labels[0][i]
        words = training_documents[0][i].split()

        idx = labels_dict[label]
        label_freq[idx] += 1
        total_label_len[idx] += len(words)

        for word in words:
            word_idx = vocabulary_dict.get(word)
            if word_idx is None:
                # First occurrence: register the word with zeroed class counts.
                word_idx = len(vocabulary)
                vocabulary_dict[word] = word_idx
                vocabulary.append(word)
                frequency.append([0] * num_classes)
            frequency[word_idx][idx] += 1

    return (vocabulary, frequency, label_freq, total_label_len, vocabulary_dict)

NAIVE BAYES

In [20]:
def naive_bayes(vocabulary, frequency, label_freq, total_label_len, training_labels):
    """Estimate the class priors and the smoothed per-class word probabilities.

    Args:
        vocabulary: list of distinct words; only its length is used.
        frequency: ``frequency[w][c]`` is the count of word ``w`` in class ``c``
            (0 = positive, 1 = negative, 2 = neutral).
        label_freq: number of documents per class.
        total_label_len: total number of words per class.
        training_labels: array whose ``shape[1]`` is the number of documents.

    Returns:
        ``(phi_params, theta_pos_params, theta_neg_params,
        theta_neutral_params)``: the priors as a (1, 3) array followed by one
        (1, vocabulary size) array of word probabilities per class.
    """
    num_docs = training_labels.shape[1]
    vocab_size = len(vocabulary)

    # Class priors: fraction of the documents carrying each label.
    phi_params = np.array([[label_freq[c] / num_docs for c in range(3)]])

    def smoothed_theta(class_idx):
        # Add-one (Laplace) smoothing: (count + 1) / (class word total + |V|).
        denominator = total_label_len[class_idx] + vocab_size
        row = [
            (frequency[w][class_idx] + 1) / denominator
            for w in range(vocab_size)
        ]
        return np.array(row).reshape((1, vocab_size))

    theta_pos_params = smoothed_theta(0)
    theta_neg_params = smoothed_theta(1)
    theta_neutral_params = smoothed_theta(2)

    return (phi_params, theta_pos_params, theta_neg_params, theta_neutral_params)


INFERENCE

In [29]:
def compute_x_given_y(document, vocabulary_dict, theta_params):
    """Return the log-likelihood of a document's known words under one class.

    Args:
        document: text of the document; tokenised with ``str.split()``.
        vocabulary_dict: mapping from word to its column in ``theta_params``.
        theta_params: 2-D array whose row 0 holds the class's word
            probabilities.

    Returns:
        The sum of ``log(theta)`` over every word occurrence found in
        ``vocabulary_dict``. Words missing from the vocabulary are skipped,
        and the result is ``0`` when no word is known.
    """
    known_columns = (
        vocabulary_dict[token]
        for token in document.split()
        if token in vocabulary_dict
    )
    return sum(np.log(theta_params[0][column]) for column in known_columns)
   
def get_prediction(training_documents, vocabulary_dict, phi_params, theta_pos_params, theta_neg_params, theta_neutral_params):
    """Predict a sentiment label for every document.

    Args:
        training_documents: 2-D array of shape (1, number of documents) whose
            row 0 holds the document texts. Despite the name, the notebook
            also passes the validation documents here.
        vocabulary_dict: mapping from word to its column in the theta arrays.
        phi_params: 2-D array whose row 0 holds the priors in the order
            positive, negative, neutral.
        theta_pos_params, theta_neg_params, theta_neutral_params: per-class
            word probabilities, each indexed as ``[0][word_idx]``.

    Returns:
        A list with one of ``"Positive"``, ``"Negative"`` or ``"Neutral"`` per
        document. Ties are resolved in that order.
    """
    class_thetas = (theta_pos_params, theta_neg_params, theta_neutral_params)
    predictions = []

    for doc_idx in range(training_documents.shape[1]):
        text = training_documents[0][doc_idx]

        # Log-posterior (up to a constant): log-likelihood plus log-prior.
        score_pos, score_neg, score_neutral = [
            compute_x_given_y(text, vocabulary_dict, theta) + np.log(phi_params[0][class_idx])
            for class_idx, theta in enumerate(class_thetas)
        ]

        best = max(score_pos, score_neg, score_neutral)
        predictions.append(
            "Positive" if best == score_pos
            else "Negative" if best == score_neg
            else "Neutral"
        )

    return predictions

In [31]:
# Count words per class over the training set.
(vocabulary, frequency, label_freq,
 total_label_len, vocabulary_dict) = create_vocabulary(training_labels, training_documents)

# Fit the class priors and the smoothed word probabilities.
(phi_params, theta_pos_params,
 theta_neg_params, theta_neutral_params) = naive_bayes(
    vocabulary, frequency, label_freq, total_label_len, training_labels
)

# Classify the documents the model was trained on.
predictions = get_prediction(
    training_documents, vocabulary_dict, phi_params,
    theta_pos_params, theta_neg_params, theta_neutral_params,
)

# Training accuracy: share of predictions equal to the true label.
ans = sum(
    1 for i, predicted in enumerate(predictions)
    if predicted == training_labels[0][i]
)
ans / len(predictions)

0.8504648214663004

In [34]:
# Read the validation CSV and transpose it so that each CSV column becomes a row.
test_data = pd.read_csv('Corona_validation.csv').to_numpy().T

# Number of validation examples.
n = test_data.shape[-1]

# Same layout as the training cell: row 1 is used as labels, row 2 as text.
test_labels, test_documents = (
    test_data[row].reshape((1, n)) for row in (1, 2)
)

# Classify the validation documents with the parameters fitted on the training set.
predictions = get_prediction(
    test_documents, vocabulary_dict, phi_params,
    theta_pos_params, theta_neg_params, theta_neutral_params,
)

# Validation accuracy: share of predictions equal to the true label.
ans = sum(
    1 for i, predicted in enumerate(predictions)
    if predicted == test_labels[0][i]
)
ans / len(predictions)



0.6705132098390525

In [2]:
# Scratch check: negative infinity multiplied by -1 evaluates to positive infinity.
float('-inf')*(-1)

inf